In particular, this allows the inline asm to be removed, which this patch does for x86.
-- Christophe
From 85ec857da4251e3709f5c53213ddd5ec39d21895 Mon Sep 17 00:00:00 2001 From: Christophe Gisquet <[email protected]> Date: Sat, 8 Feb 2014 17:48:43 +0100 Subject: [PATCH 8/9] dcadsp: move test out of loop and dspize The vector dequantization has a test in a loop preventing effective SIMD implementation. By moving it out of the loop, this loop can be DSPized. Therefore, modify the current DSP implementation. For x86 Arrandale: C SSE SSE2 SSE4 win32: 260 162 119 104 win64: 242 N/A 89 72 --- libavcodec/arm/dca.h | 23 ----------------- libavcodec/arm/dcadsp_init_arm.c | 42 ++++++++++++++++++++++++++++++ libavcodec/dcadec.c | 41 ++++++++++-------------------- libavcodec/dcadsp.c | 21 ++++++++++----- libavcodec/dcadsp.h | 8 +++++- libavcodec/x86/dca.h | 55 ---------------------------------------- libavcodec/x86/dcadsp.asm | 55 +++++++++++++++++++++++++++++----------- libavcodec/x86/dcadsp_init.c | 18 ++++++++----- 8 files changed, 129 insertions(+), 134 deletions(-) delete mode 100644 libavcodec/x86/dca.h diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h index 580bd75..4aed576 100644 --- a/libavcodec/arm/dca.h +++ b/libavcodec/arm/dca.h @@ -81,27 +81,4 @@ static inline int decode_blockcodes(int code1, int code2, int levels, #endif -#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y - -#define int8x8_fmul_int32 int8x8_fmul_int32 -static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp, - float *dst, const int8_t *src, int scale) -{ - __asm__ ("vcvt.f32.s32 %2, %2, #4 \n" - "vld1.8 {d0}, [%1,:64] \n" - "vmovl.s8 q0, d0 \n" - "vmovl.s16 q1, d1 \n" - "vmovl.s16 q0, d0 \n" - "vcvt.f32.s32 q0, q0 \n" - "vcvt.f32.s32 q1, q1 \n" - "vmul.f32 q0, q0, %y2 \n" - "vmul.f32 q1, q1, %y2 \n" - "vst1.32 {q0-q1}, [%m0,:128] \n" - : "=Um"(*(float (*)[8])dst) - : "r"(src), "x"(scale) - : "d0", "d1", "d2", "d3"); -} - -#endif - #endif /* AVCODEC_ARM_DCA_H */ diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c index 2ea1289..6359ec3 100644 --- 
a/libavcodec/arm/dcadsp_init_arm.c +++ b/libavcodec/arm/dcadsp_init_arm.c @@ -53,6 +53,45 @@ void ff_synth_filter_float_neon(FFTContext *imdct, float out[32], const float in[32], float scale); +#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y + +static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp, + float *dst, const int8_t *src, int scale) +{ + __asm__ ("vcvt.f32.s32 %2, %2, #4 \n" + "vld1.8 {d0}, [%1,:64] \n" + "vmovl.s8 q0, d0 \n" + "vmovl.s16 q1, d1 \n" + "vmovl.s16 q0, d0 \n" + "vcvt.f32.s32 q0, q0 \n" + "vcvt.f32.s32 q1, q1 \n" + "vmul.f32 q0, q0, %y2 \n" + "vmul.f32 q1, q1, %y2 \n" + "vst1.32 {q0-q1}, [%m0,:128] \n" + : "=Um"(*(float (*)[8])dst) + : "r"(src), "x"(scale) + : "d0", "d1", "d2", "d3"); +} + +static void decode_hf_neon(float dst[DCA_SUBBANDS][8], + const int32_t vq_num[DCA_SUBBANDS], + const int8_t hf_vq[1024][32], intptr_t vq_offset, + int32_t scale[DCA_SUBBANDS][2], + intptr_t start, intptr_t end) +{ + int l; + for (l = start; l < end; l++) { + /* 1 vector -> 32 samples but we only need the 8 samples + * for this subsubframe. 
*/ + int hfvq = vq_num[l]; + + int8x8_fmul_int32(dst[l], hf_vq[hfvq] + vq_offset, scale[l][0]); + } +} + +#endif + + av_cold void ff_dcadsp_init_arm(DCADSPContext *s) { int cpu_flags = av_get_cpu_flags(); @@ -65,6 +104,9 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s) if (have_neon(cpu_flags)) { s->lfe_fir[0] = ff_dca_lfe_fir0_neon; s->lfe_fir[1] = ff_dca_lfe_fir1_neon; +#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y + s->decode_hf = decode_hf_neon; +#endif } } diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c index 076a225..6aee7bf 100644 --- a/libavcodec/dcadec.c +++ b/libavcodec/dcadec.c @@ -50,14 +50,11 @@ #if ARCH_ARM # include "arm/dca.h" #endif -#if ARCH_X86 -# include "x86/dca.h" -#endif //#define TRACE #define DCA_PRIM_CHANNELS_MAX (7) -#define DCA_SUBBANDS (32) +// DCA_SUBBANDS defined in dcadsp.h #define DCA_ABITS_MAX (32) /* Should be 28 */ #define DCA_SUBSUBFRAMES_MAX (4) #define DCA_SUBFRAMES_MAX (16) @@ -340,7 +337,7 @@ typedef struct { int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< prediction VQ coefs int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< bit allocation index int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< transition mode (transients) - int scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2]; ///< scale factors (2 if transient) + int32_t scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];///< scale factors (2 if transient) int joint_huff[DCA_PRIM_CHANNELS_MAX]; ///< joint subband scale factors codebook int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2]; ///< stereo downmix coefficients @@ -353,7 +350,7 @@ typedef struct { uint8_t core_downmix_amode; ///< audio channel arrangement of embedded downmix uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4]; ///< embedded downmix coefficients (9-bit codes) - int high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< VQ encoded high frequency subbands + 
int32_t high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< VQ encoded high frequency subbands float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)]; ///< Low frequency effect data int lfe_scale_factor; @@ -1088,14 +1085,6 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values) static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 }; static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 }; -#ifndef int8x8_fmul_int32 -static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst, - const int8_t *src, int scale) -{ - dsp->int8x8_fmul_int32(dst, src, scale); -} -#endif - static int dca_subsubframe(DCAContext *s, int base_channel, int block_index) { int k, l; @@ -1215,21 +1204,17 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index) /* * Decode VQ encoded high frequencies */ - for (l = s->vq_start_subband[k]; l < s->subband_activity[k]; l++) { - /* 1 vector -> 32 samples but we only need the 8 samples - * for this subsubframe. 
*/ - int hfvq = s->high_freq_vq[k][l]; - - if (!s->debug_flag & 0x01) { - av_log(s->avctx, AV_LOG_DEBUG, - "Stream with high frequencies VQ coding\n"); - s->debug_flag |= 0x01; - } - - int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l], - &high_freq_vq[hfvq][subsubframe * 8], - s->scale_factor[k][l][0]); + if ((!s->debug_flag & 0x01) && + s->subband_activity[k] > s->vq_start_subband[k]) { + av_log(s->avctx, AV_LOG_DEBUG, + "Stream with high frequencies VQ coding\n"); + s->debug_flag |= 0x01; } + + s->dcadsp.decode_hf(subband_samples[k], s->high_freq_vq[k], + high_freq_vq, subsubframe * 8, + s->scale_factor[k], s->vq_start_subband[k], + s->subband_activity[k]); } /* Check for DSYNC after subsubframe */ diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c index e9c7682..1e09bd3 100644 --- a/libavcodec/dcadsp.c +++ b/libavcodec/dcadsp.c @@ -24,12 +24,21 @@ #include "libavutil/intreadwrite.h" #include "dcadsp.h" -static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale) +static void decode_hf_c(float dst[DCA_SUBBANDS][8], + const int32_t vq_num[DCA_SUBBANDS], + const int8_t hf_vq[1024][32], intptr_t vq_offset, + int32_t scale[DCA_SUBBANDS][2], + intptr_t start, intptr_t end) { - float fscale = scale / 16.0; - int i; - for (i = 0; i < 8; i++) - dst[i] = src[i] * fscale; + int l; + for (l = start; l < end; l++) { + /* 1 vector -> 32 samples but we only need the 8 samples + * for this subsubframe. 
*/ + int i, hfvq = vq_num[l]; + float fscale = scale[l][0] / 16.0; + for (i = 0; i < 8; i++) + dst[l][i] = hf_vq[hfvq][vq_offset + i] * fscale; + } } static inline void @@ -103,7 +112,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s) s->lfe_fir[0] = dca_lfe_fir0_c; s->lfe_fir[1] = dca_lfe_fir1_c; s->qmf_32_subbands = dca_qmf_32_subbands; - s->int8x8_fmul_int32 = int8x8_fmul_int32_c; + s->decode_hf = decode_hf_c; if (ARCH_ARM) ff_dcadsp_init_arm(s); if (ARCH_X86) ff_dcadsp_init_x86(s); } diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h index 3e04426..15105f0 100644 --- a/libavcodec/dcadsp.h +++ b/libavcodec/dcadsp.h @@ -22,6 +22,8 @@ #include "avfft.h" #include "synth_filter.h" +#define DCA_SUBBANDS (32) + typedef struct DCADSPContext { void (*lfe_fir[2])(float *out, const float *in, const float *coefs, float scale); @@ -31,7 +33,11 @@ typedef struct DCADSPContext { int *synth_buf_offset, float synth_buf2[32], const float window[512], float *samples_out, float raXin[32], float scale); - void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale); + void (*decode_hf)(float dst[DCA_SUBBANDS][8], + const int32_t vq_num[DCA_SUBBANDS], + const int8_t hf_vq[1024][32], intptr_t vq_offset, + int32_t scale[DCA_SUBBANDS][2], + intptr_t start, intptr_t end); } DCADSPContext; void ff_dcadsp_init(DCADSPContext *s); diff --git a/libavcodec/x86/dca.h b/libavcodec/x86/dca.h deleted file mode 100644 index ab175b3..0000000 --- a/libavcodec/x86/dca.h +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2012-2014 Christophe Gisquet <[email protected]> - * - * This file is part of Libav. - * - * Libav is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License as published by the Free Software Foundation; either - * version 2.1 of the License, or (at your option) any later version. 
- * - * Libav is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public - * License along with Libav; if not, write to the Free Software - * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include "config.h" - -#if ARCH_X86_64 && HAVE_SSE2_INLINE -# include "libavutil/x86/asm.h" -# include "libavutil/mem.h" -#include "libavcodec/dcadsp.h" - -# define int8x8_fmul_int32 int8x8_fmul_int32 -static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp, - float *dst, const int8_t *src, int scale) -{ - DECLARE_ALIGNED(16, static const uint32_t, inverse16) = 0x3D800000; - __asm__ volatile ( - "cvtsi2ss %2, %%xmm0 \n\t" - "mulss %3, %%xmm0 \n\t" - "movq (%1), %%xmm1 \n\t" - "punpcklbw %%xmm1, %%xmm1 \n\t" - "movaps %%xmm1, %%xmm2 \n\t" - "punpcklwd %%xmm1, %%xmm1 \n\t" - "punpckhwd %%xmm2, %%xmm2 \n\t" - "psrad $24, %%xmm1 \n\t" - "psrad $24, %%xmm2 \n\t" - "shufps $0, %%xmm0, %%xmm0 \n\t" - "cvtdq2ps %%xmm1, %%xmm1 \n\t" - "cvtdq2ps %%xmm2, %%xmm2 \n\t" - "mulps %%xmm0, %%xmm1 \n\t" - "mulps %%xmm0, %%xmm2 \n\t" - "movaps %%xmm1, 0(%0) \n\t" - "movaps %%xmm2, 16(%0) \n\t" - :: "r"(dst), "r"(src), "m"(scale), "m"(inverse16) - XMM_CLOBBERS_ONLY("xmm0", "xmm1", "xmm2") - ); -} - -#endif /* ARCH_X86_64 && HAVE_SSE2_INLINE */ diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm index 10dfaf6..f29d369 100644 --- a/libavcodec/x86/dcadsp.asm +++ b/libavcodec/x86/dcadsp.asm @@ -26,18 +26,37 @@ pf_inv16: times 4 dd 0x3D800000 ; 1/16 SECTION_TEXT -; void int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale) -%macro INT8X8_FMUL_INT32 0 -cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale - cvtsi2ss m0, scalem +; decode_hf(float dst[DCA_SUBBANDS][8], const int32_t 
vq_num[DCA_SUBBANDS], +; const int8_t hf_vq[1024][32], intptr_t vq_offset, +; int32_t scale[DCA_SUBBANDS][2], +; intptr_t start, intptr_t end) +%macro DECODE_HF 0 +cglobal decode_hf, 6,6,5, dst, num, src, offset, scale, start, end + cmp startq, endm + je .end + lea srcq, [srcq + offsetq] + shl startq, 2 + mov offsetd, endm +%define DICT offsetq + shl offsetq, 2 + mov endm, offsetq +.loop: +%if ARCH_X86_64 + mov offsetd, [scaleq + 2*startq] + cvtsi2ss m0, offsetd +%else + cvtsi2ss m0, [scaleq + 2*startq] +%endif + mov offsetd, [numq + startq] mulss m0, [pf_inv16] + shl DICT, 5 shufps m0, m0, 0 %if cpuflag(sse2) %if cpuflag(sse4) - pmovsxbd m1, [srcq+0] - pmovsxbd m2, [srcq+4] + pmovsxbd m1, [srcq + DICT + 0] + pmovsxbd m2, [srcq + DICT + 4] %else - movq m1, [srcq] + movq m1, [srcq + DICT] punpcklbw m1, m1 mova m2, m1 punpcklwd m1, m1 @@ -48,8 +67,8 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale cvtdq2ps m1, m1 cvtdq2ps m2, m2 %else - movd mm0, [srcq+0] - movd mm1, [srcq+4] + movd mm0, [srcq + DICT + 0] + movd mm1, [srcq + DICT + 4] punpcklbw mm0, mm0 punpcklbw mm1, mm1 movq mm2, mm0 @@ -67,27 +86,33 @@ cglobal int8x8_fmul_int32, 3,3,5, dst, src, scale cvtpi2ps m3, mm2 cvtpi2ps m4, mm3 shufps m0, m0, 0 - emms shufps m1, m3, q1010 shufps m2, m4, q1010 %endif mulps m1, m0 mulps m2, m0 - mova [dstq+ 0], m1 - mova [dstq+16], m2 + mova [dstq + 8*startq + 0], m1 + mova [dstq + 8*startq + 16], m2 + add startq, 4 + cmp startq, endm + jl .loop +.end: +%if cpuflag(sse2) == 0 + emms +%endif REP_RET %endmacro %if ARCH_X86_32 INIT_XMM sse -INT8X8_FMUL_INT32 +DECODE_HF %endif INIT_XMM sse2 -INT8X8_FMUL_INT32 +DECODE_HF INIT_XMM sse4 -INT8X8_FMUL_INT32 +DECODE_HF ; %1=v0/v1 %2=in1 %3=in2 %macro FIR_LOOP 2-3 diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c index 140965a..19cbf75 100644 --- a/libavcodec/x86/dcadsp_init.c +++ b/libavcodec/x86/dcadsp_init.c @@ -23,9 +23,15 @@ #include "libavutil/x86/cpu.h" #include "libavcodec/dcadsp.h" -void 
ff_int8x8_fmul_int32_sse(float *dst, const int8_t *src, int scale); -void ff_int8x8_fmul_int32_sse2(float *dst, const int8_t *src, int scale); -void ff_int8x8_fmul_int32_sse4(float *dst, const int8_t *src, int scale); +void ff_decode_hf_sse(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS], + const int8_t hf_vq[1024][32], intptr_t vq_offset, + int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end); +void ff_decode_hf_sse2(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS], + const int8_t hf_vq[1024][32], intptr_t vq_offset, + int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end); +void ff_decode_hf_sse4(float dst[DCA_SUBBANDS][8], const int vq_num[DCA_SUBBANDS], + const int8_t hf_vq[1024][32], intptr_t vq_offset, + int scale[DCA_SUBBANDS][2], intptr_t start, intptr_t end); void ff_dca_lfe_fir0_sse(float *out, const float *in, const float *coefs, float scale); void ff_dca_lfe_fir1_sse(float *out, const float *in, const float *coefs, @@ -37,18 +43,18 @@ av_cold void ff_dcadsp_init_x86(DCADSPContext *s) if (EXTERNAL_SSE(cpu_flags)) { #if ARCH_X86_32 - s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse; + s->decode_hf = ff_decode_hf_sse; #endif s->lfe_fir[0] = ff_dca_lfe_fir0_sse; s->lfe_fir[1] = ff_dca_lfe_fir1_sse; } if (EXTERNAL_SSE2(cpu_flags)) { - s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse2; + s->decode_hf = ff_decode_hf_sse2; } if (EXTERNAL_SSE4(cpu_flags)) { - s->int8x8_fmul_int32 = ff_int8x8_fmul_int32_sse4; + s->decode_hf = ff_decode_hf_sse4; } } -- 1.8.0.msysgit.0
_______________________________________________ libav-devel mailing list [email protected] https://lists.libav.org/mailman/listinfo/libav-devel
