On 2014-02-14 16:03:13 +0000, Christophe Gisquet wrote:
> The vector dequantization has a test in a loop preventing effective SIMD
> implementation. By moving it out of the loop, this loop can be DSPized.
>
> Therefore, modify the current DSP implementation. In particular, the
> DSP implementation no longer has to handle null loop sizes.
> ---
> libavcodec/arm/dca.h | 23 ----------------------
> libavcodec/arm/dcadsp_init_arm.c | 41 ++++++++++++++++++++++++++++++++++++++++
> libavcodec/dcadec.c | 30 +++++++----------------------
> libavcodec/dcadsp.c | 21 ++++++++++++++------
> libavcodec/dcadsp.h | 8 +++++++-
> 5 files changed, 70 insertions(+), 53 deletions(-)
>
> diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
> index 580bd75..4aed576 100644
> --- a/libavcodec/arm/dca.h
> +++ b/libavcodec/arm/dca.h
> @@ -81,27 +81,4 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
>
> #endif
>
> -#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
> -
> -#define int8x8_fmul_int32 int8x8_fmul_int32
> -static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
> - float *dst, const int8_t *src, int scale)
> -{
> - __asm__ ("vcvt.f32.s32 %2, %2, #4 \n"
> - "vld1.8 {d0}, [%1,:64] \n"
> - "vmovl.s8 q0, d0 \n"
> - "vmovl.s16 q1, d1 \n"
> - "vmovl.s16 q0, d0 \n"
> - "vcvt.f32.s32 q0, q0 \n"
> - "vcvt.f32.s32 q1, q1 \n"
> - "vmul.f32 q0, q0, %y2 \n"
> - "vmul.f32 q1, q1, %y2 \n"
> - "vst1.32 {q0-q1}, [%m0,:128] \n"
> - : "=Um"(*(float (*)[8])dst)
> - : "r"(src), "x"(scale)
> - : "d0", "d1", "d2", "d3");
> -}
> -
> -#endif
> -
> #endif /* AVCODEC_ARM_DCA_H */
> diff --git a/libavcodec/arm/dcadsp_init_arm.c b/libavcodec/arm/dcadsp_init_arm.c
> index 2ea1289..9942581 100644
> --- a/libavcodec/arm/dcadsp_init_arm.c
> +++ b/libavcodec/arm/dcadsp_init_arm.c
> @@ -53,6 +53,44 @@ void ff_synth_filter_float_neon(FFTContext *imdct,
> float out[32], const float in[32],
> float scale);
>
> +#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
> +
> +static inline void int8x8_fmul_int32(av_unused DCADSPContext *dsp,
> + float *dst, const int8_t *src, int scale)
> +{
> + __asm__ ("vcvt.f32.s32 %2, %2, #4 \n"
> + "vld1.8 {d0}, [%1,:64] \n"
> + "vmovl.s8 q0, d0 \n"
> + "vmovl.s16 q1, d1 \n"
> + "vmovl.s16 q0, d0 \n"
> + "vcvt.f32.s32 q0, q0 \n"
> + "vcvt.f32.s32 q1, q1 \n"
> + "vmul.f32 q0, q0, %y2 \n"
> + "vmul.f32 q1, q1, %y2 \n"
> + "vst1.32 {q0-q1}, [%m0,:128] \n"
> + : "=Um"(*(float (*)[8])dst)
> + : "r"(src), "x"(scale)
> + : "d0", "d1", "d2", "d3");
> +}
> +
> +static void decode_hf_neon(float dst[DCA_SUBBANDS][8],
> + const int32_t vq_num[DCA_SUBBANDS],
> + const int8_t hf_vq[1024][32], intptr_t vq_offset,
> + int32_t scale[DCA_SUBBANDS][2],
> + intptr_t start, intptr_t end)
> +{
> + int l;
> + for (l = start; l < end; l++) {
> + /* 1 vector -> 32 samples but we only need the 8 samples
> + * for this subsubframe. */
> + int hfvq = vq_num[l];
> +
> + int8x8_fmul_int32(dst[l], hf_vq[hfvq] + vq_offset, scale[l][0]);
> + }
> +}
> +
> +#endif /* HAVE_NEON_INLINE && HAVE_ASM_MOD_Y */
> +
> av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -65,6 +103,9 @@ av_cold void ff_dcadsp_init_arm(DCADSPContext *s)
> if (have_neon(cpu_flags)) {
> s->lfe_fir[0] = ff_dca_lfe_fir0_neon;
> s->lfe_fir[1] = ff_dca_lfe_fir1_neon;
> +#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
> + s->decode_hf = decode_hf_neon;
> +#endif
OK, but not optimal; I'll rewrite it as external asm. This doesn't block the patch.
> }
> }
>
> diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
> index 2d88cb4..371c838 100644
> --- a/libavcodec/dcadec.c
> +++ b/libavcodec/dcadec.c
> @@ -50,14 +50,10 @@
> #if ARCH_ARM
> # include "arm/dca.h"
> #endif
> -#if ARCH_X86
> -# include "x86/dca.h"
> -#endif
>
> //#define TRACE
>
> #define DCA_PRIM_CHANNELS_MAX (7)
> -#define DCA_SUBBANDS (32)
> #define DCA_ABITS_MAX (32) /* Should be 28 */
> #define DCA_SUBSUBFRAMES_MAX (4)
> #define DCA_SUBFRAMES_MAX (16)
> @@ -340,7 +336,7 @@ typedef struct {
> int prediction_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< prediction VQ coefs
> int bitalloc[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< bit allocation index
> int transition_mode[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< transition mode (transients)
> - int scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2]; ///< scale factors (2 if transient)
> + int32_t scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS][2];///< scale factors (2 if transient)
Don't we assume that int is 32 bits wide? I guess much more would break
if that assumption didn't hold. The change is of course OK when the value is used in asm.
> int joint_huff[DCA_PRIM_CHANNELS_MAX]; ///< joint subband scale factors codebook
> int joint_scale_factor[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< joint subband scale factors
> float downmix_coef[DCA_PRIM_CHANNELS_MAX + 1][2]; ///< stereo downmix coefficients
> @@ -353,7 +349,7 @@ typedef struct {
> uint8_t core_downmix_amode; ///< audio channel arrangement of embedded downmix
> uint16_t core_downmix_codes[DCA_PRIM_CHANNELS_MAX + 1][4]; ///< embedded downmix coefficients (9-bit codes)
>
> - int high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< VQ encoded high frequency subbands
> + int32_t high_freq_vq[DCA_PRIM_CHANNELS_MAX][DCA_SUBBANDS]; ///< VQ encoded high frequency subbands
>
> float lfe_data[2 * DCA_LFE_MAX * (DCA_BLOCKS_MAX + 4)]; ///< Low frequency effect data
> int lfe_scale_factor;
> @@ -1088,14 +1084,6 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
> static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 };
> static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 };
>
> -#ifndef int8x8_fmul_int32
> -static inline void int8x8_fmul_int32(DCADSPContext *dsp, float *dst,
> - const int8_t *src, int scale)
> -{
> - dsp->int8x8_fmul_int32(dst, src, scale);
> -}
> -#endif
> -
> static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
> {
> int k, l;
> @@ -1215,20 +1203,16 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
> /*
> * Decode VQ encoded high frequencies
> */
> - for (l = s->vq_start_subband[k]; l < s->subband_activity[k]; l++) {
> - /* 1 vector -> 32 samples but we only need the 8 samples
> - * for this subsubframe. */
> - int hfvq = s->high_freq_vq[k][l];
> -
> + if (s->subband_activity[k] > s->vq_start_subband[k]) {
> if (!s->debug_flag & 0x01) {
> av_log(s->avctx, AV_LOG_DEBUG,
> "Stream with high frequencies VQ coding\n");
> s->debug_flag |= 0x01;
> }
> -
> - int8x8_fmul_int32(&s->dcadsp, subband_samples[k][l],
> - &high_freq_vq[hfvq][subsubframe * 8],
> - s->scale_factor[k][l][0]);
> + s->dcadsp.decode_hf(subband_samples[k], s->high_freq_vq[k],
> + high_freq_vq, subsubframe * 8,
> + s->scale_factor[k], s->vq_start_subband[k],
> + s->subband_activity[k]);
> }
> }
>
> diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
> index 30b732a..10fadd6 100644
> --- a/libavcodec/dcadsp.c
> +++ b/libavcodec/dcadsp.c
> @@ -24,12 +24,21 @@
> #include "libavutil/intreadwrite.h"
> #include "dcadsp.h"
>
> -static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
> +static void decode_hf_c(float dst[DCA_SUBBANDS][8],
> + const int32_t vq_num[DCA_SUBBANDS],
> + const int8_t hf_vq[1024][32], intptr_t vq_offset,
> + int32_t scale[DCA_SUBBANDS][2],
> + intptr_t start, intptr_t end)
> {
> - float fscale = scale / 16.0;
> - int i;
> - for (i = 0; i < 8; i++)
> - dst[i] = src[i] * fscale;
> + int l;
> + for (l = start; l < end; l++) {
> + /* 1 vector -> 32 samples but we only need the 8 samples
> + * for this subsubframe. */
> + int i, hfvq = vq_num[l];
> + float fscale = scale[l][0] / 16.0;
> + for (i = 0; i < 8; i++)
> + dst[l][i] = hf_vq[hfvq][vq_offset + i] * fscale;
> + }
> }
>
> static inline void
> @@ -97,7 +106,7 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
> s->lfe_fir[0] = dca_lfe_fir0_c;
> s->lfe_fir[1] = dca_lfe_fir1_c;
> s->qmf_32_subbands = dca_qmf_32_subbands;
> - s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
> + s->decode_hf = decode_hf_c;
> if (ARCH_ARM) ff_dcadsp_init_arm(s);
> if (ARCH_X86) ff_dcadsp_init_x86(s);
> }
> diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
> index e4c1bc7..0fa75a5 100644
> --- a/libavcodec/dcadsp.h
> +++ b/libavcodec/dcadsp.h
> @@ -22,6 +22,8 @@
> #include "avfft.h"
> #include "synth_filter.h"
>
> +#define DCA_SUBBANDS 32
> +
> typedef struct DCADSPContext {
> void (*lfe_fir[2])(float *out, const float *in, const float *coefs);
> void (*qmf_32_subbands)(float samples_in[32][8], int sb_act,
> @@ -30,7 +32,11 @@ typedef struct DCADSPContext {
> int *synth_buf_offset, float synth_buf2[32],
> const float window[512], float *samples_out,
> float raXin[32], float scale);
> - void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale);
> + void (*decode_hf)(float dst[DCA_SUBBANDS][8],
> + const int32_t vq_num[DCA_SUBBANDS],
> + const int8_t hf_vq[1024][32], intptr_t vq_offset,
> + int32_t scale[DCA_SUBBANDS][2],
> + intptr_t start, intptr_t end);
> } DCADSPContext;
>
> void ff_dcadsp_init(DCADSPContext *s);
OK and queued.
Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel