---
Hi,

Could someone test the patch below with a modern Intel CPU other than Atom?
I'm getting slower results for the SSE version than the MMX version on
Athlon64, but SSE is faster on Atom.  I'm guessing it's another Athlon issue,
but it could be something else...

Thanks,
Justin

 libavcodec/ac3dec.c             |   23 ++++++-
 libavcodec/fmtconvert.c         |   20 ++++++
 libavcodec/fmtconvert.h         |    9 +++
 libavcodec/x86/fmtconvert.asm   |  137 +++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/fmtconvert_mmx.c |   30 +++++++++
 5 files changed, 218 insertions(+), 1 deletions(-)

diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index 015ebae..1c27f7d 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -24,6 +24,8 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
+#define CONFIG_AUDIO_FLOAT 1
+
 #include <stdio.h>
 #include <stddef.h>
 #include <math.h>
@@ -189,7 +191,11 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
     av_lfg_init(&s->dith_state, 0);
 
     /* set scale value for float to int16 conversion */
+#if CONFIG_AUDIO_FLOAT
+    s->mul_bias = 1.0f;
+#else
     s->mul_bias = 32767.0f;
+#endif
 
     /* allow downmixing to stereo or mono */
     if (avctx->channels > 0 && avctx->request_channels > 0 &&
@@ -204,7 +210,12 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
         if (!s->input_buffer)
             return AVERROR(ENOMEM);
 
+#if CONFIG_AUDIO_FLOAT
+    avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
+#else
     avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+#endif
+
     return 0;
 }
 
@@ -1299,7 +1310,11 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
     const uint8_t *buf = avpkt->data;
     int buf_size = avpkt->size;
     AC3DecodeContext *s = avctx->priv_data;
+#if CONFIG_AUDIO_FLOAT
+    float *out_samples = data;
+#else
     int16_t *out_samples = (int16_t *)data;
+#endif
     int blk, ch, err;
     const uint8_t *channel_map;
     const float *output[AC3_MAX_CHANNELS];
@@ -1405,10 +1420,16 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
             av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
             err = 1;
         }
+        {START_TIMER;
+#if CONFIG_AUDIO_FLOAT
+        s->fmt_conv.float_interleave(out_samples, output, 256, s->out_channels);
+#else
         s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
+#endif
+        STOP_TIMER("float output");}
         out_samples += 256 * s->out_channels;
     }
-    *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
+    *data_size = s->num_blocks * 256 * avctx->channels * sizeof(*out_samples);
     return FFMIN(buf_size, s->frame_size);
 }
 
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index e970755..58fece7 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -56,11 +56,31 @@ static void float_to_int16_interleave_c(int16_t *dst, const float **src,
     }
 }
 
+void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
+                           int channels)
+{
+    int j, c;
+    unsigned int i;
+    if (channels == 2) {
+        for (i = 0; i < len; i++) {
+            dst[2*i]   = src[0][i];
+            dst[2*i+1] = src[1][i];
+        }
+    } else if (channels == 1 && len < INT_MAX / sizeof(float)) {
+        memcpy(dst, src[0], len * sizeof(float));
+    } else {
+        for (c = 0; c < channels; c++)
+            for (i = 0, j = c; i < len; i++, j += channels)
+                dst[j] = src[c][i];
+    }
+}
+
 av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
 {
     c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
     c->float_to_int16             = float_to_int16_c;
     c->float_to_int16_interleave  = float_to_int16_interleave_c;
+    c->float_interleave           = ff_float_interleave_c;
 
     if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
     if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index e0afee4..d774113 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -68,8 +68,17 @@ typedef struct FmtConvertContext {
      */
     void (*float_to_int16_interleave)(int16_t *dst, const float **src,
                                       long len, int channels);
+
+    /**
+     * Convert an array of interleaved float to multiple arrays of float.
+     */
+    void (*float_interleave)(float *dst, const float **src, unsigned int len,
+                             int channels);
 } FmtConvertContext;
 
+void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
+                           int channels);
+
 void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
 
 void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index ddcbab4..9732197 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -89,3 +89,140 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow
 %undef pswapd
 FLOAT_TO_INT16_INTERLEAVE6 3dn2
 %undef cvtps2pi
+
+;-----------------------------------------------------------------------------
+; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
+;-----------------------------------------------------------------------------
+
+%macro FLOAT_INTERLEAVE6 1
+cglobal float_interleave6_%1, 2,7,4, dst, src, src1, src2, src3, src4, src5
+%ifdef ARCH_X86_64
+    %define lend r10d
+    mov     lend, r2d
+%else
+    %define lend dword r2m
+%endif
+    mov    src1q, [srcq+1*gprsize]
+    mov    src2q, [srcq+2*gprsize]
+    mov    src3q, [srcq+3*gprsize]
+    mov    src4q, [srcq+4*gprsize]
+    mov    src5q, [srcq+5*gprsize]
+    mov     srcq, [srcq]
+    sub    src1q, srcq
+    sub    src2q, srcq
+    sub    src3q, srcq
+    sub    src4q, srcq
+    sub    src5q, srcq
+.loop:
+%ifidn %1, sse
+    movlps    m0, [srcq]
+    movhps    m0, [srcq+src3q]
+    movlps    m1, [srcq+src1q]
+    movhps    m1, [srcq+src4q]
+    movlps    m2, [srcq+src2q]
+    movhps    m2, [srcq+src5q]
+    movlhps   m3, m0
+    movhlps   m3, m0
+    unpcklps  m0, m1
+    unpckhps  m1, m2
+    unpcklps  m2, m3
+    movlhps   m3, m0
+    movhlps   m3, m0
+    movlhps   m0, m2
+    shufps    m2, m1, 0xee
+    movlhps   m1, m3
+    movaps [dstq   ], m0
+    movaps [dstq+16], m1
+    movaps [dstq+32], m2
+%else ; mmx
+    movq       m0, [srcq]
+    movq       m1, [srcq+src1q]
+    movq       m2, [srcq+src2q]
+    movq       m3, [srcq+src3q]
+    movq       m4, [srcq+src4q]
+    movq       m5, [srcq+src5q]
+
+    movq       m6, m0
+    punpckldq  m6, m1
+    movq       m7, m2
+    punpckldq  m7, m3
+    movq [dstq  ], m6
+    movq [dstq+8], m7
+
+    movq       m6, m4
+    punpckldq  m6, m5
+    movq       m7, m0
+    punpckhdq  m7, m1
+    movq [dstq+16], m6
+    movq [dstq+24], m7
+
+    movq       m6, m2
+    punpckhdq  m6, m3
+    movq       m7, m4
+    punpckhdq  m7, m5
+    movq [dstq+32], m6
+    movq [dstq+40], m7
+%endif
+    add      srcq, 8
+    add      dstq, 48
+    sub      lend, 2
+    jg .loop
+%ifidn %1, mmx
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_MMX
+FLOAT_INTERLEAVE6 mmx
+INIT_XMM
+FLOAT_INTERLEAVE6 sse
+
+;-----------------------------------------------------------------------------
+; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
+;-----------------------------------------------------------------------------
+
+%macro FLOAT_INTERLEAVE2 2
+cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
+    mov     src1q, [srcq+gprsize]
+    mov      srcq, [srcq        ]
+    sub     src1q, srcq
+.loop
+    MOVPS      m0, [srcq             ]
+    MOVPS      m1, [srcq+src1q       ]
+    MOVPS      m3, [srcq      +mmsize]
+    MOVPS      m4, [srcq+src1q+mmsize]
+
+    MOVPS      m2, m0
+    PUNPCKLDQ  m0, m1
+    PUNPCKHDQ  m2, m1
+
+    MOVPS      m1, m3
+    PUNPCKLDQ  m3, m4
+    PUNPCKHDQ  m1, m4
+
+    MOVPS [dstq         ], m0
+    MOVPS [dstq+1*mmsize], m2
+    MOVPS [dstq+2*mmsize], m3
+    MOVPS [dstq+3*mmsize], m1
+
+    add      srcq, mmsize*2
+    add      dstq, mmsize*4
+    sub      lend, mmsize/2
+    jg .loop
+%ifidn %1, mmx
+    emms
+%endif
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define MOVPS     movq
+%define PUNPCKLDQ punpckldq
+%define PUNPCKHDQ punpckhdq
+FLOAT_INTERLEAVE2 mmx, 0
+INIT_XMM
+%define MOVPS     movaps
+%define PUNPCKLDQ unpcklps
+%define PUNPCKHDQ unpckhps
+FLOAT_INTERLEAVE2 sse, 5
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 847bd80..61a4272 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -235,11 +235,40 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
         float_to_int16_interleave_3dnow(dst, src, len, channels);
 }
 
+void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
+void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
+
+void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
+void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
+
+static void float_interleave_mmx(float *dst, const float **src,
+                                 unsigned int len, int channels)
+{
+    if (channels == 2) {
+        ff_float_interleave2_mmx(dst, src, len);
+    } else if (channels == 6)
+        ff_float_interleave6_mmx(dst, src, len);
+    else
+        ff_float_interleave_c(dst, src, len, channels);
+}
+
+static void float_interleave_sse(float *dst, const float **src,
+                                 unsigned int len, int channels)
+{
+    if (channels == 2) {
+        ff_float_interleave2_sse(dst, src, len);
+    } else if (channels == 6)
+        ff_float_interleave6_sse(dst, src, len);
+    else
+        ff_float_interleave_c(dst, src, len, channels);
+}
+
 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
 {
     int mm_flags = av_get_cpu_flags();
 
     if (mm_flags & AV_CPU_FLAG_MMX) {
+        c->float_interleave = float_interleave_mmx;
 
         if(mm_flags & AV_CPU_FLAG_3DNOW){
             if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -256,6 +285,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
             c->float_to_int16 = float_to_int16_sse;
             c->float_to_int16_interleave = float_to_int16_interleave_sse;
+            c->float_interleave = float_interleave_sse;
         }
         if(mm_flags & AV_CPU_FLAG_SSE2){
             c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to