---
Hi,
Could someone test the patch below with a modern Intel CPU other than Atom?
I'm getting slower results for the SSE version than the MMX version on
Athlon64, but SSE is faster on Atom. I'm guessing it's another Athlon issue,
but it could be something else...
Thanks,
Justin
libavcodec/ac3dec.c | 23 ++++++-
libavcodec/fmtconvert.c | 20 ++++++
libavcodec/fmtconvert.h | 9 +++
libavcodec/x86/fmtconvert.asm | 137 +++++++++++++++++++++++++++++++++++++++
libavcodec/x86/fmtconvert_mmx.c | 30 +++++++++
5 files changed, 218 insertions(+), 1 deletions(-)
diff --git a/libavcodec/ac3dec.c b/libavcodec/ac3dec.c
index 015ebae..1c27f7d 100644
--- a/libavcodec/ac3dec.c
+++ b/libavcodec/ac3dec.c
@@ -24,6 +24,8 @@
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#define CONFIG_AUDIO_FLOAT 1
+
#include <stdio.h>
#include <stddef.h>
#include <math.h>
@@ -189,7 +191,11 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
av_lfg_init(&s->dith_state, 0);
/* set scale value for float to int16 conversion */
+#if CONFIG_AUDIO_FLOAT
+ s->mul_bias = 1.0f;
+#else
s->mul_bias = 32767.0f;
+#endif
/* allow downmixing to stereo or mono */
if (avctx->channels > 0 && avctx->request_channels > 0 &&
@@ -204,7 +210,12 @@ static av_cold int ac3_decode_init(AVCodecContext *avctx)
if (!s->input_buffer)
return AVERROR(ENOMEM);
+#if CONFIG_AUDIO_FLOAT
+ avctx->sample_fmt = AV_SAMPLE_FMT_FLT;
+#else
avctx->sample_fmt = AV_SAMPLE_FMT_S16;
+#endif
+
return 0;
}
@@ -1299,7 +1310,11 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
const uint8_t *buf = avpkt->data;
int buf_size = avpkt->size;
AC3DecodeContext *s = avctx->priv_data;
+#if CONFIG_AUDIO_FLOAT
+ float *out_samples = data;
+#else
int16_t *out_samples = (int16_t *)data;
+#endif
int blk, ch, err;
const uint8_t *channel_map;
const float *output[AC3_MAX_CHANNELS];
@@ -1405,10 +1420,16 @@ static int ac3_decode_frame(AVCodecContext * avctx, void *data, int *data_size,
av_log(avctx, AV_LOG_ERROR, "error decoding the audio block\n");
err = 1;
}
+ {START_TIMER;
+#if CONFIG_AUDIO_FLOAT
+ s->fmt_conv.float_interleave(out_samples, output, 256, s->out_channels);
+#else
s->fmt_conv.float_to_int16_interleave(out_samples, output, 256, s->out_channels);
+#endif
+ STOP_TIMER("float output");}
out_samples += 256 * s->out_channels;
}
- *data_size = s->num_blocks * 256 * avctx->channels * sizeof (int16_t);
+ *data_size = s->num_blocks * 256 * avctx->channels * sizeof(*out_samples);
return FFMIN(buf_size, s->frame_size);
}
diff --git a/libavcodec/fmtconvert.c b/libavcodec/fmtconvert.c
index e970755..58fece7 100644
--- a/libavcodec/fmtconvert.c
+++ b/libavcodec/fmtconvert.c
@@ -56,11 +56,31 @@ static void float_to_int16_interleave_c(int16_t *dst, const float **src,
}
}
+/**
+ * Interleave per-channel float arrays into one output array (C reference).
+ *
+ * Stores dst[i * channels + c] = src[c][i] for every 0 <= i < len and
+ * 0 <= c < channels.
+ *
+ * @param dst      destination; must hold len * channels floats and must not
+ *                 overlap any source array
+ * @param src      array of `channels` pointers, each to at least len floats
+ * @param len      number of samples per channel
+ * @param channels number of channels; nothing is written when it is <= 0
+ */
+void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
+                           int channels)
+{
+    unsigned int i;
+    int c;
+
+    if (channels == 2) {
+        /* Index in size_t so 2*i cannot wrap in unsigned int arithmetic. */
+        for (i = 0; i < len; i++) {
+            dst[2 * (size_t)i]     = src[0][i];
+            dst[2 * (size_t)i + 1] = src[1][i];
+        }
+    } else if (channels == 1 && len < INT_MAX / sizeof(float)) {
+        /* Mono is a straight copy; the bound keeps the byte count from
+         * overflowing, and longer inputs fall through to the loop below. */
+        memcpy(dst, src[0], len * sizeof(float));
+    } else {
+        for (c = 0; c < channels; c++) {
+            /* The output index is a size_t: a signed int here would
+             * overflow (undefined behavior) once len * channels exceeds
+             * INT_MAX. */
+            size_t j = c;
+            for (i = 0; i < len; i++, j += channels)
+                dst[j] = src[c][i];
+        }
+    }
+}
+
av_cold void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx)
{
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
c->float_to_int16 = float_to_int16_c;
c->float_to_int16_interleave = float_to_int16_interleave_c;
+ c->float_interleave = ff_float_interleave_c;
if (ARCH_ARM) ff_fmt_convert_init_arm(c, avctx);
if (HAVE_ALTIVEC) ff_fmt_convert_init_altivec(c, avctx);
diff --git a/libavcodec/fmtconvert.h b/libavcodec/fmtconvert.h
index e0afee4..d774113 100644
--- a/libavcodec/fmtconvert.h
+++ b/libavcodec/fmtconvert.h
@@ -68,8 +68,17 @@ typedef struct FmtConvertContext {
*/
void (*float_to_int16_interleave)(int16_t *dst, const float **src,
long len, int channels);
+
+ /**
+ * Convert multiple arrays of float to an array of interleaved float.
+ */
+ void (*float_interleave)(float *dst, const float **src, unsigned int len,
+ int channels);
} FmtConvertContext;
+void ff_float_interleave_c(float *dst, const float **src, unsigned int len,
+ int channels);
+
void ff_fmt_convert_init(FmtConvertContext *c, AVCodecContext *avctx);
void ff_fmt_convert_init_arm(FmtConvertContext *c, AVCodecContext *avctx);
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index ddcbab4..9732197 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -89,3 +89,140 @@ FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
+
+;-----------------------------------------------------------------------------
+; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
+;-----------------------------------------------------------------------------
+
+%macro FLOAT_INTERLEAVE6 1 ; %1 = function suffix (mmx or sse); interleaves 6 channels, 2 samples per channel per loop iteration
+cglobal float_interleave6_%1, 2,7,4, dst, src, src1, src2, src3, src4, src5
+%ifdef ARCH_X86_64
+ %define lend r10d
+ mov lend, r2d ; keep len in r10d; NOTE(review): presumably r2 is reused as src1 just below -- confirm against x86inc naming
+%else ; x86-32: lend is the memory operand r2m (presumably the stack slot of the 3rd argument -- confirm)
+ %define lend dword r2m
+%endif
+ mov src1q, [srcq+1*gprsize] ; src1q = src[1] (gprsize: presumably the pointer size from x86inc)
+ mov src2q, [srcq+2*gprsize] ; src2q = src[2]
+ mov src3q, [srcq+3*gprsize] ; src3q = src[3]
+ mov src4q, [srcq+4*gprsize] ; src4q = src[4]
+ mov src5q, [srcq+5*gprsize] ; src5q = src[5]
+ mov srcq, [srcq] ; srcq = src[0]; the pointer array itself is no longer needed
+ sub src1q, srcq ; src1..src5 become byte offsets from src[0], so advancing srcq advances every channel
+ sub src2q, srcq
+ sub src3q, srcq
+ sub src4q, srcq
+ sub src5q, srcq
+.loop:
+%ifidn %1, sse
+ movlps m0, [srcq] ; lane notation below: a..f = channels 0..5, digit = sample index
+ movhps m0, [srcq+src3q] ; m0 = a0 a1 d0 d1
+ movlps m1, [srcq+src1q]
+ movhps m1, [srcq+src4q] ; m1 = b0 b1 e0 e1
+ movlps m2, [srcq+src2q]
+ movhps m2, [srcq+src5q] ; m2 = c0 c1 f0 f1
+ movlhps m3, m0 ; this pair overwrites all of m3 with m0's halves swapped
+ movhlps m3, m0 ; m3 = d0 d1 a0 a1
+ unpcklps m0, m1 ; m0 = a0 b0 a1 b1
+ unpckhps m1, m2 ; m1 = e0 f0 e1 f1
+ unpcklps m2, m3 ; m2 = c0 d0 c1 d1
+ movlhps m3, m0
+ movhlps m3, m0 ; m3 = a1 b1 a0 b0
+ movlhps m0, m2 ; m0 = a0 b0 c0 d0
+ shufps m2, m1, 0xee ; m2 = c1 d1 e1 f1 (high halves of m2 and m1)
+ movlhps m1, m3 ; m1 = e0 f0 a1 b1
+ movaps [dstq ], m0 ; aligned stores: dstq must be 16-byte aligned
+ movaps [dstq+16], m1
+ movaps [dstq+32], m2
+%else ; mmx
+ movq m0, [srcq] ; m0 = a0 a1
+ movq m1, [srcq+src1q] ; m1 = b0 b1
+ movq m2, [srcq+src2q] ; m2 = c0 c1
+ movq m3, [srcq+src3q] ; m3 = d0 d1
+ movq m4, [srcq+src4q] ; m4 = e0 e1
+ movq m5, [srcq+src5q] ; m5 = f0 f1
+
+ movq m6, m0
+ punpckldq m6, m1 ; m6 = a0 b0
+ movq m7, m2
+ punpckldq m7, m3 ; m7 = c0 d0
+ movq [dstq ], m6
+ movq [dstq+8], m7
+
+ movq m6, m4
+ punpckldq m6, m5 ; m6 = e0 f0
+ movq m7, m0
+ punpckhdq m7, m1 ; m7 = a1 b1
+ movq [dstq+16], m6
+ movq [dstq+24], m7
+
+ movq m6, m2
+ punpckhdq m6, m3 ; m6 = c1 d1
+ movq m7, m4
+ punpckhdq m7, m5 ; m7 = e1 f1
+ movq [dstq+32], m6
+ movq [dstq+40], m7
+%endif
+ add srcq, 8 ; 2 floats consumed from each channel
+ add dstq, 48 ; 12 floats written
+ sub lend, 2
+ jg .loop ; body always runs at least once and always handles 2 samples, so len should be a positive multiple of 2
+%ifidn %1, mmx
+ emms ; leave MMX state so later x87 code sees a clean FPU
+%endif
+ REP_RET
+%endmacro
+
+INIT_MMX
+FLOAT_INTERLEAVE6 mmx ; emits float_interleave6_mmx
+INIT_XMM
+FLOAT_INTERLEAVE6 sse ; emits float_interleave6_sse
+
+;-----------------------------------------------------------------------------
+; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
+;-----------------------------------------------------------------------------
+
+; Interleave two channels: dst[2*i] = src[0][i], dst[2*i+1] = src[1][i].
+;   %1 = function suffix (mmx or sse)
+;   %2 = xmm register count handed to cglobal
+; MOVPS, PUNPCKLDQ and PUNPCKHDQ must be %defined for the target register
+; size before the macro is expanded (see the instantiations below).
+; Each iteration consumes 2*mmsize bytes (mmsize/2 floats) per channel and
+; the body always runs at least once, so len should be a positive multiple
+; of mmsize/2. The SSE variant uses movaps, so src[0], src[1] and dst must
+; all be 16-byte aligned.
+%macro FLOAT_INTERLEAVE2 2
+cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
+    mov     src1q, [srcq+gprsize]       ; src1q = src[1]
+    mov      srcq, [srcq        ]       ; srcq  = src[0]
+    sub     src1q, srcq                 ; src1q = byte offset of src[1] from src[0]
+.loop:
+    MOVPS      m0, [srcq             ]  ; first vector of channel 0
+    MOVPS      m1, [srcq+src1q       ]  ; first vector of channel 1
+    MOVPS      m3, [srcq      +mmsize]  ; second vector of channel 0
+    MOVPS      m4, [srcq+src1q+mmsize]  ; second vector of channel 1
+
+    MOVPS      m2, m0
+    PUNPCKLDQ  m0, m1                   ; low halves of first vectors, interleaved
+    PUNPCKHDQ  m2, m1                   ; high halves of first vectors, interleaved
+
+    MOVPS      m1, m3
+    PUNPCKLDQ  m3, m4                   ; low halves of second vectors, interleaved
+    PUNPCKHDQ  m1, m4                   ; high halves of second vectors, interleaved
+
+    MOVPS [dstq         ], m0
+    MOVPS [dstq+1*mmsize], m2
+    MOVPS [dstq+2*mmsize], m3
+    MOVPS [dstq+3*mmsize], m1
+
+    add      srcq, mmsize*2
+    add      dstq, mmsize*4
+    sub      lend, mmsize/2             ; floats handled per channel this iteration
+    jg .loop
+%ifidn %1, mmx
+    emms                                ; leave MMX state so later x87 code sees a clean FPU
+%endif
+    REP_RET
+%endmacro
+
+INIT_MMX
+%define MOVPS     movq
+%define PUNPCKLDQ punpckldq
+%define PUNPCKHDQ punpckhdq
+FLOAT_INTERLEAVE2 mmx, 0
+INIT_XMM
+%define MOVPS     movaps
+%define PUNPCKLDQ unpcklps
+%define PUNPCKHDQ unpckhps
+FLOAT_INTERLEAVE2 sse, 5
diff --git a/libavcodec/x86/fmtconvert_mmx.c b/libavcodec/x86/fmtconvert_mmx.c
index 847bd80..61a4272 100644
--- a/libavcodec/x86/fmtconvert_mmx.c
+++ b/libavcodec/x86/fmtconvert_mmx.c
@@ -235,11 +235,40 @@ static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long
float_to_int16_interleave_3dnow(dst, src, len, channels);
}
+void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len); /* 2-channel interleave, MMX; see FLOAT_INTERLEAVE2 in x86/fmtconvert.asm */
+void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len); /* 2-channel interleave, SSE; see FLOAT_INTERLEAVE2 in x86/fmtconvert.asm */
+
+void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len); /* 6-channel interleave, MMX; see FLOAT_INTERLEAVE6 in x86/fmtconvert.asm */
+void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len); /* 6-channel interleave, SSE; see FLOAT_INTERLEAVE6 in x86/fmtconvert.asm */
+
+/**
+ * MMX front end for float interleaving: the 2- and 6-channel layouts are
+ * handed to the dedicated MMX routines, every other channel count goes to
+ * the generic C implementation.
+ */
+static void float_interleave_mmx(float *dst, const float **src,
+                                 unsigned int len, int channels)
+{
+    switch (channels) {
+    case 2:
+        ff_float_interleave2_mmx(dst, src, len);
+        break;
+    case 6:
+        ff_float_interleave6_mmx(dst, src, len);
+        break;
+    default:
+        ff_float_interleave_c(dst, src, len, channels);
+        break;
+    }
+}
+
+/**
+ * SSE front end for float interleaving: the 2- and 6-channel layouts are
+ * handed to the dedicated SSE routines, every other channel count goes to
+ * the generic C implementation.
+ */
+static void float_interleave_sse(float *dst, const float **src,
+                                 unsigned int len, int channels)
+{
+    switch (channels) {
+    case 2:
+        ff_float_interleave2_sse(dst, src, len);
+        break;
+    case 6:
+        ff_float_interleave6_sse(dst, src, len);
+        break;
+    default:
+        ff_float_interleave_c(dst, src, len, channels);
+        break;
+    }
+}
+
void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
int mm_flags = av_get_cpu_flags();
if (mm_flags & AV_CPU_FLAG_MMX) {
+ c->float_interleave = float_interleave_mmx;
if(mm_flags & AV_CPU_FLAG_3DNOW){
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -256,6 +285,7 @@ void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
c->float_to_int16 = float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
+ c->float_interleave = float_interleave_sse;
}
if(mm_flags & AV_CPU_FLAG_SSE2){
c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel