int8x8_fmul_int32 is currently declared as a macro that resolves to an
inlinable function, one variant of which is a NEON implementation.
Add a DSP parameter to the macro, so that it can expand either to an
inline function or to a call to the function pointer stored in the DSP
context; the latter is the default.
On an Arrandale CPU, the gain from inlining an SSE2 function instead of calling it:
- Win32: 29 to 26 cycles
- Win64: 25 to 23 cycles
---
libavcodec/arm/dca.h | 5 +++--
libavcodec/dcadec.c | 16 +++-------------
libavcodec/dcadsp.c | 9 +++++++++
libavcodec/dcadsp.h | 4 ++++
4 files changed, 19 insertions(+), 15 deletions(-)
diff --git a/libavcodec/arm/dca.h b/libavcodec/arm/dca.h
index 39ec2b6..6b4d5c3 100644
--- a/libavcodec/arm/dca.h
+++ b/libavcodec/arm/dca.h
@@ -82,8 +82,8 @@ static inline int decode_blockcodes(int code1, int code2, int levels,
#if HAVE_NEON_INLINE && HAVE_ASM_MOD_Y
-#define int8x8_fmul_int32 int8x8_fmul_int32
-static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
+#undef int8x8_fmul_int32
+static void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
{
__asm__ ("vcvt.f32.s32 %2, %2, #4 \n"
"vld1.8 {d0}, [%1,:64] \n"
@@ -99,6 +99,7 @@ static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
: "r"(src), "x"(scale)
: "d0", "d1", "d2", "d3");
}
+#define int8x8_fmul_int32(dsp) int8x8_fmul_int32
#endif
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index f9e39bc..a3ca02c 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -1085,16 +1085,6 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
static const uint8_t abits_sizes[7] = { 7, 10, 12, 13, 15, 17, 19 };
static const uint8_t abits_levels[7] = { 3, 5, 7, 9, 13, 17, 25 };
-#ifndef int8x8_fmul_int32
-static inline void int8x8_fmul_int32(float *dst, const int8_t *src, int scale)
-{
- float fscale = scale / 16.0;
- int i;
- for (i = 0; i < 8; i++)
- dst[i] = src[i] * fscale;
-}
-#endif
-
static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
{
int k, l;
@@ -1219,9 +1209,9 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
s->debug_flag |= 0x01;
}
- int8x8_fmul_int32(subband_samples[k][l],
- &high_freq_vq[hfvq][subsubframe * 8],
- s->scale_factor[k][l][0]);
+ int8x8_fmul_int32(s->dcadsp)(subband_samples[k][l],
+ &high_freq_vq[hfvq][subsubframe * 8],
+ s->scale_factor[k][l][0]);
}
}
diff --git a/libavcodec/dcadsp.c b/libavcodec/dcadsp.c
index 57d716e..b984864 100644
--- a/libavcodec/dcadsp.c
+++ b/libavcodec/dcadsp.c
@@ -24,6 +24,14 @@
#include "libavutil/intreadwrite.h"
#include "dcadsp.h"
+static void int8x8_fmul_int32_c(float *dst, const int8_t *src, int scale)
+{
+ float fscale = scale / 16.0;
+ int i;
+ for (i = 0; i < 8; i++)
+ dst[i] = src[i] * fscale;
+}
+
static void dca_lfe_fir_c(float *out, const float *in, const float *coefs,
int decifactor, float scale)
{
@@ -78,5 +86,6 @@ av_cold void ff_dcadsp_init(DCADSPContext *s)
{
s->lfe_fir = dca_lfe_fir_c;
s->qmf_32_subbands = dca_qmf_32_subbands;
+ s->int8x8_fmul_int32 = int8x8_fmul_int32_c;
if (ARCH_ARM) ff_dcadsp_init_arm(s);
}
diff --git a/libavcodec/dcadsp.h b/libavcodec/dcadsp.h
index ec88be7..3feea9f 100644
--- a/libavcodec/dcadsp.h
+++ b/libavcodec/dcadsp.h
@@ -31,8 +31,12 @@ typedef struct DCADSPContext {
int *synth_buf_offset, float synth_buf2[32],
const float window[512], float *samples_out,
float raXin[32], float scale);
+ void (*int8x8_fmul_int32)(float *dst, const int8_t *src, int scale);
} DCADSPContext;
+/** Default define to allow switching from inlinable function to dsp */
+#define int8x8_fmul_int32(dsp) dsp.int8x8_fmul_int32
+
void ff_dcadsp_init(DCADSPContext *s);
void ff_dcadsp_init_arm(DCADSPContext *s);
--
1.8.0.msysgit.0
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel