2.5-3 times faster, 5% overall speedup on bourne.rvmb
---
libavcodec/arm/Makefile | 4 +
libavcodec/arm/rv30dsp_init_neon.c | 36 +++++++++++
libavcodec/arm/rv34dsp_neon.S | 121 ++++++++++++++++++++++++++++++++++++
libavcodec/arm/rv40dsp_init_neon.c | 6 ++
libavcodec/rv30dsp.c | 3 +
libavcodec/rv34dsp.h | 2 +
6 files changed, 172 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/rv30dsp_init_neon.c
create mode 100644 libavcodec/arm/rv34dsp_neon.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index d6a6961..d680f6e 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -62,7 +62,10 @@ NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
arm/synth_filter_neon.o \
+NEON-OBJS-$(CONFIG_RV30_DECODER) += arm/rv34dsp_neon.o \
+
NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_neon.o \
+ arm/rv34dsp_neon.o \
NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
@@ -79,6 +82,7 @@ OBJS-$(HAVE_NEON) +=
arm/dsputil_init_neon.o \
arm/fmtconvert_neon.o \
arm/int_neon.o \
arm/mpegvideo_neon.o \
+ arm/rv30dsp_init_neon.o \
arm/rv40dsp_init_neon.o \
arm/simple_idct_neon.o \
$(NEON-OBJS-yes)
diff --git a/libavcodec/arm/rv30dsp_init_neon.c b/libavcodec/arm/rv30dsp_init_neon.c
new file mode 100644
index 0000000..0943db4
--- /dev/null
+++ b/libavcodec/arm/rv30dsp_init_neon.c
@@ -0,0 +1,36 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2011 Janne Grunau <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/rv34dsp.h"
+
+void ff_rv34_inv_transform_neon(DCTELEM *block);
+void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
+
+void ff_rv30dsp_init_neon(RV34DSPContext *c, DSPContext *dsp)
+{
+    /* Slot 0: NEON inverse transform, rounding variant. */
+    c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
+    /* Slot 1: NEON inverse transform, "noround" variant. */
+    c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
+}
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
new file mode 100644
index 0000000..9414db2
--- /dev/null
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2011 Janne Grunau <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavcodec/arm/asm.S"
+/* Row pass, bK = block[i+8*K]; results in q1, q2, q3, q8; consts in d0/d1 */
+.macro rv34_row_transform
+        mov             r1, #16             @ load stride in bytes
+        mov             r3, #13
+        vmov.s32        d0[0], r3           @ d0[0] (32 bit) = 13
+        vld1.16         {d28}, [r0], r1     @ d28 = b0
+        vld1.16         {d29}, [r0], r1     @ d29 = b1
+        mov             r3, #7
+        vmov.s16        d0[2], r3           @ d0[2] (16 bit) = 7
+        vmov.s32        d1[0], r3           @ d1[0] (32 bit) = 7
+        mov             r3, #17
+        vmov.s16        d0[3], r3           @ d0[3] (16 bit) = 17
+        vmov.s32        d1[1], r3           @ d1[1] (32 bit) = 17
+        vld1.16         {d30}, [r0], r1     @ d30 = b2
+        vmull.s16       q12, d29, d0[2]     @ q12 = 7*b1
+        vld1.16         {d31}, [r0], r1     @ d31 = b3
+        vmull.s16       q13, d29, d0[3]     @ q13 = 17*b1
+        vaddl.s16       q10, d28, d30       @ b0 + b2
+        vmul.i32        q10, q10, d0[0]     @ z0 = 13*(b0 + b2)
+        vsubl.s16       q11, d28, d30       @ b0 - b2
+        vmul.i32        q11, q11, d0[0]     @ z1 = 13*(b0 - b2)
+        vmlsl.s16       q12, d31, d0[3]     @ z2 = 7*b1 - 17*b3
+        vadd.s32        q2, q11, q12        @ q2 = z1 + z2
+        vsub.s32        q3, q11, q12        @ q3 = z1 - z2
+        vmlal.s16       q13, d31, d0[2]     @ z3 = 17*b1 + 7*b3
+        vadd.s32        q1, q10, q13        @ q1 = z0 + z3
+        vsub.s32        q8, q10, q13        @ q8 = z0 - z3
+.endm
+
+ .text
+ .align
+
+/* void ff_rv34_inv_transform_neon(DCTELEM *block);  (rounding variant) */
+function ff_rv34_inv_transform_neon, export=1
+        pld             [r0]
+        mov             r2, r0              @ keep block for the stores
+        rv34_row_transform
+        vtrn.32         q1, q2              @ 4x4 transpose of the 32-bit
+        vtrn.32         q3, q8              @ matrix in q1, q2, q3, q8
+        vswp            d3, d6
+        vswp            d5, d16
+        vadd.s32        q10, q1, q3         @ t0 + t2, tK = temp[4*K+i]
+        vsub.s32        q11, q1, q3         @ t0 - t2
+        vmul.s32        q12, q2, d1[0]      @ 7*t1
+        vmul.s32        q9, q2, d1[1]       @ 17*t1
+        vmul.s32        q13, q11, d0[0]     @ z1 = 13*(t0 - t2)
+        vmls.s32        q12, q8, d1[1]      @ z2 = 7*t1 - 17*t3
+        vmul.s32        q14, q10, d0[0]     @ z0 = 13*(t0 + t2)
+        vmla.s32        q9, q8, d1[0]       @ z3 = 17*t1 + 7*t3
+        vadd.s32        q2, q13, q12        @ z1 + z2
+        vadd.s32        q1, q14, q9         @ z0 + z3
+        vsub.s32        q3, q13, q12        @ z1 - z2
+        vrshrn.s32      d1, q2, #10         @ (z1 + z2 + 0x200) >> 10
+        vrshrn.s32      d0, q1, #10         @ (z0 + z3 + 0x200) >> 10
+        vsub.s32        q8, q14, q9         @ z0 - z3 (not q4: callee-saved)
+        vrshrn.s32      d2, q3, #10         @ (z1 - z2 + 0x200) >> 10
+        vrshrn.s32      d3, q8, #10         @ (z0 - z3 + 0x200) >> 10
+        vst4.16         {d0[0], d1[0], d2[0], d3[0]}, [r2], r1  @ row 0
+        vst4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2], r1  @ row 1
+        vst4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2], r1  @ row 2
+        vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2], r1  @ row 3
+        bx              lr
+endfunc
+
+/* void ff_rv34_inv_transform_noround_neon(DCTELEM *block);  (*3 >> 11) */
+function ff_rv34_inv_transform_noround_neon, export=1
+        pld             [r0]
+        mov             r2, r0              @ keep block for the stores
+        rv34_row_transform
+        vtrn.32         q1, q2              @ 4x4 transpose of the 32-bit
+        vtrn.32         q3, q8              @ matrix in q1, q2, q3, q8
+        vswp            d3, d6
+        vswp            d5, d16
+        vadd.s32        q10, q1, q3         @ t0 + t2, tK = temp[4*K+i]
+        vsub.s32        q11, q1, q3         @ t0 - t2
+        vmul.s32        q15, q2, d1[0]      @ 7*t1
+        vmls.s32        q15, q8, d1[1]      @ z2 = 7*t1 - 17*t3
+        vmul.s32        q9, q2, d1[1]       @ 17*t1
+        vmla.s32        q9, q8, d1[0]       @ z3 = 17*t1 + 7*t3
+        vmov.s32        d1, #3              @ 7/17 are dead; d1 = {3, 3}
+        vmul.s32        q14, q10, d0[0]     @ z0 = 13*(t0 + t2)
+        vmul.s32        q8, q11, d0[0]      @ z1 = 13*(t0 - t2)
+        vadd.s32        q10, q14, q9        @ z0 + z3
+        vadd.s32        q11, q8, q15        @ z1 + z2
+        vsub.s32        q12, q8, q15        @ z1 - z2
+        vsub.s32        q13, q14, q9        @ z0 - z3
+        vmul.s32        q10, q10, d1[0]     @ (z0 + z3)*3
+        vmul.s32        q11, q11, d1[0]     @ (z1 + z2)*3
+        vmul.s32        q12, q12, d1[0]     @ (z1 - z2)*3
+        vmul.s32        q13, q13, d1[0]     @ (z0 - z3)*3
+        vshrn.s32       d0, q10, #11        @ (z0 + z3)*3 >> 11
+        vshrn.s32       d1, q11, #11        @ (z1 + z2)*3 >> 11
+        vshrn.s32       d2, q12, #11        @ (z1 - z2)*3 >> 11
+        vshrn.s32       d3, q13, #11        @ (z0 - z3)*3 >> 11
+        vst4.16         {d0[0], d1[0], d2[0], d3[0]}, [r2], r1  @ row 0
+        vst4.16         {d0[1], d1[1], d2[1], d3[1]}, [r2], r1  @ row 1
+        vst4.16         {d0[2], d1[2], d2[2], d3[2]}, [r2], r1  @ row 2
+        vst4.16         {d0[3], d1[3], d2[3], d3[3]}, [r2], r1  @ row 3
+        bx              lr
+endfunc
\ No newline at end of file
diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c
index 6b34a00..81c2e13 100644
--- a/libavcodec/arm/rv40dsp_init_neon.c
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@@ -30,6 +30,9 @@ void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_rv34_inv_transform_neon(DCTELEM *block);
+void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
+
void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
{
@@ -38,5 +41,8 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
+ c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
+ c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
+
return;
}
diff --git a/libavcodec/rv30dsp.c b/libavcodec/rv30dsp.c
index bcd1a46..3187cc2 100644
--- a/libavcodec/rv30dsp.c
+++ b/libavcodec/rv30dsp.c
@@ -297,4 +297,7 @@ av_cold void ff_rv30dsp_init(RV34DSPContext *c, DSPContext* dsp) {
c->put_chroma_pixels_tab[1] = dsp->put_h264_chroma_pixels_tab[1];
c->avg_chroma_pixels_tab[0] = dsp->avg_h264_chroma_pixels_tab[0];
c->avg_chroma_pixels_tab[1] = dsp->avg_h264_chroma_pixels_tab[1];
+
+ if (HAVE_NEON)
+ ff_rv30dsp_init_neon(c, dsp);
}
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index 5f1f359..16e1823 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -49,6 +49,8 @@ void ff_rv30dsp_init(RV34DSPContext *c, DSPContext* dsp);
void ff_rv34dsp_init(RV34DSPContext *c, DSPContext* dsp);
void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp);
+void ff_rv30dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
+
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp);
void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
--
1.7.7
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel