The change is split into two patches to ease reviewing. The first patch
adds the infrastructure, copies the h264 asm and renames the functions.
The second patch changes the functions to the rv40 variants. The
modifications are just the straightforward implementation of the
variable bias that rv40 uses instead of the rounded shift.
---8<---
based on the h264 asm
3% faster overall for the first 30 seconds of bourne.rmvb
57.845s vs 59.627s (fastest out of 10 tries each) on pandaboard
---
libavcodec/arm/Makefile | 3 +
libavcodec/arm/rv40dsp_init_neon.c | 42 +++++
libavcodec/arm/rv40dsp_neon.S | 295 ++++++++++++++++++++++++++++++++++++
libavcodec/rv34dsp.h | 1 +
libavcodec/rv40dsp.c | 2 +
5 files changed, 343 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/rv40dsp_init_neon.c
create mode 100644 libavcodec/arm/rv40dsp_neon.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 3374f0e..d6a6961 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -62,6 +62,8 @@ NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
arm/synth_filter_neon.o \
+NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_neon.o \
+
NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
NEON-OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_neon.o \
@@ -77,5 +79,6 @@ OBJS-$(HAVE_NEON) +=
arm/dsputil_init_neon.o \
arm/fmtconvert_neon.o \
arm/int_neon.o \
arm/mpegvideo_neon.o \
+ arm/rv40dsp_init_neon.o \
arm/simple_idct_neon.o \
$(NEON-OBJS-yes)
diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c
new file mode 100644
index 0000000..1a7a213
--- /dev/null
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@@ -0,0 +1,42 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2011 Janne Grunau <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/rv34dsp.h"
+
+void ff_put_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+
+/* Install the NEON chroma motion-compensation routines into the RV40 DSP
+ * context: slot 0 takes the mc8 function and slot 1 the mc4 function, for
+ * both the put and the avg tables. The dsp argument is not used here. */
+void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp)
+{
+    c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
+    c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
+    c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
+    c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
+}
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
new file mode 100644
index 0000000..3a465a2
--- /dev/null
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -0,0 +1,295 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+
+/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+ .macro rv40_chroma_mc8 type
+function ff_\type\()_rv40_chroma_mc8_neon, export=1
+ push {r4-r7, lr}
+ ldrd r4, [sp, #20]
+.ifc \type,avg
+ mov lr, r0
+.endif
+ pld [r1]
+ pld [r1, r2]
+
+A muls r7, r4, r5
+T mul r7, r4, r5
+T cmp r7, #0
+ rsb r6, r7, r5, lsl #3
+ rsb ip, r7, r4, lsl #3
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ add r5, r1, r2
+
+ vdup.8 d0, r4
+ lsl r4, r2, #1
+ vdup.8 d1, ip
+ vld1.64 {d4, d5}, [r1], r4
+ vdup.8 d2, r6
+ vld1.64 {d6, d7}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+1: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r1], r4
+ vmlal.u8 q8, d6, d2
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vmlal.u8 q9, d7, d1
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+ vrshrn.u16 d16, q8, #6
+ vld1.64 {d6, d7}, [r5], r4
+ pld [r1]
+ vrshrn.u16 d17, q9, #6
+.ifc \type,avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 1b
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+
+ beq 4f
+
+ add r5, r1, r2
+ lsl r4, r2, #1
+ vld1.64 {d4}, [r1], r4
+ vld1.64 {d6}, [r5], r4
+
+3: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+ vld1.64 {d4}, [r1], r4
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d1
+ vld1.64 {d6}, [r5], r4
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+.ifc \type,avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ subs r3, r3, #2
+ pld [r1]
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.64 {d4, d5}, [r1], r2
+ vld1.64 {d6, d7}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+5: pld [r1]
+ subs r3, r3, #2
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d7, d1
+ pld [r1]
+ vext.8 d5, d4, d5, #1
+ vrshrn.u16 d16, q8, #6
+ vrshrn.u16 d17, q9, #6
+.ifc \type,avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ vld1.64 {d6, d7}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 5b
+
+ pop {r4-r7, pc}
+endfunc
+ .endm
+
+/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
+ .macro rv40_chroma_mc4 type
+function ff_\type\()_rv40_chroma_mc4_neon, export=1
+ push {r4-r7, lr}
+ ldrd r4, [sp, #20]
+.ifc \type,avg
+ mov lr, r0
+.endif
+ pld [r1]
+ pld [r1, r2]
+
+A muls r7, r4, r5
+T mul r7, r4, r5
+T cmp r7, #0
+ rsb r6, r7, r5, lsl #3
+ rsb ip, r7, r4, lsl #3
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64
+
+ beq 2f
+
+ add r5, r1, r2
+
+ vdup.8 d0, r4
+ lsl r4, r2, #1
+ vdup.8 d1, ip
+ vld1.64 {d4}, [r1], r4
+ vdup.8 d2, r6
+ vld1.64 {d6}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+ vtrn.32 d0, d1
+ vtrn.32 d2, d3
+
+1: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2
+ vld1.64 {d4}, [r1], r4
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2
+ vld1.64 {d6}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+ subs r3, r3, #2
+ pld [r1]
+.ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 1b
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6
+ add ip, ip, r6
+ vdup.8 d0, r4
+ vdup.8 d1, ip
+ vtrn.32 d0, d1
+
+ beq 4f
+
+ vext.32 d1, d0, d1, #1
+ add r5, r1, r2
+ lsl r4, r2, #1
+ vld1.32 {d4[0]}, [r1], r4
+ vld1.32 {d4[1]}, [r5], r4
+
+3: pld [r5]
+ vmull.u8 q8, d4, d0
+ vld1.32 {d4[0]}, [r1], r4
+ vmull.u8 q9, d4, d1
+ vld1.32 {d4[1]}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vrshrn.u16 d16, q8, #6
+.ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ subs r3, r3, #2
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.64 {d4}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5
+ vtrn.32 d6, d7
+
+5: vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vld1.64 {d4}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ pld [r1]
+ vrshrn.u16 d16, q8, #6
+.ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ vld1.64 {d6}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 5b
+
+ pop {r4-r7, pc}
+endfunc
+ .endm
+
+
+ .text
+ .align
+
+ rv40_chroma_mc8 put
+ rv40_chroma_mc8 avg
+ rv40_chroma_mc4 put
+ rv40_chroma_mc4 avg
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index e1def7d..935b3d9 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -46,5 +46,6 @@ void ff_rv30dsp_init(RV34DSPContext *c, DSPContext* dsp);
void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp);
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp);
+void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
#endif /* AVCODEC_RV34DSP_H */
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index c54f965..f116c20 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -370,4 +370,6 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp) {
if (HAVE_MMX)
ff_rv40dsp_init_x86(c, dsp);
+ if (HAVE_NEON)
+ ff_rv40dsp_init_neon(c, dsp);
}
--
1.7.6.1
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel