Based on the H.264 NEON optimizations.
The NEON functions are 2.7 times faster than the C versions; decoding the
first 30 seconds of bourne.rmvb is 3% faster overall: 57.845s with NEON vs.
59.627s without (fastest of 10 runs each) on a PandaBoard.
---
libavcodec/arm/Makefile | 3 +
libavcodec/arm/rv40dsp_init_neon.c | 42 +++++
libavcodec/arm/rv40dsp_neon.S | 319 ++++++++++++++++++++++++++++++++++++
libavcodec/rv34dsp.h | 1 +
libavcodec/rv40dsp.c | 2 +
5 files changed, 367 insertions(+), 0 deletions(-)
create mode 100644 libavcodec/arm/rv40dsp_init_neon.c
create mode 100644 libavcodec/arm/rv40dsp_neon.S
diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 3374f0e..d6a6961 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -62,6 +62,8 @@ NEON-OBJS-$(CONFIG_AC3DSP) += arm/ac3dsp_neon.o
NEON-OBJS-$(CONFIG_DCA_DECODER) += arm/dcadsp_neon.o \
arm/synth_filter_neon.o \
+NEON-OBJS-$(CONFIG_RV40_DECODER) += arm/rv40dsp_neon.o \
+
NEON-OBJS-$(CONFIG_VP3_DECODER) += arm/vp3dsp_neon.o
NEON-OBJS-$(CONFIG_VP5_DECODER) += arm/vp56dsp_neon.o \
@@ -77,5 +79,6 @@ OBJS-$(HAVE_NEON) +=
arm/dsputil_init_neon.o \
arm/fmtconvert_neon.o \
arm/int_neon.o \
arm/mpegvideo_neon.o \
+ arm/rv40dsp_init_neon.o \
arm/simple_idct_neon.o \
$(NEON-OBJS-yes)
diff --git a/libavcodec/arm/rv40dsp_init_neon.c
b/libavcodec/arm/rv40dsp_init_neon.c
new file mode 100644
index 0000000..6b34a00
--- /dev/null
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@@ -0,0 +1,42 @@
+/*
+ * ARM NEON optimised DSP functions
+ * Copyright (c) 2011 Janne Grunau <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavcodec/avcodec.h"
+#include "libavcodec/rv34dsp.h"
+
+void ff_put_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y);
+void ff_put_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y);
+/* All four are implemented in rv40dsp_neon.S; the avg variants average the result with the bytes already in dst. */
+void ff_avg_rv40_chroma_mc8_neon(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y);
+void ff_avg_rv40_chroma_mc4_neon(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y);
+
+/* Install the NEON RV40 chroma motion-compensation routines into c; dsp is not used. */
+void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp)
+{
+ /* Slot 0 of each table takes the mc8 routine, slot 1 the mc4 routine. */
+ c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_neon;
+ c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_neon;
+
+ c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
+ c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
+}
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
new file mode 100644
index 0000000..00db28d
--- /dev/null
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -0,0 +1,319 @@
+/*
+ * Copyright (c) 2008 Mans Rullgard <[email protected]>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "asm.S"
+
+const rv40bias
+ .short 0,16,32,16,32,28,32,28,0,32,16,32,32,28,32,28 /* 16 halfwords read as 4 rows of 4: the chroma MC code below fetches the entry at byte offset (y >> 1) * 8 + (x >> 1) * 2 */
+endconst
+
+/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y): 8 bytes per row, 2 rows per loop pass; each output byte is a weighted sum of a 2x2 src neighbourhood plus an rv40bias entry, shifted right by 6 */
+ .macro rv40_chroma_mc8 type
+function ff_\type\()_rv40_chroma_mc8_neon, export=1
+ push {r4-r7, lr} /* five registers = 20 bytes, so the stack arguments now start at sp + 20 */
+ ldrd r4, [sp, #20] /* r4 = x, r5 = y */
+.ifc \type,avg
+ mov lr, r0 /* avg only: lr walks dst for the reads, r0 for the stores */
+.endif
+ pld [r1]
+ pld [r1, r2]
+
+ movrel r6, rv40bias
+ mov r7, r5, lsr #1
+ add r6, r6, r7, lsl #3 /* + (y >> 1) * 8 bytes */
+ mov r7, r4, lsr #1
+ add r6, r6, r7, lsl #1 /* + (x >> 1) * 2 bytes: r6 = address of the bias entry */
+A muls r7, r4, r5 /* r7 = x * y; the zero flag is consumed by the beq 2f below */
+T mul r7, r4, r5
+T cmp r7, #0
+ vld1.16 {d22[],d23[]}, [r6] /* q11 = the bias entry copied into all eight 16-bit lanes */
+ rsb r6, r7, r5, lsl #3 /* r6 = 8*y - x*y */
+ rsb ip, r7, r4, lsl #3 /* ip = 8*x - x*y */
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64 /* r4 = x*y - 8*x - 8*y + 64 */
+
+ beq 2f /* x * y == 0: use the two-tap paths at labels 2..5 */
+
+ add r5, r1, r2 /* r5 = src + stride; r1 and r5 each advance two rows per load */
+
+ vdup.8 d0, r4 /* d0..d3 = weights for the pixel, its right neighbour, the pixel below, below-right */
+ lsl r4, r2, #1 /* r4 = 2 * stride */
+ vdup.8 d1, ip
+ vld1.64 {d4, d5}, [r1], r4
+ vdup.8 d2, r6
+ vld1.64 {d6, d7}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1 /* d5 = bytes 1..8 of the 16 loaded, i.e. each lane's right neighbour */
+ vext.8 d7, d6, d7, #1
+
+1: pld [r5]
+ vmull.u8 q8, d4, d0 /* q8 accumulates the first output row */
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r1], r4 /* d4/d5 are consumed: fetch the row two below */
+ vmlal.u8 q8, d6, d2
+ vext.8 d5, d4, d5, #1
+ vmlal.u8 q8, d7, d3
+ vmull.u8 q9, d6, d0 /* q9 accumulates the second output row */
+ subs r3, r3, #2 /* two rows of h done per pass */
+ vmlal.u8 q9, d7, d1
+ vadd.i16 q8, q8, q11 /* add the bias before shifting */
+ vmlal.u8 q9, d4, d2
+ vmlal.u8 q9, d5, d3
+ vshrn.u16 d16, q8, #6 /* >> 6, narrowed to bytes */
+ vadd.i16 q9, q9, q11
+ vld1.64 {d6, d7}, [r5], r4
+ pld [r1]
+ vshrn.u16 d17, q9, #6
+.ifc \type,avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10 /* avg only: rounding average with the two dst rows just read */
+.endif
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 1b /* repeat while the count left by the subs above is positive */
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6 /* x*y == 0 here, so r6 = 8*y: zero flag set means y == 0 */
+ add ip, ip, r6 /* ip = 8*x + 8*y */
+ vdup.8 d0, r4 /* d0 = 64 - 8*x - 8*y */
+ vdup.8 d1, ip
+
+ beq 4f /* y == 0: horizontal two-tap path */
+
+ add r5, r1, r2
+ lsl r4, r2, #1
+ vld1.64 {d4}, [r1], r4
+ vld1.64 {d6}, [r5], r4
+
+3: pld [r5] /* vertical path: each row is blended with the row below it */
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d1
+ vld1.64 {d4}, [r1], r4
+ vmull.u8 q9, d6, d0
+ vadd.i16 q8, q8, q11
+ vmlal.u8 q9, d4, d1
+ vld1.64 {d6}, [r5], r4
+ vadd.i16 q9, q9, q11
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+.ifc \type,avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ subs r3, r3, #2
+ pld [r1]
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.64 {d4, d5}, [r1], r2 /* horizontal path: r1 alone walks the rows, one stride per load */
+ vld1.64 {d6, d7}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+
+5: pld [r1]
+ subs r3, r3, #2
+ vmull.u8 q8, d4, d0 /* each pixel is blended with its right neighbour */
+ vmlal.u8 q8, d5, d1
+ vld1.64 {d4, d5}, [r1], r2
+ vmull.u8 q9, d6, d0
+ vadd.i16 q8, q8, q11
+ vmlal.u8 q9, d7, d1
+ pld [r1]
+ vext.8 d5, d4, d5, #1
+ vadd.i16 q9, q9, q11
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
+.ifc \type,avg
+ vld1.64 {d20}, [lr,:64], r2
+ vld1.64 {d21}, [lr,:64], r2
+ vrhadd.u8 q8, q8, q10
+.endif
+ vld1.64 {d6, d7}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vst1.64 {d16}, [r0,:64], r2
+ vst1.64 {d17}, [r0,:64], r2
+ bgt 5b
+
+ pop {r4-r7, pc}
+endfunc
+ .endm
+
+/* chroma_mc4(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y): 4 bytes per row, 2 rows per loop pass; each output byte is a weighted sum of a 2x2 src neighbourhood plus an rv40bias entry, shifted right by 6 */
+ .macro rv40_chroma_mc4 type
+function ff_\type\()_rv40_chroma_mc4_neon, export=1
+ push {r4-r7, lr} /* five registers = 20 bytes, so the stack arguments now start at sp + 20 */
+ ldrd r4, [sp, #20] /* r4 = x, r5 = y */
+.ifc \type,avg
+ mov lr, r0 /* avg only: lr walks dst for the reads, r0 for the stores */
+.endif
+ pld [r1]
+ pld [r1, r2]
+
+ movrel r6, rv40bias
+ mov r7, r5, lsr #1
+ add r6, r6, r7, lsl #3 /* + (y >> 1) * 8 bytes */
+ mov r7, r4, lsr #1
+ add r6, r6, r7, lsl #1 /* + (x >> 1) * 2 bytes: r6 = address of the bias entry */
+A muls r7, r4, r5 /* r7 = x * y; the zero flag is consumed by the beq 2f below */
+T mul r7, r4, r5
+T cmp r7, #0
+ vld1.16 {d22[],d23[]}, [r6] /* q11 = the bias entry copied into all eight 16-bit lanes */
+ rsb r6, r7, r5, lsl #3 /* r6 = 8*y - x*y */
+ rsb ip, r7, r4, lsl #3 /* ip = 8*x - x*y */
+ sub r4, r7, r4, lsl #3
+ sub r4, r4, r5, lsl #3
+ add r4, r4, #64 /* r4 = x*y - 8*x - 8*y + 64 */
+
+ beq 2f /* x * y == 0: use the two-tap paths at labels 2..5 */
+
+ add r5, r1, r2 /* r5 = src + stride; r1 and r5 each advance two rows per load */
+
+ vdup.8 d0, r4
+ lsl r4, r2, #1 /* r4 = 2 * stride */
+ vdup.8 d1, ip
+ vld1.64 {d4}, [r1], r4
+ vdup.8 d2, r6
+ vld1.64 {d6}, [r5], r4
+ vdup.8 d3, r7
+
+ vext.8 d5, d4, d5, #1 /* bytes 1..7 of d4 followed by byte 0 of the previous d5 */
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5 /* d4 = src bytes 0..3 in the low word, bytes 1..4 in the high word */
+ vtrn.32 d6, d7
+
+ vtrn.32 d0, d1 /* d0 = pixel weight in the low word, right-neighbour weight in the high word */
+ vtrn.32 d2, d3 /* d2 = the matching weight pair for the row below */
+
+1: pld [r5]
+ vmull.u8 q8, d4, d0
+ vmlal.u8 q8, d6, d2 /* d16 = left-column products, d17 = right-column products of the first row */
+ vld1.64 {d4}, [r1], r4
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vmull.u8 q9, d6, d0
+ vmlal.u8 q9, d4, d2 /* d18/d19 = the same two halves for the second row */
+ vld1.64 {d6}, [r5], r4
+ vadd.i16 d16, d16, d17 /* first output row: sum of the two halves */
+ vadd.i16 d17, d18, d19 /* second output row */
+ vadd.i16 q8, q8, q11 /* add the bias before shifting */
+ vshrn.u16 d16, q8, #6 /* d16 = two 4-byte output rows */
+ subs r3, r3, #2 /* two rows of h done per pass */
+ pld [r1]
+.ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20 /* avg only: rounding average with the two dst rows just read */
+.endif
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 1b /* repeat while the count left by the subs above is positive */
+
+ pop {r4-r7, pc}
+
+2: tst r6, r6 /* x*y == 0 here, so r6 = 8*y: zero flag set means y == 0 */
+ add ip, ip, r6 /* ip = 8*x + 8*y */
+ vdup.8 d0, r4 /* 64 - 8*x - 8*y in every byte */
+ vdup.8 d1, ip
+ vtrn.32 d0, d1 /* d0 = first weight in the low word, second weight in the high word */
+
+ beq 4f /* y == 0: horizontal two-tap path */
+
+ vext.32 d1, d0, d1, #1 /* d1 = the same two weights with the words swapped */
+ add r5, r1, r2
+ lsl r4, r2, #1
+ vld1.32 {d4[0]}, [r1], r4 /* d4 = one row in the low word, the row below it in the high word */
+ vld1.32 {d4[1]}, [r5], r4
+
+3: pld [r5]
+ vmull.u8 q8, d4, d0
+ vld1.32 {d4[0]}, [r1], r4 /* low word now holds the row two below; high word is unchanged */
+ vmull.u8 q9, d4, d1 /* swapped weights match the swapped row order in d4 */
+ vld1.32 {d4[1]}, [r5], r4
+ vadd.i16 d16, d16, d17
+ vadd.i16 d17, d18, d19
+ vadd.i16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
+.ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ subs r3, r3, #2
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 3b
+
+ pop {r4-r7, pc}
+
+4: vld1.64 {d4}, [r1], r2 /* horizontal path: r1 alone walks the rows, one stride per load */
+ vld1.64 {d6}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d4, d5 /* d4 = src bytes 0..3 in the low word, bytes 1..4 in the high word */
+ vtrn.32 d6, d7
+
+5: vmull.u8 q8, d4, d0
+ vmull.u8 q9, d6, d0
+ subs r3, r3, #2
+ vld1.64 {d4}, [r1], r2
+ vext.8 d5, d4, d5, #1
+ vtrn.32 d4, d5
+ vadd.i16 d16, d16, d17 /* pixel term + right-neighbour term, first row */
+ vadd.i16 d17, d18, d19 /* the same for the second row */
+ vadd.i16 q8, q8, q11
+ pld [r1]
+ vshrn.u16 d16, q8, #6
+.ifc \type,avg
+ vld1.32 {d20[0]}, [lr,:32], r2
+ vld1.32 {d20[1]}, [lr,:32], r2
+ vrhadd.u8 d16, d16, d20
+.endif
+ vld1.64 {d6}, [r1], r2
+ vext.8 d7, d6, d7, #1
+ vtrn.32 d6, d7
+ pld [r1]
+ vst1.32 {d16[0]}, [r0,:32], r2
+ vst1.32 {d16[1]}, [r0,:32], r2
+ bgt 5b
+
+ pop {r4-r7, pc}
+endfunc
+ .endm
+
+
+ .text
+ .align
+/* Expand both macros for put and avg, emitting ff_{put,avg}_rv40_chroma_mc{8,4}_neon. */
+ rv40_chroma_mc8 put
+ rv40_chroma_mc8 avg
+ rv40_chroma_mc4 put
+ rv40_chroma_mc4 avg
diff --git a/libavcodec/rv34dsp.h b/libavcodec/rv34dsp.h
index e1def7d..935b3d9 100644
--- a/libavcodec/rv34dsp.h
+++ b/libavcodec/rv34dsp.h
@@ -46,5 +46,6 @@ void ff_rv30dsp_init(RV34DSPContext *c, DSPContext* dsp);
void ff_rv40dsp_init(RV34DSPContext *c, DSPContext* dsp);
void ff_rv40dsp_init_x86(RV34DSPContext *c, DSPContext *dsp);
+void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext *dsp);
#endif /* AVCODEC_RV34DSP_H */
diff --git a/libavcodec/rv40dsp.c b/libavcodec/rv40dsp.c
index c54f965..a9e7005 100644
--- a/libavcodec/rv40dsp.c
+++ b/libavcodec/rv40dsp.c
@@ -370,4 +370,6 @@ av_cold void ff_rv40dsp_init(RV34DSPContext *c, DSPContext*
dsp) {
if (HAVE_MMX)
ff_rv40dsp_init_x86(c, dsp);
+ if (HAVE_NEON)
+ ff_rv40dsp_init_neon(c, dsp);
}
--
1.7.7
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel