3 times faster, overall not much faster
---
libavcodec/arm/rv40dsp_init_neon.c | 6 ++
libavcodec/arm/rv40dsp_neon.S | 98 ++++++++++++++++++++++++++++++++++++
2 files changed, 104 insertions(+), 0 deletions(-)
diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c
index 81c2e13..b976e7b 100644
--- a/libavcodec/arm/rv40dsp_init_neon.c
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@@ -30,6 +30,9 @@ void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
+void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int);
+void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int);
+
void ff_rv34_inv_transform_neon(DCTELEM *block);
void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
@@ -41,6 +44,9 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
+ c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon;
+ c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon;
+
c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 00db28d..181c491 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -1,5 +1,6 @@
/*
* Copyright (c) 2008 Mans Rullgard <[email protected]>
+ * Copyright (c) 2011 Janne Grunau <[email protected]>
*
* This file is part of Libav.
*
@@ -317,3 +318,100 @@ endfunc
rv40_chroma_mc8 avg
rv40_chroma_mc4 put
rv40_chroma_mc4 avg
+
+
+/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) */
+function ff_rv40_weight_func_16_neon, export=1
+ push {r4-r5, lr}  @ r0 = dst, r1 = src1, r2 = src2, r3 = w1; w2 and stride are on the stack
+ ldrd r4, [sp, #12]  @ r4 = w2, r5 = stride (skip the 12 bytes just pushed)
+ pld [r1]
+ pld [r2]
+ vmov.s16 d0[0], r3  @ d0[0] = w1
+ mov r3, #16  @ r3 is reused as the row counter: 16 rows
+ vmov.s16 d0[1], r4  @ d0[1] = w2
+1:
+ vld1.64 {d2,d3}, [r1], r5  @ load 16 bytes of src1, then src1 += stride
+ vld1.64 {d4,d5}, [r2], r5  @ load 16 bytes of src2, then src2 += stride
+ vmovl.u8 q8, d2  @ q8, q9 = src1 row widened to 16 bits
+ vmovl.u8 q9, d3
+ vmovl.u8 q10, d4  @ q10, q11 = src2 row widened to 16 bits
+ vmovl.u8 q11, d5
+ vmull.u16 q2, d16, d0[1]  @ 32-bit products src1 * w2 go to q2, q3, q8, q9
+ vmull.u16 q3, d17, d0[1]
+ vmull.u16 q8, d18, d0[1]  @ q8 (d16/d17) is overwritten only after both halves were read above
+ vmull.u16 q9, d19, d0[1]
+ vmull.u16 q12, d20, d0[0]  @ 32-bit products src2 * w1 go to q12-q15
+ vmull.u16 q13, d21, d0[0]
+ vmull.u16 q14, d22, d0[0]
+ vmull.u16 q15, d23, d0[0]
+ vshrn.i32 d4, q2, #9  @ q2, q3 = (src1 * w2) >> 9, narrowed to 16 bits
+ vshrn.i32 d5, q3, #9
+ vshrn.i32 d6, q8, #9
+ vshrn.i32 d7, q9, #9
+ vshrn.i32 d16, q12, #9  @ q8, q9 = (src2 * w1) >> 9, narrowed to 16 bits
+ vshrn.i32 d17, q13, #9
+ vshrn.i32 d18, q14, #9
+ vshrn.i32 d19, q15, #9
+ vadd.u16 q2, q2, q8  @ add the two weighted terms
+ vadd.u16 q3, q3, q9
+ vrshrn.i16 d2, q2, #5  @ rounding shift right by 5, narrowed back to bytes in d2, d3
+ vrshrn.i16 d3, q3, #5
+ subs r3, r3, #1  @ one row done; the flags set here are tested by bne below
+ vst1.64 {d2,d3}, [r0], r5  @ store 16 result bytes, then dst += stride
+ bne 1b
+ pop {r4-r5, pc}  @ restore r4-r5 and return by popping the saved lr into pc
+endfunc
+
+/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride) */
+function ff_rv40_weight_func_8_neon, export=1
+ @ r0 = dest, r1 = src1, r2 = src2, r3 = w1
+ push {r4-r5, lr}  @ 12 bytes pushed, so the stack arguments now start at sp+12
+ ldrd r4, [sp, #12] @ r4 = w2, r5 = stride
+ pld [r1]
+ pld [r2]
+ vmov.s16 d0[0], r3  @ d0[0] = w1
+ mov r3, #4  @ r3 is reused as the loop counter: 4 iterations of 2 rows each
+ vmov.s16 d0[1], r4  @ d0[1] = w2
+1:
+ vld1.8 {d2}, [r1], r5  @ d2 = src1 row A (8 bytes), then src1 += stride
+ vld1.8 {d4}, [r1], r5  @ d4 = src1 row B
+ vld1.8 {d3}, [r2], r5  @ d3 = src2 row A, then src2 += stride
+ vld1.8 {d5}, [r2], r5  @ d5 = src2 row B
+
+ vmovl.u8 q8, d2  @ widen all four rows to 16 bits: q8 = src1 A, q10 = src1 B
+ vmovl.u8 q10, d4
+ vmovl.u8 q9, d3  @ q9 = src2 A, q11 = src2 B
+ vmovl.u8 q11, d5
+
+ vmull.u16 q12, d16, d0[1]  @ row A: q12, q13 = src1 * w2 (32-bit products)
+ vmull.u16 q13, d17, d0[1]
+ vmull.u16 q14, d18, d0[0]  @ row A: q14, q15 = src2 * w1
+ vmull.u16 q15, d19, d0[0]
+
+ vshrn.i32 d2, q12, #9  @ row A: q1 = (src1 * w2) >> 9, narrowed to 16 bits
+ vshrn.i32 d3, q13, #9
+ vshrn.i32 d6, q14, #9  @ row A: q3 = (src2 * w1) >> 9, narrowed to 16 bits
+ vshrn.i32 d7, q15, #9
+
+ vmull.u16 q2, d20, d0[1]  @ row B: q2, q8 = src1 * w2
+ vmull.u16 q8, d21, d0[1]
+ vmull.u16 q9, d22, d0[0]  @ row B: q9, q10 = src2 * w1 (q10 is overwritten after d20/d21 were read)
+ vmull.u16 q10, d23, d0[0]
+
+ vshrn.i32 d24, q2, #9  @ row B: q12 = (src1 * w2) >> 9, narrowed to 16 bits
+ vshrn.i32 d25, q8, #9
+ vshrn.i32 d4, q9, #9  @ row B: q2 = (src2 * w1) >> 9; q2 was already consumed above
+ vshrn.i32 d5, q10, #9
+
+ vadd.u16 q1, q1, q3  @ row A: add the two weighted terms
+ vadd.u16 q2, q2, q12  @ row B: add the two weighted terms
+ vrshrn.i16 d1, q1, #5  @ row A: rounding shift right by 5 into d1 (d0 with the weights is untouched)
+ vrshrn.i16 d2, q2, #5  @ row B: rounding shift right by 5 into d2
+
+ subs r3, r3, #1  @ two rows done; the flags set here are tested by bne below
+
+ vst1.8 {d1}, [r0], r5  @ store row A (8 bytes), then dst += stride
+ vst1.8 {d2}, [r0], r5  @ store row B, then dst += stride
+ bne 1b
+ pop {r4-r5, pc}  @ restore r4-r5 and return by popping the saved lr into pc
+endfunc
--
1.7.7
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel