[libav-devel] [PATCH 4/6] rv40: NEON optimised rv40_weight_func[8|16]

Janne Grunau Tue, 04 Oct 2011 13:32:30 -0700

The weight functions themselves are about 3 times faster; overall decoding is not much faster.
---
 libavcodec/arm/rv40dsp_init_neon.c |    6 ++
 libavcodec/arm/rv40dsp_neon.S      |   98 ++++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+), 0 deletions(-)


diff --git a/libavcodec/arm/rv40dsp_init_neon.c b/libavcodec/arm/rv40dsp_init_neon.c
index 81c2e13..b976e7b 100644
--- a/libavcodec/arm/rv40dsp_init_neon.c
+++ b/libavcodec/arm/rv40dsp_init_neon.c
@@ -30,6 +30,9 @@ void ff_put_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_avg_rv40_chroma_mc8_neon(uint8_t *, uint8_t *, int, int, int, int);
 void ff_avg_rv40_chroma_mc4_neon(uint8_t *, uint8_t *, int, int, int, int);
 
+void ff_rv40_weight_func_16_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int); /* dst, src1, src2, w1, w2, stride */
+void ff_rv40_weight_func_8_neon(uint8_t *, uint8_t *, uint8_t *, int, int, int); /* dst, src1, src2, w1, w2, stride */
+
 void ff_rv34_inv_transform_neon(DCTELEM *block);
 void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
 
@@ -41,6 +44,9 @@ void ff_rv40dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
     c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_neon;
     c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_neon;
 
+    c->rv40_weight_pixels_tab[0] = ff_rv40_weight_func_16_neon;
+    c->rv40_weight_pixels_tab[1] = ff_rv40_weight_func_8_neon;
+
     c->rv34_inv_transform_tab[0] = ff_rv34_inv_transform_neon;
     c->rv34_inv_transform_tab[1] = ff_rv34_inv_transform_noround_neon;
 
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 00db28d..181c491 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2008 Mans Rullgard <[email protected]>
+ * Copyright (c) 2011 Janne Grunau <[email protected]>
  *
  * This file is part of Libav.
  *
@@ -317,3 +318,100 @@ endfunc
         rv40_chroma_mc8 avg
         rv40_chroma_mc4 put
         rv40_chroma_mc4 avg
+
+
+/* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride): for 16 rows of 16 bytes, dst[i] = (((src1[i] * w2) >> 9) + ((src2[i] * w1) >> 9) + 16) >> 5, low 8 bits stored */
+function ff_rv40_weight_func_16_neon, export=1
+        push            {r4-r5, lr}                 @ save r4, r5 and lr (3 words = 12 bytes)
+        ldrd            r4,       [sp, #12]         @ r4 = w2, r5 = stride: the two words just above the 12 pushed bytes
+        pld             [r1]                        @ preload hint for src1
+        pld             [r2]                        @ preload hint for src2
+        vmov.s16        d0[0],    r3                @ d0[0] = w1
+        mov             r3,       #16               @ w1 is in d0 now; r3 becomes the row counter
+        vmov.s16        d0[1],    r4                @ d0[1] = w2
+1:
+        vld1.64         {d2,d3},  [r1],   r5        @ d2/d3 = 16 bytes of src1; src1 += stride
+        vld1.64         {d4,d5},  [r2],   r5        @ d4/d5 = 16 bytes of src2; src2 += stride
+        vmovl.u8        q8,       d2                @ q8/q9 = src1 row widened to 16 bit
+        vmovl.u8        q9,       d3
+        vmovl.u8        q10,      d4                @ q10/q11 = src2 row widened to 16 bit
+        vmovl.u8        q11,      d5
+        vmull.u16       q2,       d16,    d0[1]     @ q2, q3, q8, q9 = src1 * w2 as 32-bit products
+        vmull.u16       q3,       d17,    d0[1]
+        vmull.u16       q8,       d18,    d0[1]     @ overwrites q8 (d16/d17), whose last reads are the two lines above
+        vmull.u16       q9,       d19,    d0[1]     @ overwrites q9 (d18/d19); d18 was read on the previous line
+        vmull.u16       q12,      d20,    d0[0]     @ q12..q15 = src2 * w1 as 32-bit products
+        vmull.u16       q13,      d21,    d0[0]
+        vmull.u16       q14,      d22,    d0[0]
+        vmull.u16       q15,      d23,    d0[0]
+        vshrn.i32       d4,       q2,     #9        @ d4..d7 (q2/q3) = (src1 * w2) >> 9, narrowed to 16 bit
+        vshrn.i32       d5,       q3,     #9
+        vshrn.i32       d6,       q8,     #9
+        vshrn.i32       d7,       q9,     #9
+        vshrn.i32       d16,      q12,    #9        @ d16..d19 (q8/q9) = (src2 * w1) >> 9, narrowed to 16 bit
+        vshrn.i32       d17,      q13,    #9
+        vshrn.i32       d18,      q14,    #9
+        vshrn.i32       d19,      q15,    #9
+        vadd.u16        q2,       q2,     q8        @ sum of both weighted terms, pixels 0-7
+        vadd.u16        q3,       q3,     q9        @ sum of both weighted terms, pixels 8-15
+        vrshrn.i16      d2,       q2,     #5        @ (sum + 16) >> 5, narrowed to 8 bit without saturation
+        vrshrn.i16      d3,       q3,     #5
+        subs            r3,       r3,     #1        @ one row done; sets the flags tested by bne below
+        vst1.64         {d2,d3},  [r0],   r5        @ store 16 result bytes; dst += stride
+        bne 1b
+        pop             {r4-r5, pc}                 @ restore r4, r5 and return (saved lr is popped into pc)
+endfunc
+
+/* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride): for 8 rows of 8 bytes, dst[i] = (((src1[i] * w2) >> 9) + ((src2[i] * w1) >> 9) + 16) >> 5, low 8 bits stored */
+function ff_rv40_weight_func_8_neon, export=1
+                                                    @ r0 = dest, r1 = src1, r2 = src2, r3 = w1
+        push            {r4-r5, lr}                 @ save r4, r5 and lr (3 words = 12 bytes)
+        ldrd            r4,       [sp, #12]         @ r4 = w2, r5 = stride (the two words just above the 12 pushed bytes)
+        pld             [r1]                        @ preload hint for src1
+        pld             [r2]                        @ preload hint for src2
+        vmov.s16        d0[0],    r3                @ d0[0] = w1
+        mov             r3,       #4                @ r3 becomes the loop counter: 4 iterations, two rows each
+        vmov.s16        d0[1],    r4                @ d0[1] = w2
+1:
+        vld1.8          {d2},     [r1],  r5         @ d2 = src1, first row of the pair;  src1 += stride
+        vld1.8          {d4},     [r1],  r5         @ d4 = src1, second row of the pair; src1 += stride
+        vld1.8          {d3},     [r2],  r5         @ d3 = src2, first row of the pair;  src2 += stride
+        vld1.8          {d5},     [r2],  r5         @ d5 = src2, second row of the pair; src2 += stride

+        vmovl.u8        q8,       d2                @ q8  = src1 first row, widened to 16 bit
+        vmovl.u8        q10,      d4                @ q10 = src1 second row, widened to 16 bit
+        vmovl.u8        q9,       d3                @ q9  = src2 first row, widened to 16 bit
+        vmovl.u8        q11,      d5                @ q11 = src2 second row, widened to 16 bit

+        vmull.u16       q12,      d16,   d0[1]      @ first row: q12/q13 = src1 * w2 as 32-bit products
+        vmull.u16       q13,      d17,   d0[1]
+        vmull.u16       q14,      d18,   d0[0]      @ first row: q14/q15 = src2 * w1 as 32-bit products
+        vmull.u16       q15,      d19,   d0[0]

+        vshrn.i32       d2,       q12,   #9         @ q1 (d2/d3) = (src1 * w2) >> 9 for the first row
+        vshrn.i32       d3,       q13,   #9
+        vshrn.i32       d6,       q14,   #9         @ q3 (d6/d7) = (src2 * w1) >> 9 for the first row
+        vshrn.i32       d7,       q15,   #9

+        vmull.u16       q2,       d20,   d0[1]      @ second row: q2/q8 = src1 * w2 as 32-bit products
+        vmull.u16       q8,       d21,   d0[1]
+        vmull.u16       q9,       d22,   d0[0]      @ second row: q9/q10 = src2 * w1 as 32-bit products
+        vmull.u16       q10,      d23,   d0[0]      @ overwrites q10 (d20/d21), whose last reads are three and two lines above

+        vshrn.i32       d24,      q2,    #9         @ q12 (d24/d25) = (src1 * w2) >> 9 for the second row
+        vshrn.i32       d25,      q8,    #9
+        vshrn.i32       d4,       q9,    #9         @ q2 (d4/d5) = (src2 * w1) >> 9 for the second row; old q2 was consumed above
+        vshrn.i32       d5,       q10,   #9

+        vadd.u16        q1,       q1,    q3         @ first row: sum of both weighted terms
+        vadd.u16        q2,       q2,    q12        @ second row: sum of both weighted terms
+        vrshrn.i16      d1,       q1,    #5         @ d1 = (sum + 16) >> 5 for the first row, narrowed to 8 bit; d1 is the upper half of q0, the weights in d0 are untouched
+        vrshrn.i16      d2,       q2,    #5         @ d2 = same for the second row; reuses the low half of q1, already consumed on the previous line

+        subs            r3,       r3,    #1         @ one row pair done; sets the flags tested by bne below

+        vst1.8          {d1},     [r0],  r5         @ store first result row;  dst += stride
+        vst1.8          {d2},     [r0],  r5         @ store second result row; dst += stride
+        bne 1b
+        pop             {r4-r5, pc}                 @ restore r4, r5 and return (saved lr is popped into pc)
+endfunc
-- 
1.7.7

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

[libav-devel] [PATCH 4/6] rv40: NEON optimised rv40_weight_func[8|16]

Reply via email to