Instructions reordered to improve scheduling; no noticeable difference
on OMAP4. I've also fixed a copy-and-paste error in the other patch:
I had failed to rename the dsp context init function from _x86 to _neon.
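
For reviewers, a scalar sketch of the rounding change below (names are
illustrative, not taken from the actual sources): the old code used
vrshrn.u16, which adds an implicit 32 before the >>6; the new code adds
a per-position bias from rv40bias and then truncates with vshrn.u16.

    static const unsigned short rv40bias[4][4] = {
        {  0, 16, 32, 16 },
        { 32, 28, 32, 28 },
        {  0, 32, 16, 32 },
        { 32, 28, 32, 28 },
    };

    /* per-pixel rounding: old = (sum + 32) >> 6
     *                     new = (sum + rv40bias[y>>1][x>>1]) >> 6 */
    static unsigned char round_pixel(unsigned sum, int x, int y)
    {
        return (sum + rv40bias[y >> 1][x >> 1]) >> 6;
    }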
---8<---
Temporary commit to make reviewing easier; it will be squashed into the
previous commit before pushing.
---
 libavcodec/arm/rv40dsp_neon.S |   42 ++++++++++++++++++++++++++++++++--------
 1 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 3a465a2..00db28d 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -20,6 +20,9 @@
 
 #include "asm.S"
 
+const   rv40bias                                @ 4x4 rounding bias, indexed [y>>1][x>>1]
+        .short          0,16,32,16,32,28,32,28,0,32,16,32,32,28,32,28
+endconst
 
 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
         .macro  rv40_chroma_mc8 type
@@ -32,9 +35,15 @@ function ff_\type\()_rv40_chroma_mc8_neon, export=1
         pld             [r1]
         pld             [r1, r2]
 
+        movrel          r6,  rv40bias
+        mov             r7,  r5,  lsr #1        @ y >> 1
+        add             r6,  r6,  r7,  lsl #3   @ + (y>>1) * 8 bytes (one row of 4 .short)
+        mov             r7,  r4,  lsr #1        @ x >> 1
+        add             r6,  r6,  r7,  lsl #1   @ + (x>>1) * 2 bytes
 A       muls            r7,  r4,  r5
 T       mul             r7,  r4,  r5
 T       cmp             r7,  #0
+        vld1.16         {d22[],d23[]}, [r6]     @ splat the bias across all lanes of q11
         rsb             r6,  r7,  r5,  lsl #3
         rsb             ip,  r7,  r4,  lsl #3
         sub             r4,  r7,  r4,  lsl #3
@@ -66,12 +75,14 @@ T       cmp             r7,  #0
         vmull.u8        q9,  d6,  d0
         subs            r3,  r3,  #2
         vmlal.u8        q9,  d7,  d1
+        vadd.i16        q8,  q8,  q11           @ add bias before the truncating shift
         vmlal.u8        q9,  d4,  d2
         vmlal.u8        q9,  d5,  d3
-        vrshrn.u16      d16, q8,  #6
+        vshrn.u16       d16, q8,  #6            @ plain shift; rounding now comes from the bias
+        vadd.i16        q9,  q9,  q11
         vld1.64         {d6, d7}, [r5], r4
         pld             [r1]
-        vrshrn.u16      d17, q9,  #6
+        vshrn.u16       d17, q9,  #6
 .ifc \type,avg
         vld1.64         {d20}, [lr,:64], r2
         vld1.64         {d21}, [lr,:64], r2
@@ -101,10 +112,12 @@ T       cmp             r7,  #0
         vmlal.u8        q8,  d6,  d1
         vld1.64         {d4}, [r1], r4
         vmull.u8        q9,  d6,  d0
+        vadd.i16        q8,  q8,  q11
         vmlal.u8        q9,  d4,  d1
         vld1.64         {d6}, [r5], r4
-        vrshrn.u16      d16, q8,  #6
-        vrshrn.u16      d17, q9,  #6
+        vadd.i16        q9,  q9,  q11
+        vshrn.u16       d16, q8,  #6
+        vshrn.u16       d17, q9,  #6
 .ifc \type,avg
         vld1.64         {d20}, [lr,:64], r2
         vld1.64         {d21}, [lr,:64], r2
@@ -129,11 +142,13 @@ T       cmp             r7,  #0
         vmlal.u8        q8,  d5,  d1
         vld1.64         {d4, d5}, [r1], r2
         vmull.u8        q9,  d6,  d0
+        vadd.i16        q8,  q8,  q11
         vmlal.u8        q9,  d7,  d1
         pld             [r1]
         vext.8          d5,  d4,  d5,  #1
-        vrshrn.u16      d16, q8,  #6
-        vrshrn.u16      d17, q9,  #6
+        vadd.i16        q9,  q9,  q11
+        vshrn.u16       d16, q8,  #6
+        vshrn.u16       d17, q9,  #6
 .ifc \type,avg
         vld1.64         {d20}, [lr,:64], r2
         vld1.64         {d21}, [lr,:64], r2
@@ -160,9 +175,15 @@ function ff_\type\()_rv40_chroma_mc4_neon, export=1
         pld             [r1]
         pld             [r1, r2]
 
+        movrel          r6,  rv40bias           @ same bias lookup as in mc8 above
+        mov             r7,  r5,  lsr #1
+        add             r6,  r6,  r7,  lsl #3
+        mov             r7,  r4,  lsr #1
+        add             r6,  r6,  r7,  lsl #1
 A       muls            r7,  r4,  r5
 T       mul             r7,  r4,  r5
 T       cmp             r7,  #0
+        vld1.16         {d22[],d23[]}, [r6]
         rsb             r6,  r7,  r5,  lsl #3
         rsb             ip,  r7,  r4,  lsl #3
         sub             r4,  r7,  r4,  lsl #3
@@ -200,7 +221,8 @@ T       cmp             r7,  #0
         vld1.64         {d6},     [r5], r4
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
-        vrshrn.u16      d16, q8,  #6
+        vadd.i16        q8,  q8,  q11
+        vshrn.u16       d16, q8,  #6
         subs            r3,  r3,  #2
         pld             [r1]
 .ifc \type,avg
@@ -237,7 +259,8 @@ T       cmp             r7,  #0
         vld1.32         {d4[1]},  [r5], r4
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
-        vrshrn.u16      d16, q8,  #6
+        vadd.i16        q8,  q8,  q11
+        vshrn.u16       d16, q8,  #6
 .ifc \type,avg
         vld1.32         {d20[0]}, [lr,:32], r2
         vld1.32         {d20[1]}, [lr,:32], r2
@@ -266,8 +289,9 @@ T       cmp             r7,  #0
         vtrn.32         d4,  d5
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
+        vadd.i16        q8,  q8,  q11
         pld             [r1]
-        vrshrn.u16      d16, q8,  #6
+        vshrn.u16       d16, q8,  #6
 .ifc \type,avg
         vld1.32         {d20[0]}, [lr,:32], r2
         vld1.32         {d20[1]}, [lr,:32], r2
-- 
1.7.6.1
