Temporary commit to make reviewing easier; it will be squashed into the
previous commit before pushing.
---
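For reviewers: the new rv40bias constant holds the RV40 4x4 rounding-bias
table (the same values as the rv40_bias table used by the C functions),
flattened row-major and indexed with (y >> 1) * 4 + (x >> 1).  RV40 chroma
MC does not round with a fixed +32 before the >> 6; it adds this
position-dependent bias instead, which is why vrshrn.u16 (a rounding
narrowing shift, i.e. an implicit +32) is replaced by an explicit vadd.i16
of the bias followed by a plain vshrn.u16.  A minimal C sketch of what one
output pixel computes after this patch (mc_pixel and its parameter names
are illustrative only, not code from the tree):

    #include <stdint.h>

    static const uint8_t rv40bias[16] = {
         0, 16, 32, 16,
        32, 28, 32, 28,
         0, 32, 16, 32,
        32, 28, 32, 28,
    };

    /* bilinear chroma interpolation with the RV40 rounding bias */
    static inline uint8_t mc_pixel(const uint8_t *src, int stride,
                                   int x, int y)
    {
        const int A = (8 - x) * (8 - y);
        const int B =      x  * (8 - y);
        const int C = (8 - x) *      y;
        const int D =      x  *      y;
        const int bias = rv40bias[(y >> 1) * 4 + (x >> 1)];

        return (A * src[0]      + B * src[1] +
                C * src[stride] + D * src[stride + 1] + bias) >> 6;
    }
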
 1 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 3a465a2..4a249bb 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -20,6 +20,9 @@
 
 #include "asm.S"
 
+const   rv40bias, align=4
+        .byte          0,16,32,16,32,28,32,28,0,32,16,32,32,28,32,28  @ rv40_bias[4][4], row-major
+endconst
 
 /* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
         .macro  rv40_chroma_mc8 type
@@ -32,6 +35,12 @@ function ff_\type\()_rv40_chroma_mc8_neon, export=1
         pld             [r1]
         pld             [r1, r2]
 
+        mov             r7,  r5,  lsr #1        @ r7 = y >> 1
+        lsl             r7,  r7,  #2            @ r7 = (y >> 1) * 4
+        add             r7,  r7,  r4,  lsr #1   @ r7 = (y >> 1) * 4 + (x >> 1)
+        movrel          r6,  rv40bias
+        ldrb            r6,  [r6, r7]           @ r6 = rv40bias[r7]
+        vdup.16         q11, r6                 @ bias in every 16-bit lane
 A       muls            r7,  r4,  r5
 T       mul             r7,  r4,  r5
 T       cmp             r7,  #0
@@ -63,15 +72,17 @@ T       cmp             r7,  #0
         vmlal.u8        q8,  d6,  d2
         vext.8          d5,  d4,  d5,  #1
         vmlal.u8        q8,  d7,  d3
+        vadd.i16        q8,  q8,  q11
         vmull.u8        q9,  d6,  d0
         subs            r3,  r3,  #2
         vmlal.u8        q9,  d7,  d1
         vmlal.u8        q9,  d4,  d2
         vmlal.u8        q9,  d5,  d3
-        vrshrn.u16      d16, q8,  #6
+        vadd.i16        q9,  q9,  q11
+        vshrn.u16       d16, q8,  #6
         vld1.64         {d6, d7}, [r5], r4
         pld             [r1]
-        vrshrn.u16      d17, q9,  #6
+        vshrn.u16       d17, q9,  #6
 .ifc \type,avg
         vld1.64         {d20}, [lr,:64], r2
         vld1.64         {d21}, [lr,:64], r2
@@ -99,12 +110,14 @@ T       cmp             r7,  #0
 3:      pld             [r5]
         vmull.u8        q8,  d4,  d0
         vmlal.u8        q8,  d6,  d1
+        vadd.i16        q8,  q8,  q11
         vld1.64         {d4}, [r1], r4
         vmull.u8        q9,  d6,  d0
         vmlal.u8        q9,  d4,  d1
+        vadd.i16        q9,  q9,  q11
         vld1.64         {d6}, [r5], r4
-        vrshrn.u16      d16, q8,  #6
-        vrshrn.u16      d17, q9,  #6
+        vshrn.u16       d16, q8,  #6
+        vshrn.u16       d17, q9,  #6
 .ifc \type,avg
         vld1.64         {d20}, [lr,:64], r2
         vld1.64         {d21}, [lr,:64], r2
@@ -127,13 +140,15 @@ T       cmp             r7,  #0
         subs            r3,  r3,  #2
         vmull.u8        q8,  d4,  d0
         vmlal.u8        q8,  d5,  d1
+        vadd.i16        q8,  q8,  q11
         vld1.64         {d4, d5}, [r1], r2
         vmull.u8        q9,  d6,  d0
         vmlal.u8        q9,  d7,  d1
+        vadd.i16        q9,  q9,  q11
         pld             [r1]
         vext.8          d5,  d4,  d5,  #1
-        vrshrn.u16      d16, q8,  #6
-        vrshrn.u16      d17, q9,  #6
+        vshrn.u16       d16, q8,  #6
+        vshrn.u16       d17, q9,  #6
 .ifc \type,avg
         vld1.64         {d20}, [lr,:64], r2
         vld1.64         {d21}, [lr,:64], r2
@@ -160,6 +175,12 @@ function ff_\type\()_rv40_chroma_mc4_neon, export=1
         pld             [r1]
         pld             [r1, r2]
 
+        mov             r7,  r5,  lsr #1        @ r7 = y >> 1
+        lsl             r7,  r7,  #2            @ r7 = (y >> 1) * 4
+        add             r7,  r7,  r4,  lsr #1   @ r7 = (y >> 1) * 4 + (x >> 1)
+        movrel          r6,  rv40bias
+        ldrb            r6,  [r6, r7]           @ r6 = rv40bias[r7]
+        vdup.16         q11, r6                 @ bias in every 16-bit lane
 A       muls            r7,  r4,  r5
 T       mul             r7,  r4,  r5
 T       cmp             r7,  #0
@@ -200,7 +221,8 @@ T       cmp             r7,  #0
         vld1.64         {d6},     [r5], r4
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
-        vrshrn.u16      d16, q8,  #6
+        vadd.i16        q8,  q8,  q11
+        vshrn.u16       d16, q8,  #6
         subs            r3,  r3,  #2
         pld             [r1]
 .ifc \type,avg
@@ -237,7 +259,8 @@ T       cmp             r7,  #0
         vld1.32         {d4[1]},  [r5], r4
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
-        vrshrn.u16      d16, q8,  #6
+        vadd.i16        q8,  q8,  q11
+        vshrn.u16       d16, q8,  #6
 .ifc \type,avg
         vld1.32         {d20[0]}, [lr,:32], r2
         vld1.32         {d20[1]}, [lr,:32], r2
@@ -266,8 +289,9 @@ T       cmp             r7,  #0
         vtrn.32         d4,  d5
         vadd.i16        d16, d16, d17
         vadd.i16        d17, d18, d19
+        vadd.i16        q8,  q8,  q11
         pld             [r1]
-        vrshrn.u16      d16, q8,  #6
+        vshrn.u16       d16, q8,  #6
 .ifc \type,avg
         vld1.32         {d20[0]}, [lr,:32], r2
         vld1.32         {d20[1]}, [lr,:32], r2
-- 
1.7.6.1
