Temporary commit to make reviewing easier; it will be squashed into the
previous commit before pushing.
---
libavcodec/arm/rv40dsp_neon.S | 42 ++++++++++++++++++++++++++++++++--------
1 files changed, 33 insertions(+), 9 deletions(-)
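
For reviewers: the point of this change is that RV40 chroma MC does not
round with a constant 32 the way vrshrn.u16 #6 does. It adds a
position-dependent bias from a 4x4 table indexed by (y>>1, x>>1) and then
shifts down without rounding, as in the C reference in
libavcodec/rv40dsp.c. The bias equals 32 only for some subpel positions,
so the rounding narrowing shift has to be replaced by an explicit vadd of
the bias plus a truncating vshrn. Below is a minimal C sketch of what the
added instructions compute per output pixel; the helper and table names
are illustrative only, not taken from the tree.

#include <stdint.h>

/* The same 4x4 bias table as the rv40bias constant below,
 * one row of four entries per y>>1 value. */
static const uint8_t rv40_bias_tab[16] = {
     0, 16, 32, 16,
    32, 28, 32, 28,
     0, 32, 16, 32,
    32, 28, 32, 28,
};

/* One bilinear chroma sample with the RV40 bias, as the NEON path now
 * computes it: multiply-accumulate into a 16-bit sum (vmull/vmlal),
 * add the bias (vadd.i16), then narrow with a truncating shift
 * (vshrn #6) instead of the rounding vrshrn #6. */
static uint8_t rv40_chroma_sample(const uint8_t *src, int stride,
                                  int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B =      x  * (8 - y);
    const int C = (8 - x) *      y;
    const int D =      x  *      y;
    const int bias = rv40_bias_tab[(y >> 1) * 4 + (x >> 1)];   /* ldrb */
    int sum = A * src[0] + B * src[1]
            + C * src[stride] + D * src[stride + 1];
    return (uint8_t)((sum + bias) >> 6);         /* vadd.i16 + vshrn #6 */
}
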
diff --git a/libavcodec/arm/rv40dsp_neon.S b/libavcodec/arm/rv40dsp_neon.S
index 3a465a2..4a249bb 100644
--- a/libavcodec/arm/rv40dsp_neon.S
+++ b/libavcodec/arm/rv40dsp_neon.S
@@ -20,6 +20,9 @@
#include "asm.S"
+const rv40bias, align=4
+ .byte 0,16,32,16,32,28,32,28,0,32,16,32,32,28,32,28 @ rv40_bias[y>>1][x>>1] from rv40dsp.c, 4 entries per row
+endconst
/* chroma_mc8(uint8_t *dst, uint8_t *src, int stride, int h, int x, int y) */
.macro rv40_chroma_mc8 type
@@ -32,6 +35,12 @@ function ff_\type\()_rv40_chroma_mc8_neon, export=1
pld [r1]
pld [r1, r2]
+ mov r7, r5, lsr #1 @ r7 = y >> 1
+ lsl r7, r7, #2 @ r7 = (y >> 1) * 4
+ add r7, r7, r4, lsr #1 @ r7 = (y >> 1) * 4 + (x >> 1)
+ movrel r6, rv40bias
+ ldrb r6, [r6, r7] @ r6 = rv40bias[(y >> 1) * 4 + (x >> 1)]
+ vdup.16 q11, r6 @ broadcast the bias to all 16-bit lanes
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
@@ -63,15 +72,17 @@ T cmp r7, #0
vmlal.u8 q8, d6, d2
vext.8 d5, d4, d5, #1
vmlal.u8 q8, d7, d3
+ vadd.i16 q8, q8, q11 @ add bias instead of vrshrn's fixed +32 rounding
vmull.u8 q9, d6, d0
subs r3, r3, #2
vmlal.u8 q9, d7, d1
vmlal.u8 q9, d4, d2
vmlal.u8 q9, d5, d3
- vrshrn.u16 d16, q8, #6
+ vadd.i16 q9, q9, q11
+ vshrn.u16 d16, q8, #6 @ truncating shift, bias already added
vld1.64 {d6, d7}, [r5], r4
pld [r1]
- vrshrn.u16 d17, q9, #6
+ vshrn.u16 d17, q9, #6
.ifc \type,avg
vld1.64 {d20}, [lr,:64], r2
vld1.64 {d21}, [lr,:64], r2
@@ -99,12 +110,14 @@ T cmp r7, #0
3: pld [r5]
vmull.u8 q8, d4, d0
vmlal.u8 q8, d6, d1
+ vadd.i16 q8, q8, q11
vld1.64 {d4}, [r1], r4
vmull.u8 q9, d6, d0
vmlal.u8 q9, d4, d1
+ vadd.i16 q9, q9, q11
vld1.64 {d6}, [r5], r4
- vrshrn.u16 d16, q8, #6
- vrshrn.u16 d17, q9, #6
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
.ifc \type,avg
vld1.64 {d20}, [lr,:64], r2
vld1.64 {d21}, [lr,:64], r2
@@ -127,13 +140,15 @@ T cmp r7, #0
subs r3, r3, #2
vmull.u8 q8, d4, d0
vmlal.u8 q8, d5, d1
+ vadd.i16 q8, q8, q11
vld1.64 {d4, d5}, [r1], r2
vmull.u8 q9, d6, d0
vmlal.u8 q9, d7, d1
+ vadd.i16 q9, q9, q11
pld [r1]
vext.8 d5, d4, d5, #1
- vrshrn.u16 d16, q8, #6
- vrshrn.u16 d17, q9, #6
+ vshrn.u16 d16, q8, #6
+ vshrn.u16 d17, q9, #6
.ifc \type,avg
vld1.64 {d20}, [lr,:64], r2
vld1.64 {d21}, [lr,:64], r2
@@ -160,6 +175,12 @@ function ff_\type\()_rv40_chroma_mc4_neon, export=1
pld [r1]
pld [r1, r2]
+ mov r7, r5, lsr #1 @ r7 = y >> 1
+ lsl r7, r7, #2 @ r7 = (y >> 1) * 4
+ add r7, r7, r4, lsr #1 @ r7 = (y >> 1) * 4 + (x >> 1)
+ movrel r6, rv40bias
+ ldrb r6, [r6, r7] @ r6 = rv40bias[(y >> 1) * 4 + (x >> 1)]
+ vdup.16 q11, r6 @ broadcast the bias to all 16-bit lanes
A muls r7, r4, r5
T mul r7, r4, r5
T cmp r7, #0
@@ -200,7 +221,8 @@ T cmp r7, #0
vld1.64 {d6}, [r5], r4
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
- vrshrn.u16 d16, q8, #6
+ vadd.i16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
subs r3, r3, #2
pld [r1]
.ifc \type,avg
@@ -237,7 +259,8 @@ T cmp r7, #0
vld1.32 {d4[1]}, [r5], r4
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
- vrshrn.u16 d16, q8, #6
+ vadd.i16 q8, q8, q11
+ vshrn.u16 d16, q8, #6
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
@@ -266,8 +289,9 @@ T cmp r7, #0
vtrn.32 d4, d5
vadd.i16 d16, d16, d17
vadd.i16 d17, d18, d19
+ vadd.i16 q8, q8, q11
pld [r1]
- vrshrn.u16 d16, q8, #6
+ vshrn.u16 d16, q8, #6
.ifc \type,avg
vld1.32 {d20[0]}, [lr,:32], r2
vld1.32 {d20[1]}, [lr,:32], r2
--
1.7.6.1