Overall 3% faster, idct_add down from 340 to 101 cycles, idct_dc_add
down from 96 to 54 cycles.
---
 libavcodec/arm/rv34dsp_init_neon.c |    6 +++
 libavcodec/arm/rv34dsp_neon.S      |   65 ++++++++++++++++++++++++++++++++++++
 2 files changed, 71 insertions(+), 0 deletions(-)

diff --git a/libavcodec/arm/rv34dsp_init_neon.c b/libavcodec/arm/rv34dsp_init_neon.c
index 3984d43..744818c 100644
--- a/libavcodec/arm/rv34dsp_init_neon.c
+++ b/libavcodec/arm/rv34dsp_init_neon.c
@@ -27,8 +27,14 @@ void ff_rv34_inv_transform_noround_neon(DCTELEM *block);
 
 void ff_rv34_inv_transform_noround_dc_neon(DCTELEM *block);
 
+void ff_rv34_idct_add_neon(uint8_t *dst, int stride, DCTELEM *block);
+void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc);
+
 void ff_rv34dsp_init_neon(RV34DSPContext *c, DSPContext* dsp)
 {
     c->rv34_inv_transform    = ff_rv34_inv_transform_noround_neon;
     c->rv34_inv_transform_dc = ff_rv34_inv_transform_noround_dc_neon;
+
+    c->rv34_idct_add    = ff_rv34_idct_add_neon;    /* transform block, add to dst */
+    c->rv34_idct_dc_add = ff_rv34_idct_dc_add_neon; /* DC-only variant, takes int dc */
 }
diff --git a/libavcodec/arm/rv34dsp_neon.S b/libavcodec/arm/rv34dsp_neon.S
index a156412..f3ae0a6 100644
--- a/libavcodec/arm/rv34dsp_neon.S
+++ b/libavcodec/arm/rv34dsp_neon.S
@@ -66,6 +66,42 @@
         vsub.s32        q15, q14, q9    @ z0 - z3
 .endm
 
+/* void ff_rv34_idct_add_neon(uint8_t *dst, int stride, DCTELEM *block) */
+function ff_rv34_idct_add_neon, export=1
+        mov             r3,  r0         @ r3 = dst, kept for the final stores
+        mov             r0,  r2         @ r0 = block (presumably the macro's input)
+        rv34_inv_transform
+        mov             r0,  r3         @ r0 = dst, advanced by the row loads
+        vrshrn.s32      d22, q15, #10   @ (z0 - z3) >> 10, before q15 is reloaded
+        vmov.i16        q12, #0         @ zero without reading the old q12/q13
+        vmov.i16        q13, #0
+        vld4.8          {d28[0], d29[0], d30[0], d31[0]}, [r0], r1  @ dst row 0
+        vld4.8          {d28[1], d29[1], d30[1], d31[1]}, [r0], r1  @ dst row 1
+        vld4.8          {d28[2], d29[2], d30[2], d31[2]}, [r0], r1  @ dst row 2
+        vld4.8          {d28[3], d29[3], d30[3], d31[3]}, [r0], r1  @ dst row 3
+        vrshrn.s32      d16, q1,  #10   @ (z0 + z3) >> 10
+        vrshrn.s32      d18, q2,  #10   @ (z1 + z2) >> 10
+        vrshrn.s32      d20, q3,  #10   @ (z1 - z2) >> 10
+        vst1.16         {q12-q13}, [r2,:128]    @ memset(block, 0, 16*2)
+        vmovl.u8        q0,  d28        @ widen dst bytes to 16 bit; q1-q3 are
+        vmovl.u8        q1,  d29        @ free again after the narrowing above
+        vmovl.u8        q2,  d30        @ only lanes 0-3 (d0, d2, d4, d6) carry
+        vmovl.u8        q3,  d31        @ loaded pixels
+        vadd.s16        d24, d0,  d16   @ dst[i*4 + 0] + ((z0 + z3) >> 10)
+        vadd.s16        d26, d2,  d18   @ dst[i*4 + 1] + ((z1 + z2) >> 10)
+        vadd.s16        d28, d4,  d20   @ dst[i*4 + 2] + ((z1 - z2) >> 10)
+        vadd.s16        d30, d6,  d22   @ dst[i*4 + 3] + ((z0 - z3) >> 10)
+        vqmovun.s16     d0,  q12        @ saturate to unsigned 8 bit; only
+        vqmovun.s16     d1,  q13        @ lanes 0-3 of each result are stored
+        vqmovun.s16     d2,  q14
+        vqmovun.s16     d3,  q15
+        vst4.8          {d0[0], d1[0], d2[0], d3[0]}, [r3], r1  @ dst row 0
+        vst4.8          {d0[1], d1[1], d2[1], d3[1]}, [r3], r1  @ dst row 1
+        vst4.8          {d0[2], d1[2], d2[2], d3[2]}, [r3], r1  @ dst row 2
+        vst4.8          {d0[3], d1[3], d2[3], d3[3]}, [r3], r1  @ dst row 3
+        bx              lr
+endfunc
+
 /* void rv34_inv_transform_noround_neon(DCTELEM *block); */
 function ff_rv34_inv_transform_noround_neon, export=1
         rv34_inv_transform
@@ -88,6 +124,35 @@ function ff_rv34_inv_transform_noround_neon, export=1
         bx              lr
 endfunc
 
+/* void ff_rv34_idct_dc_add_neon(uint8_t *dst, int stride, int dc) */
+function ff_rv34_idct_dc_add_neon, export=1
+        mov             r3,  r0         @ r3 = dst for the stores; r0 is advanced by the loads
+        vld4.8          {d28[0], d29[0], d30[0], d31[0]}, [r0], r1  @ dst row 0, one byte per register
+        vld4.8          {d28[1], d29[1], d30[1], d31[1]}, [r0], r1  @ dst row 1
+        vld4.8          {d28[2], d29[2], d30[2], d31[2]}, [r0], r1  @ dst row 2
+        vld4.8          {d28[3], d29[3], d30[3], d31[3]}, [r0], r1  @ dst row 3
+        vdup.16         d0,  r2         @ low 16 bits of dc in every lane
+        vmov.s16        d1,  #169       @ 169 = 13 * 13
+        vmovl.u8        q8,  d28        @ widen dst bytes to 16 bit (lanes 0-3 valid)
+        vmull.s16       q1,  d0,  d1    @ dc * 13 * 13
+        vmovl.u8        q9,  d29
+        vrshrn.s32      d0,  q1,  #10   @ (dc * 13 * 13 + 0x200) >> 10
+        vmovl.u8        q10, d30
+        vmovl.u8        q11, d31
+        vadd.s16        d4,  d0,  d16   @ add the rounded dc term to each pixel
+        vadd.s16        d6,  d0,  d18
+        vadd.s16        d24, d0,  d20
+        vadd.s16        d26, d0,  d22
+        vqmovun.s16     d28, q2         @ saturate to unsigned 8 bit; only
+        vqmovun.s16     d29, q3         @ lanes 0-3 of each result are stored
+        vqmovun.s16     d30, q12
+        vqmovun.s16     d31, q13
+        vst4.8          {d28[0], d29[0], d30[0], d31[0]}, [r3], r1  @ dst row 0
+        vst4.8          {d28[1], d29[1], d30[1], d31[1]}, [r3], r1  @ dst row 1
+        vst4.8          {d28[2], d29[2], d30[2], d31[2]}, [r3], r1  @ dst row 2
+        vst4.8          {d28[3], d29[3], d30[3], d31[3]}, [r3], r1  @ dst row 3
+        bx              lr
+endfunc
 
 /* void rv34_inv_transform_dc_noround_c(DCTELEM *block) */
 function ff_rv34_inv_transform_noround_dc_neon, export=1
-- 
1.7.8.3

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to