On Thu, 27 Apr 2017, Alexandra Hájková wrote:

---
libavcodec/arm/hevc_idct.S        | 89 +++++++++++++++++++++++++++++++++++++++
libavcodec/arm/hevcdsp_init_arm.c | 13 ++++++
2 files changed, 102 insertions(+)

diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 3966e93..14af40f 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -50,6 +50,32 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
        bx              lr
endfunc

+.macro clip10 in1, in2, c1, c2
+        vmax.s16        \in1, \in1, q12
+        vmax.s16        \in2, \in2, q12
+        vmin.s16        \in1, \in1, q13
+        vmin.s16        \in2, \in2, q13
+.endm

I'd rather move this macro to the top, above the first function (4x4_8).

Also, I think it might be better to add parameters for the max/min registers instead of hardcoding q12/q13 - this would be useful if one wants to restructure one of the functions but not the others. And what's with the unused c1/c2 parameters?

+
+function ff_hevc_add_residual_4x4_10_neon, export=1
+        vld1.16         {q0-q1}, [r1, :128]
+        mov             r12, r0
+        vld1.16         d4, [r12, :64], r2

It might be better to do the mov before the first vld1.16.

+        vld1.16         d5, [r12, :64], r2
+        vld1.16         d6, [r12, :64], r2
+        vld1.16         d7, [r12, :64], r2
+        vqadd.s16       q0, q2
+        vqadd.s16       q1, q3
+        vmov.s16        q12, #0
+        vmov.s16        q13, #0x3FF
+        clip10          q0, q1
+        vst1.16         d0, [r0, :64], r2
+        vst1.16         d1, [r0, :64], r2
+        vst1.16         d2, [r0, :64], r2
+        vst1.16         d3, [r0, :64], r2
+        bx              lr
+endfunc
+

I tested tweaking this a bit, and this ordering seems to be a few cycles faster:

        mov             r12, r0
        vld1.16         {q0-q1}, [r1, :128]
        vld1.16         d4, [r12, :64], r2
        vld1.16         d5, [r12, :64], r2
        vld1.16         d6, [r12, :64], r2
        vqadd.s16       q0, q2
        vld1.16         d7, [r12, :64], r2
        vmov.s16        q12, #0
        vqadd.s16       q1, q3
        vmov.s16        q13, #0x3FF
        clip10          q0, q1
        vst1.16         d0, [r0, :64], r2
        vst1.16         d1, [r0, :64], r2
        vst1.16         d2, [r0, :64], r2
        vst1.16         d3, [r0, :64], r2


function ff_hevc_add_residual_8x8_8_neon, export=1
        mov             r3,   #8
1:      subs            r3,   #2
@@ -69,6 +95,24 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
        bx              lr
endfunc

+function ff_hevc_add_residual_8x8_10_neon, export=1
+        mov             r3,  #8
+        vmov.s16        q12, #0
+        vmov.s16        q13, #0x3FF
+1:      subs            r3,  #2
+        vld1.16         {q0-q1}, [r1, :128]!
+        vld1.16         {q8},    [r0, :128]
+        add             r12, r0, r2
+        vld1.16         {q9},    [r12, :128]

I would think it's better to use r0/r12 in the same way as in the 16x16 version here - I guess I should have asked for that in the 8-bit version as well. Oh well.

Overall looks mostly straightforward.

// Martin
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to