On Thu, 27 Apr 2017, Alexandra Hájková wrote:
---
libavcodec/arm/hevc_idct.S | 89 +++++++++++++++++++++++++++++++++++++++
libavcodec/arm/hevcdsp_init_arm.c | 13 ++++++
2 files changed, 102 insertions(+)
diff --git a/libavcodec/arm/hevc_idct.S b/libavcodec/arm/hevc_idct.S
index 3966e93..14af40f 100644
--- a/libavcodec/arm/hevc_idct.S
+++ b/libavcodec/arm/hevc_idct.S
@@ -50,6 +50,32 @@ function ff_hevc_add_residual_4x4_8_neon, export=1
bx lr
endfunc
+@ Clamp the 16-bit lanes of \in1 and \in2 to the range held in \min/\max
+@ (for 10-bit content: \min = 0, \max = 0x3FF). The bound registers are
+@ parameters with defaults so existing two-argument call sites keep
+@ working, while a caller may restructure and pass different registers.
+.macro clip10 in1, in2, min=q12, max=q13
+ vmax.s16 \in1, \in1, \min
+ vmax.s16 \in2, \in2, \min
+ vmin.s16 \in1, \in1, \max
+ vmin.s16 \in2, \in2, \max
+.endm
I'd rather move this macro to the top, above the first function (4x4_8).
Also I think it might be better to add parameters for the max/min
registers instead of hardcoding q12/q13 - this would be useful if one
wants to restructure one of the functions but not the others. And what's
with the c1/c2 unused parameters?
+
+
+@ Add a 4x4 block of 16-bit residuals (r1) to 10-bit destination pixels
+@ (r0, stride r2), saturate, clamp to [0, 0x3FF] and store back.
+function ff_hevc_add_residual_4x4_10_neon, export=1
+ vld1.16 {q0-q1}, [r1, :128] @ q0-q1 = 16 residual coefficients
+ mov r12, r0 @ r12 walks dst for loads; r0 stays for stores
+ vld1.16 d4, [r12, :64], r2 @ load dst row 0
It might be better to do the mov before the first vld1.16
+ vld1.16 d5, [r12, :64], r2 @ load dst row 1
+ vld1.16 d6, [r12, :64], r2 @ load dst row 2
+ vld1.16 d7, [r12, :64], r2 @ load dst row 3
+ vqadd.s16 q0, q2 @ dst + residual, saturating
+ vqadd.s16 q1, q3
+ vmov.s16 q12, #0 @ lower clip bound
+ vmov.s16 q13, #0x3FF @ upper clip bound (10-bit max)
+ clip10 q0, q1 @ clamp results to [0, 1023]
+ vst1.16 d0, [r0, :64], r2 @ store row 0
+ vst1.16 d1, [r0, :64], r2 @ store row 1
+ vst1.16 d2, [r0, :64], r2 @ store row 2
+ vst1.16 d3, [r0, :64], r2 @ store row 3
+ bx lr
+endfunc
+
+
I tested tweaking this a bit, and this ordering seems to be a few cycles
faster:
mov r12, r0
vld1.16 {q0-q1}, [r1, :128]
vld1.16 d4, [r12, :64], r2
vld1.16 d5, [r12, :64], r2
vld1.16 d6, [r12, :64], r2
vqadd.s16 q0, q2
vld1.16 d7, [r12, :64], r2
vmov.s16 q12, #0
vqadd.s16 q1, q3
vmov.s16 q13, #0x3FF
clip10 q0, q1
vst1.16 d0, [r0, :64], r2
vst1.16 d1, [r0, :64], r2
vst1.16 d2, [r0, :64], r2
vst1.16 d3, [r0, :64], r2
function ff_hevc_add_residual_8x8_8_neon, export=1
mov r3, #8
1: subs r3, #2
@@ -69,6 +95,24 @@ function ff_hevc_add_residual_8x8_8_neon, export=1
bx lr
endfunc
+function ff_hevc_add_residual_8x8_10_neon, export=1
+ mov r3, #8
+ vmov.s16 q12, #0
+ vmov.s16 q13, #0x3FF
+1: subs r3, #2
+ vld1.16 {q0-q1}, [r1, :128]!
+ vld1.16 {q8}, [r0, :128]
+ add r12, r0, r2
+ vld1.16 {q9}, [r12, :128]
I would think it's better to use r0/r12 in the same way as in the 16x16
version here - I guess I should have asked for that in the 8 bit version
as well. Oh well.
Overall looks mostly straightforward.
// Martin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel