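The theoretical maximum value of E is 193, so it fits in 8 bits and we can
just saturate the addition to 255 instead of widening the comparison to
16 bits; a sum that saturates at 255 still compares as greater than E.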
Before: Cortex A7 A8 A9 A53 A53/AArch64
vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.8 88.0 87.7
vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0 136.7
vp9_loop_filter_v_16_8_neon: 497.0 419.5 379.7 293.0 275.7
vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0 452.0
After:
vp9_loop_filter_v_4_8_neon: 136.0 125.7 112.6 84.0 83.0
vp9_loop_filter_v_8_8_neon: 234.0 195.5 171.5 136.0 133.7
vp9_loop_filter_v_16_8_neon: 490.0 417.5 377.7 289.0 271.0
vp9_loop_filter_v_16_16_neon: 951.2 814.7 732.3 571.0 446.7
---
libavcodec/aarch64/vp9lpf_neon.S | 40 +++++++++-------------------------------
libavcodec/arm/vp9lpf_neon.S | 11 +++++------
2 files changed, 14 insertions(+), 37 deletions(-)
diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 3b8e6eb..4553173 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -51,13 +51,6 @@
// see the arm version instead.
-.macro uabdl_sz dst1, dst2, in1, in2, sz
- uabdl \dst1, \in1\().8b, \in2\().8b
-.ifc \sz, .16b
- uabdl2 \dst2, \in1\().16b, \in2\().16b
-.endif
-.endm
-
.macro add_sz dst1, dst2, in1, in2, in3, in4, sz
add \dst1, \in1, \in3
.ifc \sz, .16b
@@ -86,20 +79,6 @@
.endif
.endm
-.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
- cmhs \dst1, \in1, \in3
-.ifc \sz, .16b
- cmhs \dst2, \in2, \in4
-.endif
-.endm
-
-.macro xtn_sz dst, in1, in2, sz
- xtn \dst\().8b, \in1
-.ifc \sz, .16b
- xtn2 \dst\().16b, \in2
-.endif
-.endm
-
.macro usubl_sz dst1, dst2, in1, in2, sz
usubl \dst1, \in1\().8b, \in2\().8b
.ifc \sz, .16b
@@ -179,20 +158,20 @@
// tmpq2 == tmp3 + tmp4, etc.
.macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
.if \mix == 0
- dup v0.8h, w2 // E
- dup v1.8h, w2 // E
+ dup v0\sz, w2 // E
dup v2\sz, w3 // I
dup v3\sz, w4 // H
.else
- dup v0.8h, w2 // E
+ dup v0.8b, w2 // E
dup v2.8b, w3 // I
dup v3.8b, w4 // H
+ lsr w5, w2, #8
lsr w6, w3, #8
lsr w7, w4, #8
- ushr v1.8h, v0.8h, #8 // E
+ dup v1.8b, w5 // E
dup v4.8b, w6 // I
- bic v0.8h, #255, lsl 8 // E
dup v5.8b, w7 // H
+ trn1 v0.2d, v0.2d, v1.2d