On Fri, 10 Feb 2017, Janne Grunau wrote:

On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote:
The theoretical maximum value of E is 193, so we can just
saturate the addition to 255.

Before:                     Cortex A7      A8      A9     A53  A53/AArch64
vp9_loop_filter_v_4_8_neon:     143.0   127.7   114.8    88.0         87.7
vp9_loop_filter_v_8_8_neon:     241.0   197.2   173.7   140.0        136.7
vp9_loop_filter_v_16_8_neon:    497.0   419.5   379.7   293.0        275.7
vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0        452.0
After:
vp9_loop_filter_v_4_8_neon:     136.0   125.7   112.6    84.0         83.0
vp9_loop_filter_v_8_8_neon:     234.0   195.5   171.5   136.0        133.7
vp9_loop_filter_v_16_8_neon:    490.0   417.5   377.7   289.0        271.0
vp9_loop_filter_v_16_16_neon:   951.2   814.7   732.3   571.0        446.7
---
 libavcodec/aarch64/vp9lpf_neon.S | 40 +++++++++-------------------------------
 libavcodec/arm/vp9lpf_neon.S     | 11 +++++------
 2 files changed, 14 insertions(+), 37 deletions(-)

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 3b8e6eb..4553173 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -51,13 +51,6 @@
 // see the arm version instead.


-.macro uabdl_sz dst1, dst2, in1, in2, sz
-        uabdl           \dst1,  \in1\().8b,  \in2\().8b
-.ifc \sz, .16b
-        uabdl2          \dst2,  \in1\().16b, \in2\().16b
-.endif
-.endm
-
 .macro add_sz dst1, dst2, in1, in2, in3, in4, sz
         add             \dst1,  \in1,  \in3
 .ifc \sz, .16b
@@ -86,20 +79,6 @@
 .endif
 .endm

-.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
-        cmhs            \dst1,  \in1,  \in3
-.ifc \sz, .16b
-        cmhs            \dst2,  \in2,  \in4
-.endif
-.endm
-
-.macro xtn_sz dst, in1, in2, sz
-        xtn             \dst\().8b,  \in1
-.ifc \sz, .16b
-        xtn2            \dst\().16b, \in2
-.endif
-.endm
-
 .macro usubl_sz dst1, dst2, in1, in2, sz
         usubl           \dst1,  \in1\().8b,  \in2\().8b
 .ifc \sz, .16b
@@ -179,20 +158,20 @@
 // tmpq2 == tmp3 + tmp4, etc.
 .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
 .if \mix == 0
-        dup             v0.8h,  w2        // E
-        dup             v1.8h,  w2        // E
+        dup             v0\sz,  w2        // E
         dup             v2\sz,  w3        // I
         dup             v3\sz,  w4        // H
 .else
-        dup             v0.8h,  w2        // E
+        dup             v0.8b,  w2        // E
         dup             v2.8b,  w3        // I
         dup             v3.8b,  w4        // H
+        lsr             w5,     w2,  #8
         lsr             w6,     w3,  #8
         lsr             w7,     w4,  #8
-        ushr            v1.8h,  v0.8h, #8 // E
+        dup             v1.8b,  w5        // E
         dup             v4.8b,  w6        // I
-        bic             v0.8h,  #255, lsl 8 // E
         dup             v5.8b,  w7        // H
+        trn1            v0.2d,  v0.2d,  v1.2d

isn't this equivalent to

dup  v0.8h, w2
uzp1 v0.16b, v0.16b, v0.16b

on little endian?

Nice idea, but it isn't quite as straightforward on aarch64 - on arm it would have been.

uzp1 writes only the even-indexed bytes of its inputs to its output register, so you need uzp2 as well to get the odd-indexed ones.

So instead of this as we have now:

    dup  v0.8b, w2
    lsr  w5, w2, #8
    dup  v1.8b, w5
    trn1 v0.2d, v0.2d, v1.2d

We could do:

    dup  v0.8h, w2
    uzp2 v1.16b, v0.16b, v0.16b
    uzp1 v0.16b, v0.16b, v0.16b
    trn1 v0.2d, v0.2d, v1.2d

And I'm not sure if that's any more straightforward.

In arm mode, one could have done this though:

    vdup.s16 q0, r2
    vuzp.8  d0, d1

(We don't have a corresponding routine on arm though since we don't have enough register space.)

// Martin
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to