On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote:
> The theoretical maximum value of E is 193, so we can just
> saturate the addition to 255.
> 
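The claim is easy to verify exhaustively in scalar code: the sum
abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) can reach 2 * 255 + 127 = 637,
but with E <= 193 < 255, clamping the sum to 255 never flips the
"<= E" result. A minimal C sketch; the qadd8 helper name is mine, it
just models what one uqadd.u8/vqadd.u8 lane computes:

    #include <assert.h>
    #include <stdint.h>

    /* saturating 8-bit add: one uqadd.u8/vqadd.u8 lane */
    static uint8_t qadd8(uint8_t a, uint8_t b)
    {
        unsigned s = (unsigned)a + b;
        return s > 255 ? 255 : (uint8_t)s;
    }

    int main(void)
    {
        for (unsigned d0 = 0; d0 < 256; d0++)         /* abs(p0 - q0) */
            for (unsigned d1 = 0; d1 < 256; d1++) {   /* abs(p1 - q1) */
                unsigned wide = 2 * d0 + (d1 >> 1);   /* old 16-bit sum */
                uint8_t  sat  = qadd8(qadd8(d0, d0), (uint8_t)(d1 >> 1));
                for (unsigned E = 0; E <= 193; E++)
                    assert((wide <= E) == (sat <= E));
            }
        return 0;
    }

Note that qadd8(qadd8(a, a), b) == min(2 * a + b, 255), so the two
saturating adds behave like a single clamp at the end.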
> Before:                     Cortex A7      A8      A9     A53  A53/AArch64
> vp9_loop_filter_v_4_8_neon:     143.0   127.7   114.8    88.0         87.7
> vp9_loop_filter_v_8_8_neon:     241.0   197.2   173.7   140.0        136.7
> vp9_loop_filter_v_16_8_neon:    497.0   419.5   379.7   293.0        275.7
> vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0        452.0
> After:
> vp9_loop_filter_v_4_8_neon:     136.0   125.7   112.6    84.0         83.0
> vp9_loop_filter_v_8_8_neon:     234.0   195.5   171.5   136.0        133.7
> vp9_loop_filter_v_16_8_neon:    490.0   417.5   377.7   289.0        271.0
> vp9_loop_filter_v_16_16_neon:   951.2   814.7   732.3   571.0        446.7
> ---
>  libavcodec/aarch64/vp9lpf_neon.S | 40 +++++++++-------------------------------
>  libavcodec/arm/vp9lpf_neon.S     | 11 +++++------
>  2 files changed, 14 insertions(+), 37 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
> index 3b8e6eb..4553173 100644
> --- a/libavcodec/aarch64/vp9lpf_neon.S
> +++ b/libavcodec/aarch64/vp9lpf_neon.S
> @@ -51,13 +51,6 @@
>  // see the arm version instead.
>  
>  
> -.macro uabdl_sz dst1, dst2, in1, in2, sz
> -        uabdl           \dst1,  \in1\().8b,  \in2\().8b
> -.ifc \sz, .16b
> -        uabdl2          \dst2,  \in1\().16b, \in2\().16b
> -.endif
> -.endm
> -
>  .macro add_sz dst1, dst2, in1, in2, in3, in4, sz
>          add             \dst1,  \in1,  \in3
>  .ifc \sz, .16b
> @@ -86,20 +79,6 @@
>  .endif
>  .endm
>  
> -.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
> -        cmhs            \dst1,  \in1,  \in3
> -.ifc \sz, .16b
> -        cmhs            \dst2,  \in2,  \in4
> -.endif
> -.endm
> -
> -.macro xtn_sz dst, in1, in2, sz
> -        xtn             \dst\().8b,  \in1
> -.ifc \sz, .16b
> -        xtn2            \dst\().16b, \in2
> -.endif
> -.endm
> -
>  .macro usubl_sz dst1, dst2, in1, in2, sz
>          usubl           \dst1,  \in1\().8b,  \in2\().8b
>  .ifc \sz, .16b
> @@ -179,20 +158,20 @@
>  // tmpq2 == tmp3 + tmp4, etc.
>  .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
>  .if \mix == 0
> -        dup             v0.8h,  w2        // E
> -        dup             v1.8h,  w2        // E
> +        dup             v0\sz,  w2        // E
>          dup             v2\sz,  w3        // I
>          dup             v3\sz,  w4        // H
>  .else
> -        dup             v0.8h,  w2        // E
> +        dup             v0.8b,  w2        // E
>          dup             v2.8b,  w3        // I
>          dup             v3.8b,  w4        // H
> +        lsr             w5,     w2,  #8
>          lsr             w6,     w3,  #8
>          lsr             w7,     w4,  #8
> -        ushr            v1.8h,  v0.8h, #8 // E
> +        dup             v1.8b,  w5        // E
>          dup             v4.8b,  w6        // I
> -        bic             v0.8h,  #255, lsl 8 // E
>          dup             v5.8b,  w7        // H
> +        trn1            v0.2d,  v0.2d,  v1.2d

Isn't this equivalent to

dup  v0.8h, w2
uzp1 v0.16b, v0.16b, v0.16b

on little endian?
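
As a sanity check I modelled the two shuffles in C; the helper names
are mine and this is just my reading of the lane semantics on little
endian, not code from the patch:

    #include <stdint.h>
    #include <stdio.h>

    static void dup_8h(uint8_t v[16], uint16_t w)   /* dup v.8h, w */
    {
        for (int i = 0; i < 16; i += 2) {
            v[i]     = (uint8_t)(w & 0xff);         /* low byte first on LE */
            v[i + 1] = (uint8_t)(w >> 8);
        }
    }

    static void uzp1_16b(uint8_t d[16], const uint8_t n[16],
                         const uint8_t m[16])
    {
        uint8_t t[16];
        for (int i = 0; i < 8; i++) {
            t[i]     = n[2 * i];                    /* even bytes of n */
            t[8 + i] = m[2 * i];                    /* even bytes of m */
        }
        for (int i = 0; i < 16; i++)
            d[i] = t[i];
    }

    int main(void)
    {
        uint8_t v0[16];
        dup_8h(v0, 0xbbaa);         /* E = 0xaa, E >> 8 = 0xbb */
        uzp1_16b(v0, v0, v0);
        for (int i = 0; i < 16; i++)
            printf("%02x ", v0[i]); /* the trn1 version wants aa x8, bb x8 */
        printf("\n");
        return 0;
    }

If that model is right, uzp1 with both operands equal keeps only the
even-indexed bytes and prints aa in all 16 lanes, so the second uzp1
operand would need e.g. a rev16 first to produce the E >> 8 half.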

>          trn1            v2.2d,  v2.2d,  v4.2d
>          trn1            v3.2d,  v3.2d,  v5.2d
>  .endif
> @@ -206,16 +185,15 @@
>          umax            v4\sz,  v4\sz,  v5\sz
>          umax            v5\sz,  v6\sz,  v7\sz
>          umax            \tmp1\sz, \tmp1\sz, \tmp2\sz
> -        uabdl_sz        v6.8h,  v7.8h,  v23, v24, \sz // abs(p0 - q0)
> +        uabd            v6\sz,  v23\sz, v24\sz        // abs(p0 - q0)
>          umax            v4\sz,  v4\sz,  v5\sz
> -        add_sz          v6.8h,  v7.8h,  v6.8h,  v7.8h,  v6.8h,  v7.8h, \sz // abs(p0 - q0) * 2
> +        uqadd           v6\sz,  v6\sz,  v6\sz         // abs(p0 - q0) * 2
>          uabd            v5\sz,  v22\sz, v25\sz        // abs(p1 - q1)
>          umax            v4\sz,  v4\sz,  \tmp1\sz      // max(abs(p3 - p2), ..., abs(q2 - q3))
>          ushr            v5\sz,  v5\sz,  #1
>          cmhs            v4\sz,  v2\sz,  v4\sz         // max(abs()) <= I
> -        uaddw_sz        v6.8h,  v7.8h,  v6.8h,  v7.8h,  v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
> -        cmhs_sz         v6.8h,  v7.8h,  v0.8h,  v1.8h,  v6.8h,  v7.8h, \sz
> -        xtn_sz          v5,     v6.8h,  v7.8h,  \sz
> +        uqadd           v6\sz,  v6\sz,  v5\sz         // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
> +        cmhs            v5\sz,  v0\sz,  v6\sz
>          and             v4\sz,  v4\sz,  v5\sz         // fm
>  
>          // If no pixels need filtering, just exit as soon as possible
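
For readers following along, the per-pixel condition this block
evaluates, both before and after the patch, is the following in scalar
form (a sketch, the function name is mine, following the comments in
the code above):

    #include <stdlib.h>

    /* fm: nonzero if the pixel needs filtering at all */
    static int filter_mask(int p3, int p2, int p1, int p0,
                           int q0, int q1, int q2, int q3, int E, int I)
    {
        int m = abs(p3 - p2);
        if (abs(p2 - p1) > m) m = abs(p2 - p1);
        if (abs(p1 - p0) > m) m = abs(p1 - p0);
        if (abs(q0 - q1) > m) m = abs(q0 - q1);
        if (abs(q1 - q2) > m) m = abs(q1 - q2);
        if (abs(q2 - q3) > m) m = abs(q2 - q3);
        return m <= I && abs(p0 - q0) * 2 + (abs(p1 - q1) >> 1) <= E;
    }

The patch only changes how the second comparison is evaluated (8-bit
saturating adds instead of widening to 16 bit); the result is the same.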
> diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
> index c57c0e9..5e154f6 100644
> --- a/libavcodec/arm/vp9lpf_neon.S
> +++ b/libavcodec/arm/vp9lpf_neon.S
> @@ -51,7 +51,7 @@
>  @ and d28-d31 as temp registers, or d8-d15.
>  @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
>  .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
> -        vdup.u16        q0,  r2 @ E
> +        vdup.u8         d0,  r2 @ E
>          vdup.u8         d2,  r3 @ I
>          ldr             r3,  [sp]
>  
> @@ -64,16 +64,15 @@
>          vmax.u8         d4,  d4,  d5
>          vmax.u8         d5,  d6,  d7
>          vmax.u8         \tmp1,  \tmp1,  \tmp2
> -        vabdl.u8        q3,  d23, d24    @ abs(p0 - q0)
> +        vabd.u8         d6,  d23, d24    @ abs(p0 - q0)
>          vmax.u8         d4,  d4,  d5
> -        vadd.u16        q3,  q3,  q3     @ abs(p0 - q0) * 2
> +        vqadd.u8        d6,  d6,  d6     @ abs(p0 - q0) * 2
>          vabd.u8         d5,  d22, d25    @ abs(p1 - q1)
>          vmax.u8         d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
>          vshr.u8         d5,  d5,  #1
>          vcle.u8         d4,  d4,  d2     @ max(abs()) <= I
> -        vaddw.u8        q3,  q3,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
> -        vcle.u16        q3,  q3,  q0
> -        vmovn.u16       d5,  q3
> +        vqadd.u8        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
> +        vcle.u8         d5,  d6,  d0
>          vand            d4,  d4,  d5     @ fm
>  
>          vdup.u8         d3,  r3          @ H

Otherwise OK.

Janne