On 2017-01-15 22:55:47 +0200, Martin Storsjö wrote:
> Previously we first calculated hev, and then negated it. Now we
> calculate !hev directly by using the inverse comparison.
> 
> Since the negation could already be scheduled in the middle of
> another calculation, this does not give a gain in every case.
> 
> Before:                     Cortex A7      A8      A9     A53  A53/AArch64
> vp9_loop_filter_v_4_8_neon:     147.0   129.0   115.8    89.0         88.7
> vp9_loop_filter_v_8_8_neon:     242.0   198.5   174.7   140.0        136.7
> vp9_loop_filter_v_16_8_neon:    500.0   419.5   382.7   293.0        275.7
> vp9_loop_filter_v_16_16_neon:   971.2   825.5   731.5   579.0        453.0
> After:
> vp9_loop_filter_v_4_8_neon:     143.0   127.7   114.8    88.0         87.7
> vp9_loop_filter_v_8_8_neon:     241.0   197.2   173.7   140.0        136.7
> vp9_loop_filter_v_16_8_neon:    497.0   419.5   379.7   293.0        275.7
> vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0        452.0
> ---
>  libavcodec/aarch64/vp9lpf_neon.S | 5 ++---
>  libavcodec/arm/vp9lpf_neon.S     | 5 ++---
>  2 files changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
> index e9c7d9e..3b8e6eb 100644
> --- a/libavcodec/aarch64/vp9lpf_neon.S
> +++ b/libavcodec/aarch64/vp9lpf_neon.S
> @@ -292,7 +292,7 @@
>  .if \mix != 0
>          sxtl            v1.8h,  v1.8b
>  .endif
> -        cmhi            v5\sz,  v5\sz,  v3\sz  // hev
> +        cmhs            v5\sz,  v3\sz,  v5\sz  // !hev
>  .if \wd == 8
>          // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
>  .if \mix != 0
> @@ -306,11 +306,10 @@
>  .elseif \wd == 8
>          bic             v4\sz,  v4\sz,  v6\sz  // fm && !flat8in
>  .endif
> -        mvn             v5\sz,  v5\sz          // !hev
> +        and             v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
>  .if \wd == 16
>          and             v7\sz,  v7\sz,  v6\sz  // flat8out && flat8in && fm
>  .endif
> -        and             v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
>  
>          mul_sz          \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h,  \tmp4\().8h,  \tmp5\().8h,  \tmp5\().8h, \sz // 3 * (q0 - p0)
>          bic             \tmp1\sz,  \tmp1\sz,  v5\sz    // if (!hev) av_clip_int8 = 0
> diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
> index fbf2901..c57c0e9 100644
> --- a/libavcodec/arm/vp9lpf_neon.S
> +++ b/libavcodec/arm/vp9lpf_neon.S
> @@ -141,7 +141,7 @@
>  .if \wd == 8
>          vcle.u8         d6,  d6,  d0            @ flat8in
>  .endif
> -        vcgt.u8         d5,  d5,  d3            @ hev
> +        vcle.u8         d5,  d5,  d3            @ !hev
>  .if \wd == 8
>          vand            d6,  d6,  d4            @ flat8in && fm
>  .endif
> @@ -151,11 +151,10 @@
>  .elseif \wd == 8
>          vbic            d4,  d4,  d6            @ fm && !flat8in
>  .endif
> -        vmvn            d5,  d5                 @ !hev
> +        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
>  .if \wd == 16
>          vand            d7,  d7,  d6            @ flat8out && flat8in && fm
>  .endif
> -        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
>  
>          vmul.s16        \tmpq2,  \tmpq2, \tmpq3 @ 3 * (q0 - p0)
>          vbic            \tmp1,   \tmp1,   d5    @ if (!hev) av_clip_int8 = 0

ok

Janne
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to