On 2017-01-15 22:55:47 +0200, Martin Storsjö wrote:
> Previously we first calculated hev, and then negated it.
>
> Since we were already able to schedule the negation in the middle
> of another calculation, we don't see a gain in every case.
>
> Before:                        Cortex A7      A8      A9     A53  A53/AArch64
> vp9_loop_filter_v_4_8_neon:        147.0   129.0   115.8    89.0         88.7
> vp9_loop_filter_v_8_8_neon:        242.0   198.5   174.7   140.0        136.7
> vp9_loop_filter_v_16_8_neon:       500.0   419.5   382.7   293.0        275.7
> vp9_loop_filter_v_16_16_neon:      971.2   825.5   731.5   579.0        453.0
> After:
> vp9_loop_filter_v_4_8_neon:        143.0   127.7   114.8    88.0         87.7
> vp9_loop_filter_v_8_8_neon:        241.0   197.2   173.7   140.0        136.7
> vp9_loop_filter_v_16_8_neon:       497.0   419.5   379.7   293.0        275.7
> vp9_loop_filter_v_16_16_neon:      965.2   818.7   731.4   579.0        452.0
> ---
> libavcodec/aarch64/vp9lpf_neon.S | 5 ++---
> libavcodec/arm/vp9lpf_neon.S | 5 ++---
> 2 files changed, 4 insertions(+), 6 deletions(-)
>
> diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
> index e9c7d9e..3b8e6eb 100644
> --- a/libavcodec/aarch64/vp9lpf_neon.S
> +++ b/libavcodec/aarch64/vp9lpf_neon.S
> @@ -292,7 +292,7 @@
> .if \mix != 0
> sxtl v1.8h, v1.8b
> .endif
> -cmhi v5\sz, v5\sz, v3\sz // hev
> +cmhs v5\sz, v3\sz, v5\sz // !hev
> .if \wd == 8
> // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
> .if \mix != 0
> @@ -306,11 +306,10 @@
> .elseif \wd == 8
> bic v4\sz, v4\sz, v6\sz // fm && !flat8in
> .endif
> -mvn v5\sz, v5\sz // !hev
> +and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in
> .if \wd == 16
> and v7\sz, v7\sz, v6\sz // flat8out && flat8in && fm
> .endif
> -and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in
>
> mul_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0)
> bic \tmp1\sz, \tmp1\sz, v5\sz // if (!hev) av_clip_int8 = 0
> diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
> index fbf2901..c57c0e9 100644
> --- a/libavcodec/arm/vp9lpf_neon.S
> +++ b/libavcodec/arm/vp9lpf_neon.S
> @@ -141,7 +141,7 @@
> .if \wd == 8
> vcle.u8 d6, d6, d0 @ flat8in
> .endif
> -vcgt.u8 d5, d5, d3 @ hev
> +vcle.u8 d5, d5, d3 @ !hev
> .if \wd == 8
> vand d6, d6, d4 @ flat8in && fm
> .endif
> @@ -151,11 +151,10 @@
> .elseif \wd == 8
> vbic d4, d4, d6 @ fm && !flat8in
> .endif
> -vmvn d5, d5 @ !hev
> +vand d5, d5, d4 @ !hev && fm && !flat8in
> .if \wd == 16
> vand d7, d7, d6 @ flat8out && flat8in && fm
> .endif
> -vand d5, d5, d4 @ !hev && fm && !flat8in
>
> vmul.s16 \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0)
> vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
ok
Janne
___
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel