On 2017-01-15 22:55:47 +0200, Martin Storsjö wrote: > Previously we first calculated hev, and then negated it. > > Since we were able to schedule the negation in the middle > of another calculation, we don't see a gain in every case. > > Before: Cortex A7 A8 A9 A53 A53/AArch64 > vp9_loop_filter_v_4_8_neon: 147.0 129.0 115.8 89.0 88.7 > vp9_loop_filter_v_8_8_neon: 242.0 198.5 174.7 140.0 136.7 > vp9_loop_filter_v_16_8_neon: 500.0 419.5 382.7 293.0 275.7 > vp9_loop_filter_v_16_16_neon: 971.2 825.5 731.5 579.0 453.0 > After: > vp9_loop_filter_v_4_8_neon: 143.0 127.7 114.8 88.0 87.7 > vp9_loop_filter_v_8_8_neon: 241.0 197.2 173.7 140.0 136.7 > vp9_loop_filter_v_16_8_neon: 497.0 419.5 379.7 293.0 275.7 > vp9_loop_filter_v_16_16_neon: 965.2 818.7 731.4 579.0 452.0 > --- > libavcodec/aarch64/vp9lpf_neon.S | 5 ++--- > libavcodec/arm/vp9lpf_neon.S | 5 ++--- > 2 files changed, 4 insertions(+), 6 deletions(-) > > diff --git a/libavcodec/aarch64/vp9lpf_neon.S > b/libavcodec/aarch64/vp9lpf_neon.S > index e9c7d9e..3b8e6eb 100644 > --- a/libavcodec/aarch64/vp9lpf_neon.S > +++ b/libavcodec/aarch64/vp9lpf_neon.S > @@ -292,7 +292,7 @@ > .if \mix != 0 > sxtl v1.8h, v1.8b > .endif > - cmhi v5\sz, v5\sz, v3\sz // hev > + cmhs v5\sz, v3\sz, v5\sz // !hev > .if \wd == 8 > // If a 4/8 or 8/4 mix is used, clear the relevant half of v6 > .if \mix != 0 > @@ -306,11 +306,10 @@ > .elseif \wd == 8 > bic v4\sz, v4\sz, v6\sz // fm && !flat8in > .endif > - mvn v5\sz, v5\sz // !hev > + and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in > .if \wd == 16 > and v7\sz, v7\sz, v6\sz // flat8out && flat8in && fm > .endif > - and v5\sz, v5\sz, v4\sz // !hev && fm && !flat8in > > mul_sz \tmp3\().8h, \tmp4\().8h, \tmp3\().8h, > \tmp4\().8h, \tmp5\().8h, \tmp5\().8h, \sz // 3 * (q0 - p0) > bic \tmp1\sz, \tmp1\sz, v5\sz // if (!hev) > av_clip_int8 = 0 > diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S > index fbf2901..c57c0e9 100644 > --- a/libavcodec/arm/vp9lpf_neon.S > +++ 
b/libavcodec/arm/vp9lpf_neon.S > @@ -141,7 +141,7 @@ > .if \wd == 8 > vcle.u8 d6, d6, d0 @ flat8in > .endif > - vcgt.u8 d5, d5, d3 @ hev > + vcle.u8 d5, d5, d3 @ !hev > .if \wd == 8 > vand d6, d6, d4 @ flat8in && fm > .endif > @@ -151,11 +151,10 @@ > .elseif \wd == 8 > vbic d4, d4, d6 @ fm && !flat8in > .endif > - vmvn d5, d5 @ !hev > + vand d5, d5, d4 @ !hev && fm && !flat8in > .if \wd == 16 > vand d7, d7, d6 @ flat8out && flat8in && fm > .endif > - vand d5, d5, d4 @ !hev && fm && !flat8in > > vmul.s16 \tmpq2, \tmpq2, \tmpq3 @ 3 * (q0 - p0) > vbic \tmp1, \tmp1, d5 @ if (!hev) av_clip_int8 = 0
ok Janne _______________________________________________ libav-devel mailing list libav-devel@libav.org https://lists.libav.org/mailman/listinfo/libav-devel
