On 2017-01-15 22:55:50 +0200, Martin Storsjö wrote:
> This adds lots of extra .ifs, but speeds it up by a couple of cycles
> by avoiding stalls.
> ---
>  libavcodec/arm/vp9lpf_neon.S | 8 ++++++--
>  1 file changed, 6 insertions(+), 2 deletions(-)
> 
> diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
> index 9be4cef..e31c807 100644
> --- a/libavcodec/arm/vp9lpf_neon.S
> +++ b/libavcodec/arm/vp9lpf_neon.S
> @@ -181,16 +181,20 @@
>  
>          vmovl.u8        q0,  d22                @ p1
>          vmovl.u8        q1,  d25                @ q1
> +.if \wd >= 8
> +        vmov            r2,  r3,  d6
> +.endif
>          vaddw.s8        q0,  q0,  \tmp3         @ p1 + f
>          vsubw.s8        q1,  q1,  \tmp3         @ q1 - f
> +.if \wd >= 8
> +        orrs            r2,  r2,  r3
> +.endif
>          vqmovun.s16     d0,  q0                 @ out p1
>          vqmovun.s16     d2,  q1                 @ out q1
>          vbit            d22, d0,  d5            @ if (!hev && fm && !flat8in)
>          vbit            d25, d2,  d5
>  
>  .if \wd >= 8
> -        vmov            r2,  r3,  d6
> -        orrs            r2,  r2,  r3
>          @ If no pixels need flat8in, jump to flat8out
>          @ (or to a writeout of the inner 4 pixels, for wd=8)
>          beq             6f

ok

Janne
_______________________________________________
libav-devel mailing list
libav-devel@libav.org
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to