On 03/07/2012 03:35 PM, Reimar Döffinger wrote:

> Since the values are floats, using the float instructions
> makes sense: it improves performance on some CPUs and
> makes the code SSE-compatible instead of requiring SSE2.
> 
> Based on a suggestion by Jason.
> 
> Signed-off-by: Reimar Döffinger <[email protected]>
> ---
>  libavcodec/x86/sbrdsp.asm |   16 ++++++++--------
>  1 files changed, 8 insertions(+), 8 deletions(-)
> 
> diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> index c3b559b..31a1c8b 100644
> --- a/libavcodec/x86/sbrdsp.asm
> +++ b/libavcodec/x86/sbrdsp.asm
> @@ -82,14 +82,14 @@ cglobal sbr_hf_g_filt, 5, 6, 5
>      lea         r0, [r0 + r3*8]
>      neg         r3
>  .loop4:
> -    movq        m0, [r2 + 4*r3 + 0]
> -    movq        m1, [r2 + 4*r3 + 8]
> -    movq        m2, [r1 + 0*STEP]
> -    movq        m3, [r1 + 2*STEP]
> +    movlps      m0, [r2 + 4*r3 + 0]
> +    movlps      m1, [r2 + 4*r3 + 8]
> +    movlps      m2, [r1 + 0*STEP]
> +    movlps      m3, [r1 + 2*STEP]
>      movhps      m2, [r1 + 1*STEP]
>      movhps      m3, [r1 + 3*STEP]
> -    punpckldq   m0, m0
> -    punpckldq   m1, m1
> +    unpcklps    m0, m0
> +    unpcklps    m1, m1
>      mulps       m0, m2
>      mulps       m1, m3
>      movu        [r0 + 8*r3 +  0], m0
> @@ -101,8 +101,8 @@ cglobal sbr_hf_g_filt, 5, 6, 5
>      jz          .end
>  .loop1: ; element 0 and 1 can be computed at the same time
>      movss       m0, [r2]
> -    movq        m2, [r1]
> -    punpckldq   m0, m0
> +    movlps      m2, [r1]
> +    unpcklps    m0, m0
>      mulps       m2, m0
>      movlps    [r0], m2
>      add         r0, 8


LGTM.

-Justin

_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel

Reply via email to