On 03/07/2012 03:35 PM, Reimar Döffinger wrote:
> Since the values are floats, using the float operations
> makes sense, improves performance on some CPUs and
> makes the code SSE compatible instead of needing SSE2.
>
> Based on suggestion by Jason.
>
> Signed-off-by: Reimar Döffinger <[email protected]>
> ---
>  libavcodec/x86/sbrdsp.asm | 16 ++++++++--------
>  1 files changed, 8 insertions(+), 8 deletions(-)
>
> diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
> index c3b559b..31a1c8b 100644
> --- a/libavcodec/x86/sbrdsp.asm
> +++ b/libavcodec/x86/sbrdsp.asm
> @@ -82,14 +82,14 @@ cglobal sbr_hf_g_filt, 5, 6, 5
>      lea r0, [r0 + r3*8]
>      neg r3
>  .loop4:
> -    movq m0, [r2 + 4*r3 + 0]
> -    movq m1, [r2 + 4*r3 + 8]
> -    movq m2, [r1 + 0*STEP]
> -    movq m3, [r1 + 2*STEP]
> +    movlps m0, [r2 + 4*r3 + 0]
> +    movlps m1, [r2 + 4*r3 + 8]
> +    movlps m2, [r1 + 0*STEP]
> +    movlps m3, [r1 + 2*STEP]
>      movhps m2, [r1 + 1*STEP]
>      movhps m3, [r1 + 3*STEP]
> -    punpckldq m0, m0
> -    punpckldq m1, m1
> +    unpcklps m0, m0
> +    unpcklps m1, m1
>      mulps m0, m2
>      mulps m1, m3
>      movu [r0 + 8*r3 + 0], m0
> @@ -101,8 +101,8 @@ cglobal sbr_hf_g_filt, 5, 6, 5
>      jz .end
>  .loop1: ; element 0 and 1 can be computed at the same time
>      movss m0, [r2]
> -    movq m2, [r1]
> -    punpckldq m0, m0
> +    movlps m2, [r1]
> +    unpcklps m0, m0
>      mulps m2, m0
>      movlps [r0], m2
>      add r0, 8

LGTM.

-Justin
_______________________________________________
libav-devel mailing list
[email protected]
https://lists.libav.org/mailman/listinfo/libav-devel
