reg0: i0 j0 i1 j1...
reg1: k0 l0 k1 l1...
(reg0 ^ reg1) = (i ^ k) (j ^ l)...
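so we can reduce the number of load operations (each row only needs to be
loaded once, instead of once at [r1] and again at [r1 + 1]).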
Of course, this code is correct. We have already spent several days on it, so
keep this implementation as it is and move on to the other functions.
At 2013-11-18 19:20:04,[email protected] wrote:
># HG changeset patch ># User Murugan Vairavel <[email protected]>
>># Date 1384773570 -19800 ># Mon Nov 18 16:49:30 2013 +0530 ># Node ID
>c355ba4b6711bfad87ff37d650a8f1946f878eec ># Parent
>2321ebe0bf64e5f3c0034076c7edb3ecbcd48039 >asm: code for scale2D_64to32
>routine> >diff -r 2321ebe0bf64 -r c355ba4b6711
>source/common/x86/asm-primitives.cpp >---
>a/source/common/x86/asm-primitives.cpp Mon Nov 18 11:32:06 2013 +0530 >+++
>b/source/common/x86/asm-primitives.cpp Mon Nov 18 16:49:30 2013 +0530 >@@
>-530,6 +530,7 @@ > PIXEL_AVG_W4(ssse3); > > p.scale1D_128to64
>= x265_scale1D_128to64_ssse3; >+ p.scale2D_64to32 =
>x265_scale2D_64to32_ssse3; > > p.sad_x4[LUMA_8x4] =
>x265_pixel_sad_x4_8x4_ssse3; > p.sad_x4[LUMA_8x8] =
>x265_pixel_sad_x4_8x8_ssse3; >diff -r 2321ebe0bf64 -r c355ba4b6711
>source/common/x86/pixel-a.asm >--- a/source/common/x86/pixel-a.asm Mon Nov 18
>11:32:06 2013 +0530 >+++ b/source/common/x86/pixel-a.asm Mon Nov 18 16:49:30
>2013 +0530 >@@ -8230,3 +8230,113 @@ > movu [r0 + 48], m4 > >
>RET>+ >+;----------------------------------------------------------------- >+;
>void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride)
>>+;-----------------------------------------------------------------
>>+INIT_XMM ssse3 >+cglobal scale2D_64to32, 3, 4, 8, dest, src, stride >+ >+
>mova m7, [deinterleave_shuf] >+ mov r3d, 32
>>+.loop>+ >+ movu m0, [r1] ;i >+ movu
> m1, [r1 + 1] ;j >+ movu m2, [r1 + r2]
> ;k >+ movu m3, [r1 + r2 + 1] ;l >+ movu
>m4, m0 >+ movu m5, m2 >+ >+ pxor m4, m1
> ;i^j >+ pxor m5, m3 ;k^l >+
> por m4, m5 ;ij|kl >+ >+ pavgb m0,
> m1 ;s >+ pavgb m2, m3 ;t
>>+ movu m5, m0 >+ pavgb m0, m2
>;(s+t+1)/2 >+ pxor m5, m2 ;s^t >+ pand
> m4, m5 ;(ij|kl)&st >+ pand m4,
>[hmul_16p] >+ psubb m0, m4 ;Result >+ >+
>movu m1, [r1 + 16] ;i >+ movu m2, [r1 +
>16 + 1] ;j >+ movu m3, [r1 + r2 + 16] ;k >+
>movu m4, [r1 + r2 + 16 + 1] ;l >+ movu m5, m1 >+
> movu m6, m3 >+ >+ pxor m5, m2
>;i^j >+ pxor m6, m4 ;k^l >+ por
>m5, m6 ;ij|kl >+ >+ pavgb m1, m2
> ;s >+ pavgb m3, m4 ;t >+ movu
> m6, m1 >+ pavgb m1, m3 ;(s+t+1)/2
>>+ pxor m6, m3 ;s^t >+ pand m5,
> m6 ;(ij|kl)&st >+ pand m5, [hmul_16p] >+
> psubb m1, m5 ;Result >+ >+ pshufb m0,
> m0, m7 >+ pshufb m1, m1, m7 >+ >+ punpcklqdq m0,
> m1 >+ movu [r0], m0 >+ >+ movu m0,
>[r1 + 32] ;i >+ movu m1, [r1 + 32 + 1] ;j
>>+ movu m2, [r1 + r2 + 32] ;k >+ movu m3,
>[r1 + r2 + 32 + 1] ;l >+ movu m4, m0 >+ movu m5,
> m2 >+ >+ pxor m4, m1 ;i^j >+ pxor
> m5, m3 ;k^l >+ por m4, m5
> ;ij|kl >+ >+ pavgb m0, m1 ;s >+
>pavgb m2, m3 ;t >+ movu m5, m0 >+
> pavgb m0, m2 ;(s+t+1)/2 >+ pxor m5,
> m2 ;s^t >+ pand m4, m5
> ;(ij|kl)&st >+ pand m4, [hmul_16p] >+ psubb m0,
>m4 ;Result >+ >+ movu m1, [r1 + 48]
> ;i >+ movu m2, [r1 + 48 + 1] ;j >+ movu
>m3, [r1 + r2 + 48] ;k >+ movu m4, [r1 + r2 + 48 +
>1] ;l >+ movu m5, m1 >+ movu m6, m3 >+ >+
>pxor m5, m2 ;i^j >+ pxor m6, m4
> ;k^l >+ por m5, m6 ;ij|kl
>>+ >+ pavgb m1, m2 ;s >+ pavgb m3,
> m4 ;t >+ movu m6, m1 >+ pavgb
>m1, m3 ;(s+t+1)/2 >+ pxor m6, m3
> ;s^t >+ pand m5, m6 ;(ij|kl)&st
>>+ pand m5, [hmul_16p] >+ psubb m1, m5
> ;Result >+ >+ pshufb m0, m0, m7 >+ pshufb m1,
> m1, m7 >+ >+ punpcklqdq m0, m1 >+ movu [r0
>+ 16], m0 >+ >+ lea r0, [r0 + 32] >+ lea r1, [r1 + 2 *
>r2] >+ dec r3d>+ >+ jnz .loop>+>+RET >diff -r 2321ebe0bf64 -r
>c355ba4b6711 source/common/x86/pixel.h >--- a/source/common/x86/pixel.h Mon
>Nov 18 11:32:06 2013 +0530 >+++ b/source/common/x86/pixel.h Mon Nov 18
>16:49:30 2013 +0530 >@@ -117,6 +117,7 @@ > int
>x265_pixel_satd_16x32_sse2(pixel *, intptr_t, pixel *, intptr_t); > int
>x265_pixel_satd_16x64_sse2(pixel *, intptr_t, pixel *, intptr_t); > void
>x265_scale1D_128to64_ssse3(pixel *, pixel *, intptr_t); >+void
>x265_scale2D_64to32_ssse3(pixel *, pixel *, intptr_t); > >
>DECL_PIXELS(uint64_t, var, mmx2, (pixel * pix, intptr_t i_stride)) >
>DECL_PIXELS(uint64_t, var, sse2, (pixel * pix, intptr_t i_stride))
>>_______________________________________________ >x265-devel mailing
>list>[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel