At 2015-03-11 17:48:16,[email protected] wrote: ># HG changeset patch ># User Sumalatha Polureddy<[email protected]> ># Date 1426067286 -19800 ># Node ID bc9f8dbe8f655fe0b1a218bb6abb8fa88536c6dd ># Parent 01bfd365bf5f5317874b5c0315736ca76176f3df >asm: avx2 code for sad[64x64] for 8bpp > >SSE3 >sad[64x64] 33.47x 2584.70 86508.58 > >AVX2 >sad[64x64] 34.42x 2557.98 88038.73
Same speed as the SSE3 version? >diff -r 01bfd365bf5f -r bc9f8dbe8f65 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Wed Mar 11 14:24:10 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Wed Mar 11 15:18:06 2015 +0530 >@@ -1443,6 +1443,7 @@ > p.pu[LUMA_8x8].satd = x265_pixel_satd_8x8_avx2; > > p.pu[LUMA_32x32].sad = x265_pixel_sad_32x32_avx2; >+ p.pu[LUMA_64x64].sad = x265_pixel_sad_64x64_avx2; > > p.pu[LUMA_8x4].sad_x3 = x265_pixel_sad_x3_8x4_avx2; > p.pu[LUMA_8x8].sad_x3 = x265_pixel_sad_x3_8x8_avx2; >diff -r 01bfd365bf5f -r bc9f8dbe8f65 source/common/x86/sad-a.asm >--- a/source/common/x86/sad-a.asm Wed Mar 11 14:24:10 2015 +0530 >+++ b/source/common/x86/sad-a.asm Wed Mar 11 15:18:06 2015 +0530 >@@ -3924,4 +3924,44 @@ > movd eax, xm0 > RET > >+INIT_YMM avx2 >+cglobal pixel_sad_64x64, 4,4,5 >+ xorps m0, m0 >+ >+%assign x 0 >+%rep 32 Could you compare this against the performance of a loop version? >+ movu m1, [r0] ; first 32 of row 0 of pix0 >+ movu m2, [r2] ; first 32 of row 0 of pix1 >+ movu m3, [r0 + 32] ; second 32 of row 0 of pix0 >+ movu m4, [r2 + 32] ; second 32 of row 0 of pix1 >+ >+ psadbw m1, m2 >+ psadbw m3, m4 >+ paddd m0, m1 >+ paddd m0, m3 Here we have more free registers, so summing in parallel is better, e.g. m0 += m1, m2 += m3. >+ >+ movu m1, [r0 + r1] ; first 32 of row 1 of pix0 >+ movu m2, [r2 + r1] ; first 32 of row 1 of pix1 >+ movu m3, [r0 + 32 + r1] ; second 32 of row 1 of pix0 >+ movu m4, [r2 + 32 + r1] ; second 32 of row 1 of pix1 >+ >+ psadbw m1, m2 >+ psadbw m3, m4 >+ paddd m0, m1 >+ paddd m0, m3 >+ >+%assign x x+1 >+ %if x < 32 >+ lea r2, [r2 + 2 * r3] >+ lea r0, [r0 + 2 * r1] >+ %endif >+%endrep >+ >+ vextracti128 xm1, m0, 1 >+ paddd xm0, xm1 >+ pshufd xm1, xm0, 2 >+ paddd xm0,xm1 >+ movd eax, xm0 >+ RET >+ > %endif >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
