# HG changeset patch
# User Praveen Tiwari
# Date 1410328371 -19800
# Node ID d29cb300975a491287abdfb6abd2a9d3141e99f0
# Parent 408e2e6f0f709525cedb784a65386a116f2d3d00
copy_cnt_8, AVX2 asm code as per new interface, performance improved from 5.13x to 7.59x on HASWELL-I5
diff -r 408e2e6f0f70 -r d29cb300975a source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Sep 09 22:23:26 2014 +0200 +++ b/source/common/x86/blockcopy8.asm Wed Sep 10 11:22:51 2014 +0530 @@ -4079,85 +4079,44 @@ INIT_YMM avx2 -%if ARCH_X86_64 == 1 -cglobal copy_cnt_8, 3,4,6 - %define tmpd eax -%else -cglobal copy_cnt_8, 3,5,6 - %define tmpd r4d -%endif +cglobal copy_cnt_8, 3,4,5 add r2d, r2d - pxor m4, m4 lea r3, [r2 * 3] - ; row 0 + ; row 0 - 1 movu xm0, [r1] - mova xm2, xm0 - pmovsxwd m1, xm0 - movu [r0 + 0 * mmsize], m1 - - ; row 1 - movu xm0, [r1 + r2] - vinserti128 m2, m2, xm0, 1 - pmovsxwd m1, xm0 - movu [r0 + 1 * mmsize], m1 - - ; row 2 - movu xm0, [r1 + r2 * 2] - mova xm5, xm0 - pmovsxwd m1, xm0 - movu [r0 + 2 * mmsize], m1 - - ; row 3 - movu xm0, [r1 + r3] - vinserti128 m5, m5, xm0, 1 - packsswb m2, m5 - pcmpeqb m2, m4 - pmovmskb tmpd, m2 - not tmpd - popcnt tmpd, tmpd - pmovsxwd m1, xm0 - movu [r0 + 3 * mmsize], m1 - - add r0, 4 * mmsize - lea r1, [r1 + r2 * 4] - - ; row 4 - movu xm0, [r1] - mova xm2, xm0 - pmovsxwd m1, xm0 - movu [r0 + 0 * mmsize], m1 - - ; row 5 - movu xm0, [r1 + r2] - vinserti128 m2, m2, xm0, 1 - pmovsxwd m1, xm0 - movu [r0 + 1 * mmsize], m1 - - ; row 6 - movu xm0, [r1 + r2 * 2] - mova xm5, xm0 - pmovsxwd m1, xm0 - movu [r0 + 2 * mmsize], m1 - - ; row 7 - movu xm0, [r1 + r3] - pmovsxwd m1, xm0 - movu [r0 + 3 * mmsize], m1 - vinserti128 m5, m5, xm0, 1 + vinserti128 m0, m0, [r1 + r2], 1 + movu [r0], m0 + + ; row 2 - 3 + movu xm1, [r1 + r2 * 2] + vinserti128 m1, m1, [r1 + r3], 1 + movu [r0 + 32], m1 + lea r1, [r1 + r2 * 4] + + ; row 4 - 5 + movu xm2, [r1] + vinserti128 m2, m2, [r1 + r2], 1 + movu [r0 + 64], m2 + + ; row 6 - 7 + movu xm3, [r1 + r2 * 2] + vinserti128 m3, m3, [r1 + r3], 1 + movu [r0 + 96], m3 ; get count - packsswb m2, m5 - pcmpeqb m2, m4 - pmovmskb r0d, m2 - not r0d - popcnt r0d, r0d - -%if ARCH_X86_64 == 1 - add tmpd, r0d -%else - add r0d, tmpd -%endif + xorpd m4, m4 + vpacksswb m0, m1 + vpacksswb m2, 
m3 + pminub m0, [pb_1] + pminub m2, [pb_1] + paddb m0, m2 + vextracti128 xm1, m0, 1 + paddb xm0, xm1 + psadbw xm0, xm4 + movhlps xm1, xm0 + paddd xm0, xm1 + movd eax, xm0 RET _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel