[x265] [PATCH] removed copy_cnt_4 avx2 asm code: SSE version is equally fast
# HG changeset patch # User Praveen Tiwari # Date 1410433904 -19800 # Node ID 5740ec22db67267bfca97fbba07ef9239802d2b0 # Parent 012f315d3eda8044f5a49865e15ba2943fbab094 removed copy_cnt_4 avx2 asm code: SSE version is equally fast diff -r 012f315d3eda -r 5740ec22db67 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Sep 10 17:27:20 2014 +0200 +++ b/source/common/x86/asm-primitives.cpp Thu Sep 11 16:41:44 2014 +0530 @@ -1730,7 +1730,6 @@ /* Need to update assembly code as per changed interface of the copy_cnt primitive, once * code is updated, avx2 version will be enabled */ -// p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_avx2; p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2; // p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2; // p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2; diff -r 012f315d3eda -r 5740ec22db67 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Wed Sep 10 17:27:20 2014 +0200 +++ b/source/common/x86/blockcopy8.asm Thu Sep 11 16:41:44 2014 +0530 @@ -3987,35 +3987,6 @@ %endif RET - -INIT_YMM avx2 -cglobal copy_cnt_4, 3,3,3 -add r2d, r2d -xorpd xm2, xm2 - -; row 0 1 -movq xm0, [r1] -movhps xm0, [r1 + r2] - -; row 2 3 -movq xm1, [r1 + r2 * 2] -lea r2, [r2 * 3] -movhps xm1, [r1 + r2] - -vinserti128 m0, m0, xm1, 1 -movu [r0], m0 - -vextractf128 xm1, m0, 1 -packsswb xm0, xm1 -pcmpeqb xm0, xm2 - -; get count -pmovmskb eax, xm0 -not ax -popcnt ax, ax -RET - - ;-- ; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride); ;-- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] removed copy_cnt_4 avx2 asm code: SSE version is equally fast
Please ignore this patch; I need to correct the commit message. Regards, Praveen Tiwari On Thu, Sep 11, 2014 at 4:41 PM, prav...@multicorewareinc.com wrote: # HG changeset patch # User Praveen Tiwari # Date 1410433904 -19800 # Node ID 5740ec22db67267bfca97fbba07ef9239802d2b0 # Parent 012f315d3eda8044f5a49865e15ba2943fbab094 removed copy_cnt_4 avx2 asm code: SSE version is equally fast diff -r 012f315d3eda -r 5740ec22db67 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Sep 10 17:27:20 2014 +0200 +++ b/source/common/x86/asm-primitives.cpp Thu Sep 11 16:41:44 2014 +0530 @@ -1730,7 +1730,6 @@ /* Need to update assembly code as per changed interface of the copy_cnt primitive, once * code is updated, avx2 version will be enabled */ -// p.copy_cnt[BLOCK_4x4] = x265_copy_cnt_4_avx2; p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2; // p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2; // p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2; diff -r 012f315d3eda -r 5740ec22db67 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Wed Sep 10 17:27:20 2014 +0200 +++ b/source/common/x86/blockcopy8.asm Thu Sep 11 16:41:44 2014 +0530 @@ -3987,35 +3987,6 @@ %endif RET - -INIT_YMM avx2 -cglobal copy_cnt_4, 3,3,3 -add r2d, r2d -xorpd xm2, xm2 - -; row 0 1 -movq xm0, [r1] -movhps xm0, [r1 + r2] - -; row 2 3 -movq xm1, [r1 + r2 * 2] -lea r2, [r2 * 3] -movhps xm1, [r1 + r2] - -vinserti128 m0, m0, xm1, 1 -movu [r0], m0 - -vextractf128 xm1, m0, 1 -packsswb xm0, xm1 -pcmpeqb xm0, xm2 - -; get count -pmovmskb eax, xm0 -not ax -popcnt ax, ax -RET - - ;-- ; uint32_t copy_cnt(int16_t *dst, int16_t *src, intptr_t stride); ;-- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel