right now
At 2015-10-14 17:28:57,"Ramya Sriraman" <[email protected]> wrote: # HG changeset patch # User Ramya Sriraman <[email protected]> # Date 1443592336 -19800 # Wed Sep 30 11:22:16 2015 +0530 # Node ID 4f3b58b4db8d6f10ec849ad9f2ab9be3cf12649a # Parent b6156a08b1def3584647f26096866c1a0c11e54a asm: Add sse_ss for [16x16],[32x32] & [64x64] for 8bpp avx2 diff -r b6156a08b1de -r 4f3b58b4db8d source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cppFri Oct 09 20:45:59 2015 +0530 +++ b/source/common/x86/asm-primitives.cppWed Sep 30 11:22:16 2015 +0530 @@ -2667,6 +2667,10 @@ #if X86_64 if (cpuMask & X265_CPU_AVX2) { + p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx2); + p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx2); + p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx2); + p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2); p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2); p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2); diff -r b6156a08b1de -r 4f3b58b4db8d source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asmFri Oct 09 20:45:59 2015 +0530 +++ b/source/common/x86/ssd-a.asmWed Sep 30 11:22:16 2015 +0530 @@ -1016,6 +1016,172 @@ SSD_SS_32xN SSD_SS_48 SSD_SS_64xN + +INIT_YMM avx2 +cglobal pixel_ssd_ss_16x16, 4,6,4 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + pxor m3, m3 + lea r4, [3 * r1] + lea r5, [3 * r3] + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + r4] + psubw m0, [r2 + 2 * r3] + psubw m1, [r2 + r5] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + r4] + psubw m0, [r2 + 2 * r3] + psubw m1, [r2 + 
r5] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + r4] + psubw m0, [r2 + 2 * r3] + psubw m1, [r2 + r5] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + r4] + psubw m0, [r2 + 2 * r3] + psubw m1, [r2 + r5] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + paddd m2, m3 + HADDD m2, m0 + movd eax, xm2 + RET + +INIT_YMM avx2 +cglobal pixel_ssd_ss_32x32, 4,5,4 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + pxor m3, m3 + mov r4d, 16 +.loop: + movu m0, [r0] + movu m1, [r0 + mmsize] + psubw m0, [r2] + psubw m1, [r2 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + movu m0, [r0 + r1] + movu m1, [r0 + r1 + mmsize] + psubw m0, [r2 + r3] + psubw m1, [r2 + r3 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + dec r4d + jne .loop + + paddd m2, m3 + HADDD m2, m0 + movd eax, xm2 + RET + +INIT_YMM avx2 +cglobal pixel_ssd_ss_64x64, 4,5,4 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + pxor m3, m3 + mov r4d,64 +.loop: + movu m0, [r0] + movu m1, [r0 + mmsize] + psubw m0, [r2] + psubw m1, [r2 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + movu m0, [r0 + 2 * mmsize] + movu m1, [r0 + 3 * mmsize] + psubw m0, [r2 + 2 * mmsize] + psubw m1, [r2 + 3 * mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m3, m1 + + add r0, r1 + add r2, r3 + + dec r4d + jne .loop + + paddd m2, m3 + HADDD m2, m0 + movd eax, xm2 + RET + %endif ; !HIGH_BIT_DEPTH %if 
HIGH_BIT_DEPTH == 0 Thank you Regards Ramya On Mon, Oct 12, 2015 at 7:52 PM, chen <[email protected]> wrote: At 2015-10-12 14:23:41,"Ramya Sriraman" <[email protected]> wrote: # HG changeset patch # User Ramya Sriraman <[email protected]> # Date 1443592336 -19800 # Wed Sep 30 11:22:16 2015 +0530 # Node ID ca5321eb84ef8a0e16f18a2774d3f5a299d7f997 # Parent b6156a08b1def3584647f26096866c1a0c11e54a asm: Add sse_ss for [16x16],[32x32] & [64x64] for 8bpp avx2 diff -r b6156a08b1de -r ca5321eb84ef source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cppFri Oct 09 20:45:59 2015 +0530 +++ b/source/common/x86/asm-primitives.cppWed Sep 30 11:22:16 2015 +0530 @@ -2667,6 +2667,10 @@ #if X86_64 if (cpuMask & X265_CPU_AVX2) { + p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx2); + p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx2); + p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx2); + p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2); p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2); p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2); diff -r b6156a08b1de -r ca5321eb84ef source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asmFri Oct 09 20:45:59 2015 +0530 +++ b/source/common/x86/ssd-a.asmWed Sep 30 11:22:16 2015 +0530 @@ -1016,8 +1016,166 @@ SSD_SS_32xN SSD_SS_48 SSD_SS_64xN + +INIT_YMM avx2 +cglobal pixel_ssd_ss_16x16, 4,6,4 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + lea r4, [3 * r1] + lea r5, [3 * r3] + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m3, m1, m0 + paddd m2, m3 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + r4] + psubw m0, [r2 + 2 * r3] + psubw m1, [r2 + r5] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m3, m1, m0 + paddd m2, m3 paddd m2, m0 paddd m3, m1 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + pmaddwd m0, m0 + pmaddwd 
m1, m1 + paddd m3, m1, m0 + paddd m2, m3 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + r4] + psubw m0, [r2 + 2 * r3] + psubw m1, [r2 + r5] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m3, m1, m0 + paddd m2, m3 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m3, m1, m0 + paddd m2, m3 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + r4] + psubw m0, [r2 + 2 * r3] + psubw m1, [r2 + r5] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m3, m1, m0 + paddd m2, m3 + + lea r0, [r0 + 4 * r1] + lea r2, [r2 + 4 * r3] + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m3, m1, m0 + paddd m2, m3 + + movu m0, [r0 + 2 * r1] + movu m1, [r0 + r4] + psubw m0, [r2 + 2 * r3] + psubw m1, [r2 + r5] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m3, m1, m0 + paddd m2, m3 + + HADDD m2,m0 + movd eax, xm2 + RET + +INIT_YMM avx2 +cglobal pixel_ssd_ss_32x32, 4,5,3 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + mov r4d, 16 +.loop: + movu m0, [r0] + movu m1, [r0 + mmsize] + psubw m0, [r2] + psubw m1, [r2 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m2, m1 + movu m0, [r0 + r1] + movu m1, [r0 + r1 + mmsize] + psubw m0, [r2 + r3] + psubw m1, [r2 + r3 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m2, m1 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + dec r4d + jne .loop + + HADDD m2,m0 + movd eax, xm2 + RET need empty line in between two functions +INIT_YMM avx2 +cglobal pixel_ssd_ss_64x64, 4,5,3 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + mov r4d,64 +.loop: + movu m0, [r0] + movu m1, [r0 + mmsize] + psubw m0, [r2] + psubw m1, [r2 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m2, m1 + movu m0, [r0 + 2 * mmsize] + movu m1, [r0 + 3 * mmsize] + psubw m0, [r2 + 2 * mmsize] + psubw m1, [r2 + 3 * mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m2, m1 paddd 
m2/m3
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
