# HG changeset patch
# User Vignesh Vijayakumar
# Date 1499410957 -19800
#      Fri Jul 07 12:32:37 2017 +0530
# Node ID cc3a93869b28b7d5b3478a2524d07e7e630a0eca
# Parent  7283818f2dd7191c8258030c7424fa6b4ed5330f
x86: AVX512 addAvg_W64

Size  | AVX2 performance | AVX512 performance
--------------------------------------------------
64x16 | 14.46x | 22.25x
64x32 | 13.93x | 23.96x
64x48 | 13.90x | 24.27x
64x64 | 14.74x | 24.31x

diff -r 7283818f2dd7 -r cc3a93869b28 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Jul 06 17:32:24 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jul 07 12:32:37 2017 +0530
@@ -3785,6 +3785,11 @@
         p.scale1D_128to64 = PFX(scale1D_128to64_avx512);
+        p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512);
+        p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
+        p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
+        p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
+
+
     }
 #endif
 }
diff -r 7283818f2dd7 -r cc3a93869b28 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Thu Jul 06 17:32:24 2017 +0530
+++ b/source/common/x86/mc-a.asm Fri Jul 07 12:32:37 2017 +0530
@@ -2892,6 +2892,65 @@
 ADDAVG_W64_H2_AVX2 48
 ADDAVG_W64_H2_AVX2 64
 
+%macro ADDAVG_W64_H2_AVX512 1                   ; %1 = block height; emits addAvg_64x%1 built on ZMM registers
+INIT_ZMM avx512
+cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride ; NOTE(review): seven names follow the 6,7,6 counts and "src1tride" looks like a typo; the body only uses r0-r6, so the names appear unused here
+    vbroadcasti32x8 m4, [pw_256]                ; replicate the 256-bit constant at pw_256 into both halves of m4 (pmulhrsw multiplier)
+    vbroadcasti32x8 m5, [pw_128]                ; replicate the 256-bit constant at pw_128 into both halves of m5 (added after the multiply)
+    add             r3, r3                      ; double the row step used with r0 (presumably 16-bit element stride -> bytes; confirm against caller)
+    add             r4, r4                      ; double the row step used with r1 (same assumption)
+    mov             r6d, %1/16                  ; loop count: each pass covers 16 rows (%rep 8 x 2 rows)
+
+.loop:
+%rep 8                                          ; unrolled 8 times; every repetition handles two consecutive rows
+    movu            m0, [r0]                    ; row n: first 32 words from the r0 source
+    movu            m1, [r1]                    ; row n: first 32 words from the r1 source
+    movu            m2, [r0 + 64]               ; row n: second 32 words from the r0 source
+    movu            m3, [r1 + 64]               ; row n: second 32 words from the r1 source
+    paddw           m0, m1                      ; word-wise sum of the two sources (first half)
+    pmulhrsw        m0, m4                      ; rounded high multiply by m4; if pw_256 holds words of 256 this is (x + 64) >> 7 - TODO confirm constant
+    paddw           m0, m5                      ; add the pw_128 constant to every word
+    paddw           m2, m3                      ; same three steps for the second half of the row
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    packuswb        m0, m2                      ; saturate words to unsigned bytes; packs per 128-bit lane, so m0/m2 data is interleaved lane by lane
+    vpermq          m0, m0, 11011000b           ; inside each 256-bit half reorder qwords to 0,2,1,3
+    vshufi64x2      m0, m0, 11011000b           ; then reorder 128-bit lanes to 0,2,1,3, which puts the 64 bytes back in linear order
+    movu            [r2], m0                    ; store 64 output bytes for row n
+
+
+    movu            m0, [r0 + r3]               ; row n+1: identical sequence, addressed one row step further
+    movu            m1, [r1 + r4]
+    movu            m2, [r0 + r3 + 64]
+    movu            m3, [r1 + r4 + 64]
+    paddw           m0, m1
+    pmulhrsw        m0, m4
+    paddw           m0, m5
+    paddw           m2, m3
+    pmulhrsw        m2, m4
+    paddw           m2, m5
+
+    packuswb        m0, m2
+    vpermq          m0, m0, 11011000b
+    vshufi64x2      m0, m0, 11011000b
+    movu            [r2 + r5], m0               ; store 64 output bytes for row n+1 (r5 is the destination row step, not doubled)
+
+    lea             r2, [r2 + 2 * r5]           ; advance destination pointer by two rows
+    lea             r0, [r0 + 2 * r3]           ; advance first source pointer by two rows
+    lea             r1, [r1 + 2 * r4]           ; advance second source pointer by two rows
+%endrep
+
+    dec             r6d                         ; one 16-row group finished
+    jnz             .loop
+    RET
+%endmacro
+
+ADDAVG_W64_H2_AVX512 16                         ; instantiates addAvg_64x16
+ADDAVG_W64_H2_AVX512 32                         ; instantiates addAvg_64x32
+ADDAVG_W64_H2_AVX512 48                         ; instantiates addAvg_64x48
+ADDAVG_W64_H2_AVX512 64                         ; instantiates addAvg_64x64
+
 %macro ADDAVG_W48_H2_AVX2 1
 INIT_YMM avx2
 cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel