# HG changeset patch
# User Vignesh Vijayakumar
# Date 1503385834 -19800
#      Tue Aug 22 12:40:34 2017 +0530
# Node ID 738f07186eb1d4bca84e9acdf70921ee9e2fee92
# Parent  ed1932a414bf5962bbeccfd5c9e208b7db90f77f
x86: AVX512 addAvg_32xN
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
32x8  |      15.31x      |       19.98x
32x16 |      15.14x      |       23.25x
32x24 |      14.65x      |       23.95x
32x32 |      15.41x      |       24.76x
32x64 |      14.56x      |       24.53x

diff -r ed1932a414bf -r 738f07186eb1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sun Aug 13 18:18:28 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 22 12:40:34 2017 +0530
@@ -3964,6 +3964,19 @@
         p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
         p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
         p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
+        p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx512);
+        p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx512);
+        p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512);
+        p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512);
+        p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512);
+        p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx512);
+        p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx512);
 
         p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512);

diff -r ed1932a414bf -r 738f07186eb1 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Sun Aug 13 18:18:28 2017 +0530
+++ b/source/common/x86/mc-a.asm	Tue Aug 22 12:40:34 2017 +0530
@@ -3317,6 +3317,24 @@
     movu            [r2 + r5],          m0
 %endmacro
 
+%macro PROCESS_ADDAVG_32x2_AVX512 0
+    movu            m0,                 [r0]
+    movu            m1,                 [r1]
+    movu            m2,                 [r0 + r3]
+    movu            m3,                 [r1 + r4]
+
+    paddw           m0,                 m1
+    pmulhrsw        m0,                 m4
+    paddw           m0,                 m5
+    paddw           m2,                 m3
+    pmulhrsw        m2,                 m4
+    paddw           m2,                 m5
+
+    packuswb        m0,                 m2
+    vpermq          m0,                 m6,                 m0
+    movu            [r2],               ym0
+    vextracti32x8   [r2 + r5],          m0,                 1
+%endmacro
 ;--------------------------------------------------------------------------------------------------------------------
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 ;--------------------------------------------------------------------------------------------------------------------
@@ -3344,6 +3362,32 @@
 ADDAVG_W64_AVX512 32
 ADDAVG_W64_AVX512 48
 ADDAVG_W64_AVX512 64
+
+%macro ADDAVG_W32_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_32x%1, 6,6,7
+    vbroadcasti32x8 m4,                 [pw_256]
+    vbroadcasti32x8 m5,                 [pw_128]
+    mova            m6,                 [shuf_avx512]
+    add             r3,                 r3
+    add             r4,                 r4
+
+%rep %1/2 - 1
+    PROCESS_ADDAVG_32x2_AVX512
+    lea             r2,                 [r2 + 2 * r5]
+    lea             r0,                 [r0 + 2 * r3]
+    lea             r1,                 [r1 + 2 * r4]
+%endrep
+    PROCESS_ADDAVG_32x2_AVX512
+    RET
+%endmacro
+
+ADDAVG_W32_AVX512 8
+ADDAVG_W32_AVX512 16
+ADDAVG_W32_AVX512 24
+ADDAVG_W32_AVX512 32
+ADDAVG_W32_AVX512 48
+ADDAVG_W32_AVX512 64
 ;-----------------------------------------------------------------------------
 ; addAvg avx512 code end
 ;-----------------------------------------------------------------------------
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel