# HG changeset patch
# User Vignesh Vijayakumar
# Date 1501588310 -19800
#      Tue Aug 01 17:21:50 2017 +0530
# Node ID 465b4925d622ba66e2536c9f79eaaffcdd26d5fc
# Parent  73ee464e136910a95d7b3070a1c736dedeaa6278
x86: AVX512 addAvg_W64 for high bit depth
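For reviewers unfamiliar with the primitive: addAvg forms the final prediction by averaging two 16-bit intermediate predictions, with rounding and a clamp to the valid pixel range. Below is a minimal, self-contained scalar sketch of that computation, assuming a 10-bit HIGH_BIT_DEPTH build; the constants are redefined locally and the signature is simplified, so treat it as an illustration rather than the in-tree C reference. In the new kernels the shift-and-round is carried out with pmulhrsw against ADDAVG_FACTOR plus an ADDAVG_ROUND bias, and the clamp with pmaxsw/pminsw against pw_pixel_max.

/* Minimal scalar sketch of addAvg for a HIGH_BIT_DEPTH build.
 * Constant values are assumed (10-bit depth, IF_INTERNAL_PREC = 14) and the
 * signature is simplified; illustration only, not the in-tree C primitive. */
#include <stdint.h>

#define BIT_DEPTH        10                          /* assumed bit depth   */
#define IF_INTERNAL_PREC 14
#define IF_INTERNAL_OFFS (1 << (IF_INTERNAL_PREC - 1))
#define PIXEL_MAX        ((1 << BIT_DEPTH) - 1)      /* cf. pw_pixel_max    */

static inline uint16_t clip_pixel(int v)
{
    return (uint16_t)(v < 0 ? 0 : (v > PIXEL_MAX ? PIXEL_MAX : v));
}

/* dst = clip((src0 + src1 + offset) >> shift) over a bx x by block */
static void addAvg_c(const int16_t *src0, const int16_t *src1, uint16_t *dst,
                     intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride,
                     int bx, int by)
{
    const int shiftNum = IF_INTERNAL_PREC + 1 - BIT_DEPTH;
    const int offset   = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

    for (int y = 0; y < by; y++)
    {
        for (int x = 0; x < bx; x++)
            dst[x] = clip_pixel((src0[x] + src1[x] + offset) >> shiftNum);

        src0 += src0Stride;
        src1 += src1Stride;
        dst  += dstStride;
    }
}

A 64-sample row is two 512-bit registers of 32 int16 values each, which is why PROCESS_ADDAVG_64x4_HBD_AVX512 below issues two load/compute/store groups per row (the "+ mmsize" addresses) and processes four rows per invocation.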
Size  | AVX2 performance | AVX512 performance
----------------------------------------------
64x16 |      11.13x      |      18.48x
64x32 |      11.04x      |      17.75x
64x48 |      10.97x      |      17.85x
64x64 |      10.93x      |      17.37x

diff -r 73ee464e1369 -r 465b4925d622 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Aug 01 16:45:51 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Aug 01 17:21:50 2017 +0530
@@ -2278,6 +2278,10 @@
         p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512);
         p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512);

+        p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512);
+        p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512);
+        p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512);
+        p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512);
         p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx512);
         p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx512);
         p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512);
diff -r 73ee464e1369 -r 465b4925d622 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm	Tue Aug 01 16:45:51 2017 +0530
+++ b/source/common/x86/mc-a.asm	Tue Aug 01 17:21:50 2017 +0530
@@ -1738,6 +1738,80 @@
     movu        [r2 + r8],              m0
 %endmacro

+%macro PROCESS_ADDAVG_64x4_HBD_AVX512 0
+    movu        m0,              [r0]
+    movu        m1,              [r1]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2],            m0
+
+    movu        m0,              [r0 + mmsize]
+    movu        m1,              [r1 + mmsize]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + mmsize],   m0
+
+    movu        m0,              [r0 + r3]
+    movu        m1,              [r1 + r4]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5],       m0
+
+    movu        m0,              [r0 + r3 + mmsize]
+    movu        m1,              [r1 + r4 + mmsize]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r5 + mmsize],      m0
+
+    movu        m0,              [r0 + 2 * r3]
+    movu        m1,              [r1 + 2 * r4]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 2 * r5],   m0
+
+    movu        m0,              [r0 + 2 * r3 + mmsize]
+    movu        m1,              [r1 + 2 * r4 + mmsize]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + 2 * r5 + mmsize],  m0
+
+    movu        m0,              [r0 + r6]
+    movu        m1,              [r1 + r7]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r8],       m0
+
+    movu        m0,              [r0 + r6 + mmsize]
+    movu        m1,              [r1 + r7 + mmsize]
+    paddw       m0,              m1
+    pmulhrsw    m0,              m3
+    paddw       m0,              m4
+    pmaxsw      m0,              m2
+    pminsw      m0,              m5
+    movu        [r2 + r8 + mmsize],      m0
+%endmacro
+
 ;-----------------------------------------------------------------------------
 ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride)
 ;-----------------------------------------------------------------------------
@@ -1771,6 +1845,35 @@
 ADDAVG_W32_HBD_AVX512 32
 ADDAVG_W32_HBD_AVX512 48
 ADDAVG_W32_HBD_AVX512 64
+
+%macro ADDAVG_W64_HBD_AVX512 1
+INIT_ZMM avx512
+cglobal addAvg_64x%1, 6,9,6
+    vbroadcasti32x8        m4,              [pw_ %+ ADDAVG_ROUND]
+    vbroadcasti32x8        m5,              [pw_pixel_max]
+    vbroadcasti32x8        m3,              [pw_ %+ ADDAVG_FACTOR]
+    pxor        m2,              m2
+    add         r3,              r3
+    add         r4,              r4
+    add         r5,              r5
+    lea         r6,              [3 * r3]
+    lea         r7,              [3 * r4]
+    lea         r8,              [3 * r5]
+
+%rep %1/4 - 1
+    PROCESS_ADDAVG_64x4_HBD_AVX512
+    lea         r2,              [r2 + 4 * r5]
+    lea         r0,              [r0 + 4 * r3]
+    lea         r1,              [r1 + 4 * r4]
+%endrep
+    PROCESS_ADDAVG_64x4_HBD_AVX512
+    RET
+%endmacro
+
+ADDAVG_W64_HBD_AVX512 16
+ADDAVG_W64_HBD_AVX512 32
+ADDAVG_W64_HBD_AVX512 48
+ADDAVG_W64_HBD_AVX512 64
 ;-----------------------------------------------------------------------------
 ;addAvg avx512 high bit depth code end
 ;-----------------------------------------------------------------------------

_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel