# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykris...@multicorewareinc.com>
# Date 1502442378 -19800
#      Fri Aug 11 14:36:18 2017 +0530
# Node ID 3d8c45642752803c560891fdfbe0a8b5c03ca76a
# Parent  b30539ebe5c9b2d9412d3a39458a90a7574ac744
[x265-avx512]x86: AVX512 weight_pp

BitDepth | AVX2 performance | AVX512 performance
------------------------------------------------
8        | 6.23x            | 10.60x
10       | 9.43x            | 14.59x

diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Aug 14 17:19:48 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Aug 11 14:36:18 2017 +0530
@@ -2322,6 +2322,7 @@
         p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512);
         p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512);
         p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512);
+        p.weight_pp = PFX(weight_pp_avx512);
 
     }
 }
@@ -4026,6 +4027,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512);
+        p.weight_pp = PFX(weight_pp_avx512);
 
     }
 #endif
diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Aug 14 17:19:48 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Aug 11 14:36:18 2017 +0530
@@ -1662,6 +1662,116 @@
     jnz .loopH
     RET
 %endif
+
+%if HIGH_BIT_DEPTH
+INIT_ZMM avx512
+cglobal weight_pp, 6, 7, 7
+%define correction (14 - BIT_DEPTH)
+    mov r6d, r6m                        ; 7th argument ("round" at the C++ call sites in this patch)
+    shl r6d, 16 - correction
+    or r6d, r5d                         ; r6d = (round << (16 - correction)) | w0; relies on w0 fitting in the low 16 bits
+
+    movd xm0, r6d
+    vpbroadcastd m0, xm0                ; every dword of m0 = [high word: shifted round | low word: w0]
+    mov r5d, r7m                        ; 8th argument ("shift" at the call sites)
+    sub r5d, correction
+    movd xm1, r5d                       ; xm1 = shift - correction, used as the psrad count
+
+    vpbroadcastd m2, r8m                ; 9th argument ("offset" at the call sites) in every dword
+    vbroadcasti32x8 m5, [pw_1]          ; 32 bytes at pw_1 replicated into both 256-bit halves (presumably words of 1)
+    vbroadcasti32x8 m6, [pw_pixel_max]  ; upper clamp applied by pminuw below
+
+    add r2d, r2d                        ; stride in bytes (this branch reads 2-byte pixels)
+    add r3d, r3d                        ; width in bytes
+    sub r2d, r3d                        ; r2 = bytes from the end of one row to the start of the next
+    shr r3d, 6                          ; 64-byte (32-pixel) iterations per row; any remainder is not processed
+
+.loopH:                                 ; one pass per row, r4d counts rows down
+    mov r5d, r3d                        ; reload the per-row iteration counter
+
+.loopW:                                 ; one pass per 32 pixels
+    movu m4, [r0]                       ; load 32 source pixels (words)
+    punpcklwd m3, m4, m5                ; pair the low 4 pixels of each 128-bit lane with words from m5
+    pmaddwd m3, m0                      ; per dword: pixel * w0 + (m5 word) * shifted round
+    psrad m3, xm1                       ; arithmetic shift right by xm1
+    paddd m3, m2                        ; add offset
+
+    punpckhwd m4, m5                    ; same sequence for the high 4 pixels of each lane
+    pmaddwd m4, m0
+    psrad m4, xm1
+    paddd m4, m2
+
+    packusdw m3, m4                     ; dwords -> words with unsigned saturation; per-lane order matches the load
+    pminuw m3, m6                       ; clamp to pw_pixel_max
+    movu [r1], m3                       ; store 32 result pixels
+
+    add r0, 64
+    add r1, 64
+
+    dec r5d
+    jnz .loopW
+
+    lea r0, [r0 + r2]                   ; advance src to the next row
+    lea r1, [r1 + r2]                   ; dst advances by the same stride value as src
+
+    dec r4d
+    jnz .loopH
+%undef correction
+    RET
+%else
+INIT_ZMM avx512
+cglobal weight_pp, 6, 7, 6
+
+    shl r5d, 6                          ; w0 << 6
+    mov r6d, r6m                        ; 7th argument ("round" at the C++ call sites in this patch)
+    shl r6d, 16
+    or r6d, r5d                         ; r6d = (round << 16) | (w0 << 6); relies on (w0 << 6) fitting in 16 bits
+
+    movd xm0, r6d
+    vpbroadcastd m0, xm0                ; every dword of m0 = [high word: round | low word: w0 << 6]
+    movd xm1, r7m                       ; 8th argument ("shift"), used as the psrad count
+    vpbroadcastd m2, r8m                ; 9th argument ("offset") in every dword
+
+    vbroadcasti32x8 m5, [pw_1]          ; 32 bytes at pw_1 replicated into both 256-bit halves (presumably words of 1)
+
+    sub r2d, r3d                        ; r2 = bytes from the end of one row to the start of the next
+    shr r3d, 5                          ; 32-pixel iterations per row; any remainder is not processed
+
+.loopH:                                 ; one pass per row, r4d counts rows down
+    mov r5d, r3d                        ; reload the per-row iteration counter
+
+.loopW:                                 ; one pass per 32 pixels
+    pmovzxbw m4, [r0]                   ; load 32 source bytes, zero-extended to 32 words
+    punpcklwd m3, m4, m5                ; pair the low 4 pixels of each 128-bit lane with words from m5
+    pmaddwd m3, m0                      ; per dword: pixel * (w0 << 6) + (m5 word) * round
+    psrad m3, xm1                       ; arithmetic shift right by xm1
+    paddd m3, m2                        ; add offset
+
+    punpckhwd m4, m5                    ; same sequence for the high 4 pixels of each lane
+    pmaddwd m4, m0
+    psrad m4, xm1
+    paddd m4, m2
+
+    packssdw m3, m4                     ; dwords -> words with signed saturation; per-lane order matches the load
+    vextracti64x4 ym4, m3, 1            ; ym4 = upper 256 bits (pixels 16..31)
+    packuswb ym3, ym4                   ; words -> bytes with unsigned saturation; lanes come out interleaved
+    vpermq ym3, ym3, q3120              ; swap the middle qwords to restore pixel order 0..31
+    movu [r1], ym3                      ; store 32 result pixels
+
+    add r0, 32
+    add r1, 32
+
+    dec r5d
+    jnz .loopW
+
+    lea r0, [r0 + r2]                   ; advance src to the next row
+    lea r1, [r1 + r2]                   ; dst advances by the same stride value as src
+
+    dec r4d
+    jnz .loopH
+    RET
+%endif
+
 ;-------------------------------------------------------------------------------------------------------------------------------------------------
 ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset)
 ;-------------------------------------------------------------------------------------------------------------------------------------------------
diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/reference.cpp
--- a/source/encoder/reference.cpp Mon Aug 14 17:19:48 2017 +0530
+++ b/source/encoder/reference.cpp Fri Aug 11 14:36:18 2017 +0530
@@ -155,12 +155,10 @@
 
             const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride;
             pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride;
-
             // Computing weighted CU rows
             int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth
-            int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths
+            int padwidth = (width + 31) & ~31; // weightp assembly needs widths that are a multiple of 32 pixels
             primitives.weight_pp(src, dst, stride, padwidth, height, w[c].weight, w[c].round << correction, w[c].shift + correction, w[c].offset);
-
             // Extending Left & Right
             primitives.extendRowBorder(dst, stride, width, height, marginX);
 
diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/weightPrediction.cpp
--- a/source/encoder/weightPrediction.cpp Mon Aug 14 17:19:48 2017 +0530
+++ b/source/encoder/weightPrediction.cpp Fri Aug 11 14:36:18 2017 +0530
@@ -184,8 +184,7 @@
                     int denom = w->log2WeightDenom;
                     int round = denom ? 1 << (denom - 1) : 0;
                     int correction = IF_INTERNAL_PREC - X265_DEPTH; /* intermediate interpolation depth */
-                    int pwidth = ((width + 15) >> 4) << 4;
-
+                    int pwidth = ((width + 31) >> 5) << 5; /* round width up to a multiple of 32 */
                     primitives.weight_pp(ref, weightTemp, stride, pwidth, height, weight, round << correction, denom + correction, offset);
                     ref = weightTemp;
 
diff -r b30539ebe5c9 -r 3d8c45642752 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Aug 14 17:19:48 2017 +0530
+++ b/source/test/pixelharness.cpp Fri Aug 11 14:36:18 2017 +0530
@@ -291,6 +291,9 @@
         memset(opt_dest, 0, 64 * 64 * sizeof(pixel));
         int j = 0;
         int width = 16 * (rand() % 4 + 1);
+        int cpuid = X265_NS::cpu_detect();
+        if (cpuid & X265_CPU_AVX512)
+            width = 32 * (rand() % 2 + 1); // 32 or 64: the AVX512 weight_pp kernel consumes 32 pixels per iteration
         int height = 8;
         int w0 = rand() % 128;
         int shift = rand() % 8; // maximum is 7, see setFromWeightAndOffset()
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel