# HG changeset patch # User Ximing Cheng <ximingch...@tencent.com> # Date 1511862059 -28800 # Tue Nov 28 17:40:59 2017 +0800 # Node ID 9cd0cf6e2fd88604d939138e539dd481ec429ab3 # Parent b24454f3ff6de650aab6835e291837fc4e2a4466 intra: sse4 version of strong intrasmoothing
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/intrapred.cpp --- a/source/common/intrapred.cpp Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/intrapred.cpp Tue Nov 28 17:40:59 2017 +0800 @@ -29,12 +29,43 @@ namespace { template<int tuSize> -void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */ +void intraFilter(const pixel* samples, pixel* filtered, int bUseStrongFilter) { const int tuSize2 = tuSize << 1; pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2]; + // strong intra filter + if (bUseStrongFilter && tuSize >= 32) + { + const pixel leftMiddle = samples[tuSize2 + tuSize]; + const pixel topMiddle = samples[tuSize]; + const static int threshold = 1 << (X265_DEPTH - 5); + const bool bilinearLeft = abs((leftLast + topLeft) - (2 * leftMiddle)) < threshold; //difference between the + const bool bilinearAbove = abs((topLeft + topLast) - (2 * topMiddle)) < threshold; //ends and the middle + + if (bilinearLeft && bilinearAbove) + { + const int shift = 5 + 1; + int init = (topLeft << shift) + tuSize; + int deltaL, deltaR; + + deltaL = leftLast - topLeft; + deltaR = topLast - topLeft; + + filtered[0] = topLeft; + for (int i = 1; i < tuSize2; i++) + { + filtered[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering + filtered[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering + } + filtered[tuSize2] = topLast; + filtered[tuSize2 + tuSize2] = leftLast; + return; + } + } + /* 1:2:1 filtering of left and top reference samples */ + // filtering top for (int i = 1; i < tuSize2; i++) filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2; diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/predict.cpp --- a/source/common/predict.cpp Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/predict.cpp Tue Nov 28 17:40:59 2017 +0800 @@ -594,7 +594,6 @@ void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t 
puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode) { int tuSize = 1 << intraNeighbors.log2TrSize; - int tuSize2 = tuSize << 1; PicYuv* reconPic = cu.m_encData->m_reconPic; pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx); @@ -605,41 +604,11 @@ pixel* refBuf = intraNeighbourBuf[0]; pixel* fltBuf = intraNeighbourBuf[1]; - pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2]; - if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize) { // generate filtered intra prediction samples - - if (cu.m_slice->m_sps->bUseStrongIntraSmoothing && tuSize == 32) - { - const int threshold = 1 << (X265_DEPTH - 5); - - pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32]; - - if (abs(topLeft + topLast - (topMiddle << 1)) < threshold && - abs(topLeft + leftLast - (leftMiddle << 1)) < threshold) - { - // "strong" bilinear interpolation - const int shift = 5 + 1; - int init = (topLeft << shift) + tuSize; - int deltaL, deltaR; - - deltaL = leftLast - topLeft; deltaR = topLast - topLeft; - - fltBuf[0] = topLeft; - for (int i = 1; i < tuSize2; i++) - { - fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering - fltBuf[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering - } - fltBuf[tuSize2] = topLast; - fltBuf[tuSize2 + tuSize2] = leftLast; - return; - } - } - - primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf); + int bUseStrongIntraSmoothing = cu.m_slice->m_sps->bUseStrongIntraSmoothing; + primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf, bUseStrongIntraSmoothing); } } @@ -652,7 +621,7 @@ fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]); if (m_csp == X265_CSP_I444) - primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1]); + primitives.cu[intraNeighbors.log2TrSize - 
2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1], 0); } void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors) diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/primitives.h --- a/source/common/primitives.h Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/primitives.h Tue Nov 28 17:40:59 2017 +0800 @@ -133,7 +133,7 @@ typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter); typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma); -typedef void (*intra_filter_t)(const pixel* references, pixel* filtered); +typedef void (*intra_filter_t)(const pixel* references, pixel* filtered, int bUseStrongFilter); typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/const-a.asm --- a/source/common/x86/const-a.asm Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/x86/const-a.asm Tue Nov 28 17:40:59 2017 +0800 @@ -114,6 +114,10 @@ const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32 const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32 +const multiH3_1, times 1 dw 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48 +const multiH3_2, times 1 dw 41, 42, 43, 44, 45, 46, 47, 48 +const multiH4, times 1 dw 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64 +const multiH4_1, times 1 dw 57, 58, 59, 60, 61, 62, 63, 64 const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16 const pw_FFFFFFFFFFFFFFF0, dw 0x00 diff -r b24454f3ff6d -r 9cd0cf6e2fd8 
source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/x86/intrapred.h Tue Nov 28 17:40:59 2017 +0800 @@ -67,7 +67,7 @@ #define DECL_ALL(cpu) \ FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \ - FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \ + FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered, int bUseStrongFilter); \ DECL_ANGS(4, cpu); \ DECL_ANGS(8, cpu); \ DECL_ANGS(16, cpu); \ diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/intrapred8.asm --- a/source/common/x86/intrapred8.asm Wed Nov 22 22:00:48 2017 +0530 +++ b/source/common/x86/intrapred8.asm Tue Nov 28 17:40:59 2017 +0800 @@ -543,6 +543,10 @@ cextern multiH cextern multiH2 cextern multiH3 +cextern multiH3_1 +cextern multiH3_2 +cextern multiH4 +cextern multiH4_1 cextern multi_2Row cextern trans8_shuf cextern pw_planar16_mul @@ -22313,11 +22317,142 @@ mov [r1 + 64], r3b ; LeftLast RET -INIT_XMM sse4 -cglobal intra_filter_32x32, 2,4,6 - mov r2b, byte [r0 + 64] ; topLast - mov r3b, byte [r0 + 128] ; LeftLast - +; this function adds the strong intra filter +INIT_XMM sse4 +cglobal intra_filter_32x32, 3,8,7 + movzx r3d, byte [r0 + 64] ; topLast + movzx r4d, byte [r0 + 128] ; LeftLast + + ; strong intra filter is disabled + cmp r2m, byte 0 + jz .normal_filter32 + ; decide whether to apply the strong intra filter + movzx r5d, byte [r0] ; topLeft + movzx r6d, byte [r0 + 32] ; topMiddle + + ; threshold = 8 + mov r2d, r3d + add r2d, r5d ; (topLast + topLeft) + shl r6d, 1 ; 2 * topMiddle + mov r7d, r2d + sub r2d, r6d ; (topLast + topLeft) - 2 * topMiddle + sub r6d, r7d ; 2 * topMiddle - (topLast + topLeft) + cmovg r2d, r6d + cmp r2d, 8 + ; bilinearAbove is false + jns .normal_filter32 + + movzx r6d, byte [r0 + 96] ; leftMiddle + mov r2d, r5d + add r2d, r4d + shl r6d, 1 + mov r7d, r2d + sub r2d, r6d + sub r6d, r7d + cmovg r2d, r6d + cmp r2d, 8 + ; bilinearLeft is false + 
jns .normal_filter32 + + ; do strong intra filter shift = 6 + mov r2d, r5d + shl r2d, 6 + add r2d, 32 ; init + mov r6d, r4d + sub r6d, r5d ; deltaL + mov r7d, r3d + sub r7d, r5d ; deltaR + + movd m0, r2d + pshuflw m0, m0, 0 + movlhps m0, m0 + mova m4, m0 + + + movd m1, r7d + pshuflw m1, m1, 0 + movlhps m1, m1 + pmullw m2, m1, [multiL] ; [ 1 2 3 4 5 6 7 8] + pmullw m3, m1, [multiH] ; [ 9 10 11 12 13 14 15 16] + paddw m5, m0, m2 + paddw m6, m4, m3 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r1 + 1], m5 + + pmullw m2, m1, [multiH2] ; [17 18 19 20 21 22 23 24] + pmullw m3, m1, [multiH3] ; [25 26 27 28 29 30 31 32] + paddw m5, m0, m2 + paddw m6, m4, m3 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r1 + 17], m5 + + pmullw m2, m1, [multiH3_1] ; [33 - 40] + pmullw m3, m1, [multiH3_2] ; [41 - 48] + paddw m5, m0, m2 + paddw m6, m4, m3 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r1 + 33], m5 + + pmullw m2, m1, [multiH4] ; [49 - 56] + pmullw m1, [multiH4_1] ; [57 - 64] + paddw m5, m0, m2 + paddw m6, m4, m1 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r1 + 49], m5 + + movd m1, r6d + pshuflw m1, m1, 0 + movlhps m1, m1 + pmullw m2, m1, [multiL] ; [ 1 2 3 4 5 6 7 8] + pmullw m3, m1, [multiH] ; [ 9 10 11 12 13 14 15 16] + paddw m5, m0, m2 + paddw m6, m4, m3 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r1 + 65], m5 + + pmullw m2, m1, [multiH2] ; [17 18 19 20 21 22 23 24] + pmullw m3, m1, [multiH3] ; [25 26 27 28 29 30 31 32] + paddw m5, m0, m2 + paddw m6, m4, m3 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r1 + 81], m5 + + pmullw m2, m1, [multiH3_1] ; [33 - 40] + pmullw m3, m1, [multiH3_2] ; [41 - 48] + paddw m5, m0, m2 + paddw m6, m4, m3 + psraw m5, 6 + psraw m6, 6 + packuswb m5, m6 + movu [r1 + 97], m5 + + pmullw m2, m1, [multiH4] ; [49 - 56] + pmullw m1, [multiH4_1] ; [57 - 64] + paddw m0, m2 + paddw m4, m1 + psraw m0, 6 + psraw m4, 6 + packuswb m0, m4 + movu [r1 + 113], m0 + + mov [r1], r5b ; topLeft + mov [r1 + 64], r3b ; 
topLast + mov [r1 + 128], r4b ; LeftLast + RET + +.normal_filter32 ; filtering top ; 0 to 15 pmovzxbw m0, [r0 + 0] @@ -22514,8 +22649,8 @@ packuswb m1, m5 movu [r1 + 112], m1 - mov [r1 + 64], r2b ; topLast - mov [r1 + 128], r3b ; LeftLast + mov [r1 + 64], r3b ; topLast + mov [r1 + 128], r4b ; LeftLast RET INIT_YMM avx2 diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/encoder/slicetype.cpp --- a/source/encoder/slicetype.cpp Wed Nov 22 22:00:48 2017 +0530 +++ b/source/encoder/slicetype.cpp Tue Nov 28 17:40:59 2017 +0800 @@ -349,7 +349,7 @@ for (int i = 1; i <= 2 * cuSize; i++) samples[cuSize2 + i] = pixCur[i * fenc.lumaStride]; /* left */ - primitives.cu[sizeIdx].intra_filter(samples, filtered); + primitives.cu[sizeIdx].intra_filter(samples, filtered, 0); int cost, icost = me.COST_MAX; uint32_t ilowmode = 0; _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel