Min, thanks for this series of patches; all of them have been pushed. On Tue, Jun 9, 2015 at 11:36 PM, Min Chen <[email protected]> wrote:
> # HG changeset patch > # User Min Chen <[email protected]> > # Date 1433872879 25200 > # Node ID e5b6f0a984bdd8ab16b63fb1c11a508a444515ec > # Parent 134670771e0c1dd0800c3e9db0a1f9f69c467e36 > asm: improve costCoeffRemain by bypass uncoded coeff > --- > source/common/dct.cpp | 17 ++++---- > source/common/primitives.h | 2 +- > source/common/x86/pixel-util.h | 2 +- > source/common/x86/pixel-util8.asm | 41 +++++++++---------- > source/encoder/entropy.cpp | 82 > ++++++++++++++++++++++++++++++++----- > 5 files changed, 101 insertions(+), 43 deletions(-) > > diff -r 134670771e0c -r e5b6f0a984bd source/common/dct.cpp > --- a/source/common/dct.cpp Tue Jun 09 11:01:15 2015 -0700 > +++ b/source/common/dct.cpp Tue Jun 09 11:01:19 2015 -0700 > @@ -874,19 +874,19 @@ > return (sum & 0xFFFFFF); > } > > -uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero) > +uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx) > { > uint32_t goRiceParam = 0; > - int firstCoeff2 = 1; > - uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel > > uint32_t sum = 0; > - int idx = 0; > + int baseLevel = 3; > do > { > - int baseLevel = (baseLevelN & 3) | firstCoeff2; > - X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + > firstCoeff2) : 1), "baseLevel check failurr\n"); > - baseLevelN >>= 2; > + if (idx >= C1FLAG_NUMBER) > + baseLevel = 1; > + > + // TODO: the IDX is not really idx, so this check inactive > + //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? 
(2 + > firstCoeff2) : 1), "baseLevel check failurr\n"); > int codeNumber = absCoeff[idx] - baseLevel; > > if (codeNumber >= 0) > @@ -912,8 +912,7 @@ > goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2); > X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n"); > } > - if (absCoeff[idx] >= 2) > - firstCoeff2 = 0; > + baseLevel = 2; > idx++; > } > while(idx < numNonZero); > diff -r 134670771e0c -r e5b6f0a984bd source/common/primitives.h > --- a/source/common/primitives.h Tue Jun 09 11:01:15 2015 -0700 > +++ b/source/common/primitives.h Tue Jun 09 11:01:19 2015 -0700 > @@ -187,7 +187,7 @@ > typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const > intptr_t trSize, const uint16_t scanTbl[16]); > > typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t > *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, > uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int > subPosBase); > -typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero); > +typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, > int idx); > > /* Function pointers to optimized encoder primitives. 
Each pointer can > reference > * either an assembly routine, a SIMD intrinsic primitive, or a C > function */ > diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util.h > --- a/source/common/x86/pixel-util.h Tue Jun 09 11:01:15 2015 -0700 > +++ b/source/common/x86/pixel-util.h Tue Jun 09 11:01:19 2015 -0700 > @@ -83,7 +83,7 @@ > uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const > intptr_t trSize, const uint16_t scanTbl[16]); > > uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t > *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, > uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int > subPosBase); > -uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero); > +uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero, > int idx); > > > #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \ > diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util8.asm > --- a/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:15 2015 -0700 > +++ b/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:19 2015 -0700 > @@ -6572,7 +6572,7 @@ > ;} > ;while(idx < numNonZero); > > -; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero) > +; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero, int idx) > INIT_XMM sse4 > cglobal costCoeffRemain, 0,7,1 > ; assign RCX to R3 > @@ -6580,48 +6580,43 @@ > %if WIN64 > DECLARE_REG_TMP 3,1,2,0 > mov t0, r0 > + mov r4d, r2d > %elif ARCH_X86_64 > ; *nix x64 didn't do anything > DECLARE_REG_TMP 0,1,2,3 > + mov r4d, r2d > %else ; X86_32 > DECLARE_REG_TMP 6,3,2,1 > mov t0, r0m > + mov r4d, r2m > %endif > > - mova m0, [t0] > - packsswb m0, [t0 + mmsize] > - pcmpgtb m0, [pb_1] > - pmovmskb r2d, m0 > - bsf r2d, r2d > - lea r2d, [r2 * 2 + 1] > - xor r4d, r4d > - bts r4d, r2d > - dec r4d > - and r4d, 0x55555555 > - or r4d, 0x5555AAAA > - > xor t3d, t3d > xor r5d, r5d > > + lea t0, [t0 + r4 * 2] > + mov r2d, 3 > + > ; register 
mapping > - ; r4d - baseLevelN > - ; r2 - tmp > + ; r2d - baseLevel & tmp > + ; r4d - idx > ; t3 - goRiceParam > - ; eax - tmp - absCoeff[idx] > + ; eax - absCoeff[idx] & tmp > ; r5 - sum > > .loop: > + mov eax, 1 > + cmp r4d, 8 > + cmovge r2d, eax > + > movzx eax, word [t0] > add t0, 2 > - mov r2d, r4d > - and r2d, 3 > - shr r4d, 2 > sub eax, r2d ; codeNumber = absCoeff[idx] - > baseLevel > jl .next > > shr eax, t3b ; codeNumber = > ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION > > - lea r2d, [eax - 3 + 1] ; CLZ(cidx, codeNumber + 1); > + lea r2d, [rax - 3 + 1] ; CLZ(cidx, codeNumber + 1); > bsr r2d, r2d > add r2d, r2d ; codeNumber = (length + length) > > @@ -6644,8 +6639,10 @@ > add t3b, al > > .next: > - dec dword r1m > - jnz .loop > + inc r4d > + mov r2d, 2 > + cmp r4d, r1m > + jl .loop > > mov eax, r5d > RET > diff -r 134670771e0c -r e5b6f0a984bd source/encoder/entropy.cpp > --- a/source/encoder/entropy.cpp Tue Jun 09 11:01:15 2015 -0700 > +++ b/source/encoder/entropy.cpp Tue Jun 09 11:01:19 2015 -0700 > @@ -1431,6 +1431,55 @@ > encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), > m_contextState[OFF_QT_CBF_CTX + ctx]); > } > > +#if CHECKED_BUILD || _DEBUG > +uint32_t costCoeffRemain_c0(uint16_t *absCoeff, int numNonZero) > +{ > + uint32_t goRiceParam = 0; > + int firstCoeff2 = 1; > + uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel > + > + uint32_t sum = 0; > + int idx = 0; > + do > + { > + int baseLevel = (baseLevelN & 3) | firstCoeff2; > + X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? 
(2 + > firstCoeff2) : 1), "baseLevel check failurr\n"); > + baseLevelN >>= 2; > + int codeNumber = absCoeff[idx] - baseLevel; > + > + if (codeNumber >= 0) > + { > + //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, > goRiceParam); > + uint32_t length = 0; > + > + codeNumber = ((uint32_t)codeNumber >> goRiceParam) - > COEF_REMAIN_BIN_REDUCTION; > + if (codeNumber >= 0) > + { > + { > + unsigned long cidx; > + CLZ(cidx, codeNumber + 1); > + length = cidx; > + } > + X265_CHECK((codeNumber != 0) || (length == 0), "length > check failure\n"); > + > + codeNumber = (length + length); > + } > + sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + > codeNumber); > + > + if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << > goRiceParam)) > + goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2); > + X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n"); > + } > + if (absCoeff[idx] >= 2) > + firstCoeff2 = 0; > + idx++; > + } > + while(idx < numNonZero); > + > + return sum; > +} > +#endif // debug only code > + > void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, > uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype) > { > uint32_t trSize = 1 << log2TrSize; > @@ -1519,7 +1568,7 @@ > uint8_t * const baseCtx = bIsLuma ? 
&m_contextState[OFF_SIG_FLAG_CTX] > : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA]; > uint32_t c1 = 1; > int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1; > - ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]); > + ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]); > uint32_t numNonZero = 1; > unsigned long lastNZPosInCG; > unsigned long firstNZPosInCG; > @@ -1700,6 +1749,7 @@ > uint32_t numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER); > X265_CHECK(numC1Flag > 0, "numC1Flag check failure\n"); > > + uint32_t firstC2Idx = 8; > uint32_t firstC2Flag = 2; > uint32_t c1Next = 0xFFFFFFFE; > if (!m_bitIf) > @@ -1720,9 +1770,13 @@ > > if (symbol1) > c1Next = 0; > + > if (symbol1 + firstC2Flag == 3) > firstC2Flag = symbol2; > > + if (symbol1 + firstC2Idx == 9) > + firstC2Idx = idx; > + > c1 = (c1Next & 3); > c1Next >>= 2; > X265_CHECK(c1 <= 3, "c1 check failure\n"); > @@ -1749,9 +1803,10 @@ > //encodeBinsEP((coeffSigns >> hiddenShift), numNonZero - > hiddenShift); > m_fracBits += (numNonZero - hiddenShift) << 15; > > - if (!c1 || numNonZero > C1FLAG_NUMBER) > + if (numNonZero > firstC2Idx) > { > - uint32_t sum = primitives.costCoeffRemain(absCoeff, > numNonZero); > + sum = primitives.costCoeffRemain(absCoeff, > numNonZero, firstC2Idx); > + X265_CHECK(sum == costCoeffRemain_c0(absCoeff, > numNonZero), "costCoeffRemain check failure\n"); > m_fracBits += ((uint64_t)sum << 15); > } > } > @@ -1771,6 +1826,9 @@ > if (symbol1 + firstC2Flag == 3) > firstC2Flag = symbol2; > > + if (symbol1 + firstC2Idx == 9) > + firstC2Idx = idx; > + > c1 = (c1Next & 3); > c1Next >>= 2; > X265_CHECK(c1 <= 3, "c1 check failure\n"); > @@ -1793,15 +1851,17 @@ > { > // Standard path > uint32_t goRiceParam = 0; > + int baseLevel = 3; > +#if CHECKED_BUILD || _DEBUG > int firstCoeff2 = 1; > - uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode > format baseLevel > - > - idx = 0; > +#endif > + idx = firstC2Idx; > do > { > - int baseLevel = (baseLevelN & 3) | firstCoeff2; > + 
if (idx >= C1FLAG_NUMBER) > + baseLevel = 1; > + // TODO: fast algorithm maybe broken this check > logic > X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? > (2 + firstCoeff2) : 1), "baseLevel check failurr\n"); > - baseLevelN >>= 2; > > if (absCoeff[idx] >= baseLevel) > { > @@ -1810,8 +1870,10 @@ > goRiceParam = (goRiceParam + 1) - > (goRiceParam >> 2); > X265_CHECK(goRiceParam <= 4, "goRiceParam > check failure\n"); > } > - if (absCoeff[idx] >= 2) > - firstCoeff2 = 0; > +#if CHECKED_BUILD || _DEBUG > + firstCoeff2 = 0; > +#endif > + baseLevel = 2; > idx++; > } > while(idx < numNonZero); > > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
