On Tue, Mar 4, 2014 at 4:40 AM, Satoshi Nakagawa <nakagawa...@oki.com> wrote: > # HG changeset patch > # User Satoshi Nakagawa <nakagawa...@oki.com> > # Date 1393929339 -32400 > # Tue Mar 04 19:35:39 2014 +0900 > # Node ID 7a61566806f691ddff84cbbc42801f6c2d46df88 > # Parent 3cbde0b893e34e5770cc311d3f4b6fe064c27774 > cleanup m_sharedPredTransformSkip[] > > The NEW_CALCRECON macro is a TODO marker for asm experts, to optimize register > assignment.
Sorry I haven't responded to this yet; I would like Min to review it before I push it. > diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.cpp > --- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Mar 03 13:37:35 2014 -0600 > +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Mar 04 19:35:39 2014 +0900 > @@ -63,7 +63,6 @@ > m_qtTempTUCoeffCr = NULL; > for (int i = 0; i < 3; i++) > { > - m_sharedPredTransformSkip[i] = NULL; > m_qtTempTransformSkipFlag[i] = NULL; > m_qtTempCbf[i] = NULL; > } > @@ -96,7 +95,6 @@ > for (uint32_t i = 0; i < 3; ++i) > { > X265_FREE(m_qtTempCbf[i]); > - X265_FREE(m_sharedPredTransformSkip[i]); > X265_FREE(m_qtTempTransformSkipFlag[i]); > } > > @@ -153,9 +151,6 @@ > CHECKED_MALLOC(m_qtTempTransformSkipFlag[1], uint8_t, numPartitions); > CHECKED_MALLOC(m_qtTempTransformSkipFlag[2], uint8_t, numPartitions); > > - CHECKED_MALLOC(m_sharedPredTransformSkip[0], pixel, MAX_TS_WIDTH * > MAX_TS_HEIGHT); > - CHECKED_MALLOC(m_sharedPredTransformSkip[1], pixel, MAX_TS_WIDTH * > MAX_TS_HEIGHT); > - CHECKED_MALLOC(m_sharedPredTransformSkip[2], pixel, MAX_TS_WIDTH * > MAX_TS_HEIGHT); > CHECKED_MALLOC(m_qtTempTUCoeffY, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT); > CHECKED_MALLOC(m_qtTempTUCoeffCb, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT); > CHECKED_MALLOC(m_qtTempTUCoeffCr, TCoeff, MAX_TS_WIDTH * MAX_TS_HEIGHT); > @@ -414,7 +409,6 @@ > Pel* fenc = fencYuv->getLumaAddr(absPartIdx); > Pel* pred = predYuv->getLumaAddr(absPartIdx); > int16_t* residual = resiYuv->getLumaAddr(absPartIdx); > - Pel* recon = predYuv->getLumaAddr(absPartIdx); > int chFmt = cu->getChromaFormat(); > int part = partitionFromSizes(width, height); > > @@ -439,15 +433,6 @@ > cu->getPattern()->initAdiPattern(cu, absPartIdx, trDepth, m_predBuf, > m_predBufStride, m_predBufHeight, m_refAbove, m_refLeft, m_refAboveFlt, > m_refLeftFlt); > //===== get prediction signal ===== > predIntraLumaAng(lumaPredMode, pred, stride, width); > - // save prediction > - if (default0Save1Load2 == 1) > - 
{ > - primitives.luma_copy_pp[part](m_sharedPredTransformSkip[0], > width, pred, stride); > - } > - } > - else > - { > - primitives.luma_copy_pp[part](pred, stride, > m_sharedPredTransformSkip[0], width); > } > > //===== get residual signal ===== > @@ -491,12 +476,19 @@ > primitives.blockfill_s[size](resiTmp, stride, 0); > } > > + assert(width <= 32); > +#if NEW_CALCRECON > //===== reconstruction ===== > - assert(width <= 32); > + primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, > stride, MAX_CU_SIZE, reconIPredStride); > + //===== update distortion ===== > + outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride); > +#else > + ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]); > + //===== reconstruction ===== > primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, > stride, MAX_CU_SIZE, reconIPredStride); > - > //===== update distortion ===== > outDist += primitives.sse_pp[part](fenc, stride, recon, stride); > +#endif > } > > void TEncSearch::xIntraCodingChromaBlk(TComDataCU* cu, > @@ -534,7 +526,6 @@ > Pel* fenc = (chromaId > 0 ? fencYuv->getCrAddr(absPartIdx) > : fencYuv->getCbAddr(absPartIdx)); > Pel* pred = (chromaId > 0 ? predYuv->getCrAddr(absPartIdx) > : predYuv->getCbAddr(absPartIdx)); > int16_t* residual = (chromaId > 0 ? resiYuv->getCrAddr(absPartIdx) > : resiYuv->getCbAddr(absPartIdx)); > - Pel* recon = (chromaId > 0 ? 
predYuv->getCrAddr(absPartIdx) > : predYuv->getCbAddr(absPartIdx)); > > uint32_t qtlayer = > cu->getSlice()->getSPS()->getQuadtreeTULog2MaxSize() - trSizeLog2; > uint32_t numCoeffPerInc = (cu->getSlice()->getSPS()->getMaxCUWidth() * > cu->getSlice()->getSPS()->getMaxCUHeight() >> > (cu->getSlice()->getSPS()->getMaxCUDepth() << 1)) >> (m_hChromaShift + > m_vChromaShift); > @@ -561,19 +552,6 @@ > > //===== get prediction signal ===== > predIntraChromaAng(chromaPred, chromaPredMode, pred, stride, width, > height, chFmt); > - > - // save prediction > - if (default0Save1Load2 == 1) > - { > - Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId]; > - primitives.luma_copy_pp[part](predbuf, width, pred, stride); > - } > - } > - else > - { > - // load prediction > - Pel* predbuf = m_sharedPredTransformSkip[1 + chromaId]; > - primitives.luma_copy_pp[part](pred, stride, predbuf, width); > } > > //===== get residual signal ===== > @@ -627,12 +605,20 @@ > } > } > > + assert(((intptr_t)residual & (width - 1)) == 0); > + assert(width <= 32); > +#if NEW_CALCRECON > //===== reconstruction ===== > - assert(((uint32_t)(size_t)residual & (width - 1)) == 0); > - assert(width <= 32); > + primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, > stride, reconQtStride, reconIPredStride); > + //===== update distortion ===== > + uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, > stride); > +#else > + ALIGN_VAR_32(pixel, recon[MAX_CU_SIZE * MAX_CU_SIZE]); > + //===== reconstruction ===== > primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, > stride, reconQtStride, reconIPredStride); > //===== update distortion ===== > uint32_t dist = primitives.sse_pp[part](fenc, stride, recon, stride); > +#endif > if (ttype == TEXT_CHROMA_U) > { > outDist += m_rdCost->scaleChromaDistCb(dist); > diff -r 3cbde0b893e3 -r 7a61566806f6 source/Lib/TLibEncoder/TEncSearch.h > --- a/source/Lib/TLibEncoder/TEncSearch.h Mon Mar 03 13:37:35 2014 -0600 > +++ 
b/source/Lib/TLibEncoder/TEncSearch.h Tue Mar 04 19:35:39 2014 +0900 > @@ -84,7 +84,6 @@ > protected: > > ShortYuv* m_qtTempShortYuv; > - pixel* m_sharedPredTransformSkip[3]; > > TCoeff** m_qtTempCoeffY; > TCoeff** m_qtTempCoeffCb; > diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/pixel.cpp > --- a/source/common/pixel.cpp Mon Mar 03 13:37:35 2014 -0600 > +++ b/source/common/pixel.cpp Tue Mar 04 19:35:39 2014 +0900 > @@ -460,20 +460,33 @@ > } > > template<int blockSize> > -void calcRecons(pixel* pred, int16_t* residual, pixel* recon, int16_t* > recqt, pixel* recipred, int stride, int qtstride, int ipredstride) > +void calcRecons(pixel* pred, int16_t* residual, > +#if NEW_CALCRECON > + pixel*, > +#else > + pixel* recon, > +#endif > + int16_t* recqt, pixel* recipred, int stride, int qtstride, > int ipredstride) > { > for (int uiY = 0; uiY < blockSize; uiY++) > { > for (int uiX = 0; uiX < blockSize; uiX++) > { > +#if NEW_CALCRECON > + recqt[uiX] = (int16_t)ClipY(static_cast<int16_t>(pred[uiX]) + > residual[uiX]); > + recipred[uiX] = (pixel)recqt[uiX]; > +#else > recon[uiX] = (pixel)ClipY(static_cast<int16_t>(pred[uiX]) + > residual[uiX]); > recqt[uiX] = (int16_t)recon[uiX]; > recipred[uiX] = recon[uiX]; > +#endif > } > > pred += stride; > residual += stride; > +#if !NEW_CALCRECON > recon += stride; > +#endif > recqt += qtstride; > recipred += ipredstride; > } > diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/primitives.h > --- a/source/common/primitives.h Mon Mar 03 13:37:35 2014 -0600 > +++ b/source/common/primitives.h Tue Mar 04 19:35:39 2014 +0900 > @@ -34,6 +34,8 @@ > #include "cpu.h" > #include "x265.h" > > +#define NEW_CALCRECON 1 // TODO: remove recon[] arg > + > #define FENC_STRIDE 64 > > #define NUM_INTRA_MODE 35 > diff -r 3cbde0b893e3 -r 7a61566806f6 source/common/x86/pixel-util8.asm > --- a/source/common/x86/pixel-util8.asm Mon Mar 03 13:37:35 2014 -0600 > +++ b/source/common/x86/pixel-util8.asm Tue Mar 04 19:35:39 2014 +0900 > @@ -57,6 +57,7 @@ > 
cextern pw_2000 > cextern pw_pixel_max > > +%define NEW_CALCRECON 1 ; TODO: remove recon[] arg > > ;----------------------------------------------------------------------------- > ; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* > reconqt, pixel *reconipred, int stride, int strideqt, int strideipred) > > ;----------------------------------------------------------------------------- > @@ -101,7 +102,9 @@ > CLIPW m0, m4, m5 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movh [t2], m0 > +%endif > movh [t4], m0 > %if ARCH_X86_64 == 0 > add t4, t7 > @@ -113,7 +116,9 @@ > movhps [t4 + t7], m0 > lea t4, [t4 + t7 * 2] > %endif > +%if NEW_CALCRECON == 0 > movhps [t2 + t5], m0 > +%endif > > ; store recqt[] > movh [t3], m0 > @@ -123,7 +128,9 @@ > > lea t0, [t0 + t5 * 2] > lea t1, [t1 + t5 * 2] > +%if NEW_CALCRECON == 0 > lea t2, [t2 + t5 * 2] > +%endif > > dec t8d > jnz .loop > @@ -165,11 +172,15 @@ > packuswb m1, m1 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movd [t2], m1 > +%endif > movd [t4], m1 > add t4, t7 > pshufd m2, m1, 1 > +%if NEW_CALCRECON == 0 > movd [t2 + t5], m2 > +%endif > movd [t4], m2 > add t4, t7 > > @@ -182,7 +193,9 @@ > > lea t0, [t0 + t5 * 2] > lea t1, [t1 + t5 * 4] > +%if NEW_CALCRECON == 0 > lea t2, [t2 + t5 * 2] > +%endif > > dec t8d > jnz .loop > @@ -231,8 +244,10 @@ > CLIPW m1, m4, m5 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2], m0 > movu [t2 + t5], m1 > +%endif > movu [t4], m0 > %if ARCH_X86_64 == 0 > add t4, t7 > @@ -253,7 +268,9 @@ > > lea t0, [t0 + t5 * 2] > lea t1, [t1 + t5 * 2] > +%if NEW_CALCRECON == 0 > lea t2, [t2 + t5 * 2] > +%endif > > dec t8d > jnz .loop > @@ -295,8 +312,10 @@ > packuswb m1, m2 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movlps [t2], m1 > movhps [t2 + t5], m1 > +%endif > movlps [t4], m1 > %if ARCH_X86_64 == 0 > add t4, t7 > @@ -317,7 +336,9 @@ > > lea t0, [t0 + t5 * 2] > lea t1, [t1 + t5 * 4] > +%if NEW_CALCRECON == 
0 > lea t2, [t2 + t5 * 2] > +%endif > > dec t8d > jnz .loop > @@ -367,8 +388,10 @@ > CLIPW m1, m4, m5 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2], m0 > movu [t2 + 16], m1 > +%endif > movu [t4], m0 > movu [t4 + 16], m1 > %if ARCH_X86_64 == 0 > @@ -391,8 +414,10 @@ > CLIPW m1, m4, m5 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2 + t5], m0 > movu [t2 + t5 + 16], m1 > +%endif > %if ARCH_X86_64 == 0 > movu [t4], m0 > movu [t4 + 16], m1 > @@ -411,7 +436,9 @@ > > lea t0, [t0 + t5 * 2] > lea t1, [t1 + t5 * 2] > +%if NEW_CALCRECON == 0 > lea t2, [t2 + t5 * 2] > +%endif > > dec t8d > jnz .loop > @@ -451,7 +478,9 @@ > packuswb m1, m2 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2], m1 > +%endif > movu [t4], m1 > > ; store recqt[] > @@ -464,7 +493,9 @@ > add t4, t7 > add t0, t5 > lea t1, [t1 + t5 * 2] > +%if NEW_CALCRECON == 0 > add t2, t5 > +%endif > > dec t8d > jnz .loop > @@ -513,8 +544,10 @@ > CLIPW m1, m4, m5 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2], m0 > movu [t2 + 16], m1 > +%endif > movu [t4], m0 > movu [t4 + 16], m1 > > @@ -532,8 +565,10 @@ > CLIPW m1, m4, m5 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2 + 32], m0 > movu [t2 + 48], m1 > +%endif > movu [t4 + 32], m0 > movu [t4 + 48], m1 > %if ARCH_X86_64 == 0 > @@ -556,8 +591,10 @@ > CLIPW m1, m4, m5 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2 + t5], m0 > movu [t2 + t5 + 16], m1 > +%endif > %if ARCH_X86_64 == 0 > movu [t4], m0 > movu [t4 + 16], m1 > @@ -580,8 +617,10 @@ > CLIPW m1, m4, m5 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2 + t5 + 32], m0 > movu [t2 + t5 + 48], m1 > +%endif > %if ARCH_X86_64 == 0 > movu [t4 + 32], m0 > movu [t4 + 48], m1 > @@ -600,7 +639,9 @@ > > lea t0, [t0 + t5 * 2] > lea t1, [t1 + t5 * 2] > +%if NEW_CALCRECON == 0 > lea t2, [t2 + t5 * 2] > +%endif > > dec t8d > jnz .loop > @@ -648,8 +689,10 @@ > 
packuswb m3, m4 > > ; store recon[] and recipred[] > +%if NEW_CALCRECON == 0 > movu [t2], m1 > movu [t2 + 16], m3 > +%endif > movu [t4], m1 > movu [t4 + 16], m3 > > @@ -667,7 +710,9 @@ > add t4, t7 > add t0, t5 > lea t1, [t1 + t5 * 2] > +%if NEW_CALCRECON == 0 > add t2, t5 > +%endif > > dec t8d > jnz .loop > diff -r 3cbde0b893e3 -r 7a61566806f6 source/test/pixelharness.cpp > --- a/source/test/pixelharness.cpp Mon Mar 03 13:37:35 2014 -0600 > +++ b/source/test/pixelharness.cpp Tue Mar 04 19:35:39 2014 +0900 > @@ -351,10 +351,12 @@ > { > return false; > } > +#if !NEW_CALCRECON > if (memcmp(ref_reco, opt_reco, 64 * 64 * sizeof(pixel))) > { > return false; > } > +#endif > if (memcmp(ref_pred, opt_pred, 64 * 64 * sizeof(pixel))) > { > return false; > _______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel -- Steve Borho _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel