[x265] [PATCH] cu-lossless: fix inter hash mistake
# HG changeset patch # User Deepthi Nandakumar deep...@multicorewareinc.com # Date 1409037983 -19800 # Tue Aug 26 12:56:23 2014 +0530 # Node ID 6573fd72294e481133f2d1636100d6c8419fb597 # Parent 9387a276897fc3ab11bbbe20d4f0d7831caf3115 cu-lossless: fix inter hash mistake The CU needs to be re-encoded if lossless is chosen as the best mode. diff -r 9387a276897f -r 6573fd72294e source/Lib/TLibEncoder/TEncSearch.cpp --- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Aug 25 17:07:45 2014 -0500 +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Aug 26 12:56:23 2014 +0530 @@ -2325,6 +2325,7 @@ } uint64_t bestCost = MAX_INT64; +uint32_t bestMode = 0; for (uint32_t modeId = 0; modeId < numModes; modeId++) { @@ -2392,6 +2393,7 @@ if (cu->getQtRootCbf(0)) xSetResidualQTData(cu, 0, outBestResiYuv, depth, true); +bestMode = modeId; //0 for lossless bestBits = bits; bestCost = cost; bestCoeffBits = cu->m_coeffBits; @@ -2401,6 +2403,19 @@ X265_CHECK(bestCost != MAX_INT64, "no best cost\n"); +if(bIsTQBypassEnable && !bestMode) +{ +cu->setCUTransquantBypassSubParts(true, 0, depth); +m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]); +uint64_t cost = 0; +uint32_t zeroDistortion = 0; +uint32_t bits = 0; +uint32_t distortion = 0; +xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, bits, distortion, zeroDistortion); +xSetResidualQTData(cu, 0, NULL, depth, false); +m_entropyCoder->store(m_rdEntropyCoders[depth][CI_TEMP_BEST]); +} + if (cu->getQtRootCbf(0)) outReconYuv->addClip(predYuv, outBestResiYuv, log2CUSize); else ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH 1 of 3] analysis: fix inter hash mistake with --cu-lossless
Thanks, Min. This is a solution, but it will affect performance with an extra TComDataCU*. I have sent another patch where I'm just re-encoding the CU if lossless is chosen as the best mode. This will not affect normal analysis. Can you review that? Deepthi On Tue, Aug 26, 2014 at 3:47 AM, Steve Borho st...@borho.org wrote: # HG changeset patch # User Min Chen chenm...@163.com # Date 1409002891 18000 # Mon Aug 25 16:41:31 2014 -0500 # Node ID 0bf2756898bc78e5660a6b607b2f3cda97834264 # Parent 5acfb12ec5d17cc700e313fc99248e2408e5967b analysis: fix inter hash mistake with --cu-lossless diff -r 5acfb12ec5d1 -r 0bf2756898bc source/Lib/TLibEncoder/TEncSearch.cpp --- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Aug 25 17:53:12 2014 +0900 +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Mon Aug 25 16:41:31 2014 -0500 @@ -2293,7 +2293,7 @@ * \returns void */ void TEncSearch::encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv, - ShortYuv* outBestResiYuv, TComYuv* outReconYuv) + ShortYuv* outBestResiYuv, TComYuv* outReconYuv, TComDataCU* tmpCu) { X265_CHECK(!cu->isIntra(0), "intra CU not expected\n"); @@ -2321,6 +2321,7 @@ } uint64_t bestCost = MAX_INT64; +bool bestTransquantBypassFlag = bIsTQBypassEnable; for (uint32_t modeId = 0; modeId < numModes; modeId++) { @@ -2388,15 +2389,29 @@ if (cu->getQtRootCbf(0)) xSetResidualQTData(cu, 0, outBestResiYuv, depth, true); +bestTransquantBypassFlag = bIsLosslessMode; bestBits = bits; bestCost = cost; bestCoeffBits = cu->m_coeffBits; m_entropyCoder->store(m_rdEntropyCoders[depth][CI_TEMP_BEST]); } + +// Save lossless mode coeff +if (bIsLosslessMode) +{ +tmpCu->copyPartFrom(cu, 0, depth, false); +} } X265_CHECK(bestCost != MAX_INT64, "no best cost\n"); +if (bestTransquantBypassFlag && !m_param->bLossless) +{ +assert(log2CUSize > 2); +cu->setCUTransquantBypassSubParts(true, 0, depth); +cu->copyPartFrom(tmpCu, 0, depth, false); +} + if (cu->getQtRootCbf(0)) outReconYuv->addClip(predYuv, outBestResiYuv, log2CUSize); else 
diff -r 5acfb12ec5d1 -r 0bf2756898bc source/Lib/TLibEncoder/TEncSearch.h --- a/source/Lib/TLibEncoder/TEncSearch.h Mon Aug 25 17:53:12 2014 +0900 +++ b/source/Lib/TLibEncoder/TEncSearch.h Mon Aug 25 16:41:31 2014 -0500 @@ -147,7 +147,7 @@ /// encode residual and compute rd-cost for inter mode void encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, ShortYuv* resiYuv, ShortYuv* bestResiYuv, - TComYuv* reconYuv); + TComYuv* reconYuv, TComDataCU* tmpCu); void encodeResAndCalcRdSkipCU(TComDataCU* cu, TComYuv* fencYuv, TComYuv* predYuv, TComYuv* reconYuv); void xRecurIntraCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t absPartIdx, TComYuv* fencYuv, diff -r 5acfb12ec5d1 -r 0bf2756898bc source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Mon Aug 25 17:53:12 2014 +0900 +++ b/source/encoder/analysis.cpp Mon Aug 25 16:41:31 2014 -0500 @@ -82,7 +82,7 @@ uint32_t sizeL = cuSize * cuSize; uint32_t sizeC = sizeL (CHROMA_H_SHIFT(csp) + CHROMA_V_SHIFT(csp)); -ok = m_memPool[i].initialize(numPartitions, sizeL, sizeC, 8, tqBypass); +ok = m_memPool[i].initialize(numPartitions, sizeL, sizeC, 9, tqBypass); m_interCU_2Nx2N[i] = new TComDataCU; m_interCU_2Nx2N[i]-create(m_memPool[i], numPartitions, cuSize, csp, 0, tqBypass); @@ -108,6 +108,9 @@ m_tempCU[i] = new TComDataCU; m_tempCU[i]-create(m_memPool[i], numPartitions, cuSize, csp, 7, tqBypass); +m_tempLosslessCU[i] = new TComDataCU; +m_tempLosslessCU[i]-create(m_memPool[i], numPartitions, cuSize, csp, 8, tqBypass); + m_bestPredYuv[i] = new TComYuv; ok = m_bestPredYuv[i]-create(cuSize, cuSize, csp); @@ -158,6 +161,7 @@ delete m_bestMergeCU[i]; delete m_bestCU[i]; delete m_tempCU[i]; +delete m_tempLosslessCU[i]; if (m_bestPredYuv m_bestPredYuv[i]) { @@ -240,6 +244,7 @@ // initialize CU data m_bestCU[0]-initCU(cu-m_pic, cu-getAddr()); m_tempCU[0]-initCU(cu-m_pic, cu-getAddr()); +m_tempLosslessCU[0]-initCU(cu-m_pic, cu-getAddr()); // analysis of CU uint32_t numPartition = cu-getTotalNumPart(); @@ 
-394,6 +399,7 @@ uint32_tnextDepth = depth + 1; TComDataCU* subBestPartCU = m_bestCU[nextDepth]; TComDataCU* subTempPartCU = m_tempCU[nextDepth]; +TComDataCU*
[x265] fix m_initSliceContext (uninitialised m_sliceQp)
# HG changeset patch # User Satoshi Nakagawa nakagawa...@oki.com # Date 1409041357 -32400 # Tue Aug 26 17:22:37 2014 +0900 # Node ID c18255467f12da1a780340ade55292c32d95bfdd # Parent 5acfb12ec5d17cc700e313fc99248e2408e5967b fix m_initSliceContext (uninitialised m_sliceQp) diff -r 5acfb12ec5d1 -r c18255467f12 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Aug 25 17:53:12 2014 +0900 +++ b/source/encoder/frameencoder.cpp Tue Aug 26 17:22:37 2014 +0900 @@ -158,8 +158,6 @@ int64_t startCompressTime = x265_mdate(); Slice* slice = m_frame->m_picSym->m_slice; -m_initSliceContext.resetEntropy(slice); - /* Emit access unit delimiter unless this is the first frame and the user is * not repeating headers (since AUD is supposed to be the first NAL in the access * unit) */ @@ -225,12 +223,15 @@ m_frameFilter.m_sao.m_refDepth = 2 + !IS_REFERENCED(slice); break; } -m_frameFilter.start(m_frame); // Clip slice QP to 0-51 spec range before encoding qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp); slice->m_sliceQp = qp; +m_initSliceContext.resetEntropy(slice); + +m_frameFilter.start(m_frame); + if (m_frame->m_lowres.bKeyframe) { if (m_param->bEmitHRDSEI) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] search: whitespace nits
# HG changeset patch # User Deepthi Nandakumar deep...@multicorewareinc.com # Date 1409038258 -19800 # Tue Aug 26 13:00:58 2014 +0530 # Node ID 00adc4fc9cdbd6c2f468a877c4323a8f0e8980f9 # Parent 6573fd72294e481133f2d1636100d6c8419fb597 search: whitespace nits diff -r 6573fd72294e -r 00adc4fc9cdb source/Lib/TLibEncoder/TEncSearch.cpp --- a/source/Lib/TLibEncoder/TEncSearch.cpp Tue Aug 26 12:56:23 2014 +0530 +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Aug 26 13:00:58 2014 +0530 @@ -2403,7 +2403,7 @@ X265_CHECK(bestCost != MAX_INT64, "no best cost\n"); -if(bIsTQBypassEnable && !bestMode) +if (bIsTQBypassEnable && !bestMode) { cu->setCUTransquantBypassSubParts(true, 0, depth); m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: optimize dct4
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1409046621 -19800 # Tue Aug 26 15:20:21 2014 +0530 # Node ID bbd5b3f269b095760d21877e94d67df8bd72f479 # Parent 5acfb12ec5d17cc700e313fc99248e2408e5967b asm: optimize dct4 diff -r 5acfb12ec5d1 -r bbd5b3f269b0 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Mon Aug 25 17:53:12 2014 +0900 +++ b/source/common/x86/dct8.asm Tue Aug 26 15:20:21 2014 +0530 @@ -30,6 +30,8 @@ SECTION_RODATA 32 +dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 + tab_dct4: times 4 dw 64, 64 times 4 dw 83, 36 times 4 dw 64, -64 @@ -118,16 +120,14 @@ movh m0, [r0 + 0 * r2] movh m1, [r0 + 1 * r2] punpcklqdq m0, m1 -pshufd m0, m0, 0xD8 -pshufhw m0, m0, 0xB1 +pshufb m0, [dct4_shuf] lea r0, [r0 + 2 * r2] movh m1, [r0] movh m2, [r0 + r2] punpcklqdq m1, m2 -pshufd m1, m1, 0xD8 -pshufhw m1, m1, 0xB1 +pshufb m1, [dct4_shuf] punpcklqdq m2, m0, m1 punpckhqdq m0, m1 @@ -140,8 +140,7 @@ paddd m3, m7 psrad m3, DCT_SHIFT packssdw m0, m3 -pshufd m0, m0, 0xD8 -pshufhw m0, m0, 0xB1 +pshufb m0, [dct4_shuf] pmaddwd m1, m6 paddd m1, m7 psrad m1, DCT_SHIFT @@ -149,9 +148,8 @@ paddd m2, m7 psrad m2, DCT_SHIFT packssdw m1, m2 -pshufd m1, m1, 0xD8 -pshufhw m1, m1, 0xB1 +pshufb m1, [dct4_shuf] punpcklqdq m2, m0, m1 punpckhqdq m0, m1 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] fix m_initSliceContext (uninitialised m_sliceQp)
Thanks, queued for default (does not apply on stable due to the SAO refactor). On Tue, Aug 26, 2014 at 1:55 PM, Satoshi Nakagawa nakagawa...@oki.com wrote: # HG changeset patch # User Satoshi Nakagawa nakagawa...@oki.com # Date 1409041357 -32400 # Tue Aug 26 17:22:37 2014 +0900 # Node ID c18255467f12da1a780340ade55292c32d95bfdd # Parent 5acfb12ec5d17cc700e313fc99248e2408e5967b fix m_initSliceContext (uninitialised m_sliceQp) diff -r 5acfb12ec5d1 -r c18255467f12 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Mon Aug 25 17:53:12 2014 +0900 +++ b/source/encoder/frameencoder.cpp Tue Aug 26 17:22:37 2014 +0900 @@ -158,8 +158,6 @@ int64_t startCompressTime = x265_mdate(); Slice* slice = m_frame->m_picSym->m_slice; -m_initSliceContext.resetEntropy(slice); - /* Emit access unit delimiter unless this is the first frame and the user is * not repeating headers (since AUD is supposed to be the first NAL in the access * unit) */ @@ -225,12 +223,15 @@ m_frameFilter.m_sao.m_refDepth = 2 + !IS_REFERENCED(slice); break; } -m_frameFilter.start(m_frame); // Clip slice QP to 0-51 spec range before encoding qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp); slice->m_sliceQp = qp; +m_initSliceContext.resetEntropy(slice); + +m_frameFilter.start(m_frame); + if (m_frame->m_lowres.bKeyframe) { if (m_param->bEmitHRDSEI) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) instructions with pshufb(latency 1)
Sorry, please ignore this patch; I forgot one more small modification. On Wed, Aug 27, 2014 at 10:27 AM, dnyanesh...@multicorewareinc.com wrote: # HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1409115349 -19800 # Wed Aug 27 10:25:49 2014 +0530 # Node ID f49ed93e3daff100903e5fd7aa1bd874b9e79caf # Parent 32891b95f6693a39afbdf7929e12e3e0c6e990d1 asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) instructions with pshufb(latency 1) diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 26 15:03:38 2014 -0500 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 27 10:25:49 2014 +0530 @@ -1375,7 +1375,7 @@ p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2; p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2; -p.dct[DCT_4x4] = x265_dct4_sse2; +p.dct[DCT_4x4] = x265_dct4_ssse3; p.idct[IDCT_4x4] = x265_idct4_sse2; p.idct[IDST_4x4] = x265_idst4_sse2; @@ -1545,7 +1545,7 @@ p.transpose[BLOCK_64x64] = x265_transpose64_sse2; p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; p.ssim_end_4 = x265_pixel_ssim_end4_sse2; -p.dct[DCT_4x4] = x265_dct4_sse2; +p.dct[DCT_4x4] = x265_dct4_ssse3; p.idct[IDCT_4x4] = x265_idct4_sse2; p.idct[IDST_4x4] = x265_idst4_sse2; p.planecopy_sp = x265_downShift_16_sse2; diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Tue Aug 26 15:03:38 2014 -0500 +++ b/source/common/x86/dct8.asm Wed Aug 27 10:25:49 2014 +0530 @@ -30,6 +30,8 @@ SECTION_RODATA 32 +dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 + tab_dct4: times 4 dw 64, 64 times 4 dw 83, 36 times 4 dw 64, -64 @@ -98,7 +100,7 @@ ;-- ;void dct4(int16_t *src, int32_t *dst, intptr_t stride) ;-- -INIT_XMM sse2 +INIT_XMM ssse3 cglobal dct4, 3, 4, 8 %if BIT_DEPTH == 10 %define DCT_SHIFT 3 @@ -112,22 +114,21 @@ add r2d, r2d lea r3, [tab_dct4] +mova m3, [dct4_shuf] mova m4, [r3 + 0 * 16] mova m5, [r3 + 1 * 16] mova m6, [r3 + 2 * 16] movh m0, 
[r0 + 0 * r2] movh m1, [r0 + 1 * r2] punpcklqdq m0, m1 -pshufd m0, m0, 0xD8 -pshufhw m0, m0, 0xB1 +pshufb m0, m3 lea r0, [r0 + 2 * r2] movh m1, [r0] movh m2, [r0 + r2] punpcklqdq m1, m2 -pshufd m1, m1, 0xD8 -pshufhw m1, m1, 0xB1 +pshufb m1, m3 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 @@ -140,8 +141,8 @@ paddd m3, m7 psrad m3, DCT_SHIFT packssdw m0, m3 -pshufd m0, m0, 0xD8 -pshufhw m0, m0, 0xB1 +mova m3, [dct4_shuf] +pshufb m0, m3 pmaddwd m1, m6 paddd m1, m7 psrad m1, DCT_SHIFT @@ -149,9 +150,8 @@ paddd m2, m7 psrad m2, DCT_SHIFT packssdw m1, m2 -pshufd m1, m1, 0xD8 -pshufhw m1, m1, 0xB1 +pshufb m1, m3 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/dct8.h --- a/source/common/x86/dct8.h Tue Aug 26 15:03:38 2014 -0500 +++ b/source/common/x86/dct8.h Wed Aug 27 10:25:49 2014 +0530 @@ -24,7 +24,7 @@ #ifndef X265_DCT8_H #define X265_DCT8_H -void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride); +void x265_dct4_ssse3(int16_t *src, int32_t *dst, intptr_t stride); void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride); void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride); void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) instructions with pshufb(latency 1)
# HG changeset patch # User Dnyaneshwar G dnyanesh...@multicorewareinc.com # Date 1409115913 -19800 # Wed Aug 27 10:35:13 2014 +0530 # Node ID 9e19a59e1de22bc39924365626c48fdb2557592e # Parent 32891b95f6693a39afbdf7929e12e3e0c6e990d1 asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) instructions with pshufb(latency 1) diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 26 15:03:38 2014 -0500 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 27 10:35:13 2014 +0530 @@ -1375,7 +1375,6 @@ p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2; p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2; -p.dct[DCT_4x4] = x265_dct4_sse2; p.idct[IDCT_4x4] = x265_idct4_sse2; p.idct[IDST_4x4] = x265_idst4_sse2; @@ -1388,6 +1387,7 @@ INTRA_ANG_SSSE3(ssse3); +p.dct[DCT_4x4] = x265_dct4_ssse3; p.dct[DST_4x4] = x265_dst4_ssse3; p.idct[IDCT_8x8] = x265_idct8_ssse3; p.count_nonzero = x265_count_nonzero_ssse3; @@ -1545,7 +1545,6 @@ p.transpose[BLOCK_64x64] = x265_transpose64_sse2; p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2; p.ssim_end_4 = x265_pixel_ssim_end4_sse2; -p.dct[DCT_4x4] = x265_dct4_sse2; p.idct[IDCT_4x4] = x265_idct4_sse2; p.idct[IDST_4x4] = x265_idst4_sse2; p.planecopy_sp = x265_downShift_16_sse2; @@ -1582,6 +1581,7 @@ p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3; p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s +p.dct[DCT_4x4] = x265_dct4_ssse3; p.dct[DST_4x4] = x265_dst4_ssse3; p.idct[IDCT_8x8] = x265_idct8_ssse3; p.count_nonzero = x265_count_nonzero_ssse3; diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asm Tue Aug 26 15:03:38 2014 -0500 +++ b/source/common/x86/dct8.asm Wed Aug 27 10:35:13 2014 +0530 @@ -30,6 +30,8 @@ SECTION_RODATA 32 +dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 + tab_dct4: times 4 dw 64, 64 times 4 dw 83, 36 times 4 dw 64, -64 @@ -98,7 
+100,7 @@ ;-- ;void dct4(int16_t *src, int32_t *dst, intptr_t stride) ;-- -INIT_XMM sse2 +INIT_XMM ssse3 cglobal dct4, 3, 4, 8 %if BIT_DEPTH == 10 %define DCT_SHIFT 3 @@ -112,22 +114,21 @@ add r2d, r2d lea r3, [tab_dct4] +mova m3, [dct4_shuf] mova m4, [r3 + 0 * 16] mova m5, [r3 + 1 * 16] mova m6, [r3 + 2 * 16] movh m0, [r0 + 0 * r2] movh m1, [r0 + 1 * r2] punpcklqdq m0, m1 -pshufd m0, m0, 0xD8 -pshufhw m0, m0, 0xB1 +pshufb m0, m3 lea r0, [r0 + 2 * r2] movh m1, [r0] movh m2, [r0 + r2] punpcklqdq m1, m2 -pshufd m1, m1, 0xD8 -pshufhw m1, m1, 0xB1 +pshufb m1, m3 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 @@ -140,8 +141,8 @@ paddd m3, m7 psrad m3, DCT_SHIFT packssdw m0, m3 -pshufd m0, m0, 0xD8 -pshufhw m0, m0, 0xB1 +mova m3, [dct4_shuf] +pshufb m0, m3 pmaddwd m1, m6 paddd m1, m7 psrad m1, DCT_SHIFT @@ -149,9 +150,8 @@ paddd m2, m7 psrad m2, DCT_SHIFT packssdw m1, m2 -pshufd m1, m1, 0xD8 -pshufhw m1, m1, 0xB1 +pshufb m1, m3 punpcklqdq m2, m0, m1 punpckhqdq m0, m1 diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/dct8.h --- a/source/common/x86/dct8.h Tue Aug 26 15:03:38 2014 -0500 +++ b/source/common/x86/dct8.h Wed Aug 27 10:35:13 2014 +0530 @@ -24,7 +24,7 @@ #ifndef X265_DCT8_H #define X265_DCT8_H -void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride); +void x265_dct4_ssse3(int16_t *src, int32_t *dst, intptr_t stride); void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride); void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride); void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel