Could you please send this as a patch file as well? Thanks, Kirithika ________________________________ From: x265-devel <[email protected]> on behalf of Neil <[email protected]> Sent: Wednesday, May 6, 2026 9:17 PM To: [email protected] <[email protected]> Subject: [x265] [PATCH 1/1] TME: fix threaded-me crashes from stale motion data
[Some people who received this message don't often get email from [email protected]. Learn why this is important at https://aka.ms/LearnAboutSenderIdentification ] From: neil1123 <[email protected]> Threaded ME can leave stale or invalid data in Slice::m_ctuMV. When the main encoder later consumes that data, invalid ref indices or stale motion vectors can make predInterSearch() skip the regular ME path and use bad motion data instead. Fix this by clearing m_ctuMV when FrameData is reused and initializing both refs to REF_NOT_VALID. Validate the computed PU table index and L0/L1 ref indices before consuming a threaded-ME slot; if the slot is missing, stale, out of range, or does not describe a valid uni- or bi-pred result, fall back to the regular ME path. Also resize g_puStartIdx from [128][8] to [2 * MAX_CU_SIZE + 1][NUM_PART_SIZES]. g_puLookup includes a 64x64 PU, so width + height can be 128; the old table only had valid row indices 0..127. Tying the second dimension to NUM_PART_SIZES also avoids another hard-coded partition-count assumption. Finally, move setSourcePU() before selectMVP() in puMotionEstimation(). selectMVP() calls m_me.bufSAD(), which reads the cached source PU prepared by setSourcePU(); calling setSourcePU() afterwards allowed MVP selection to use stale source pixels from a previous PU. Validation on Windows 11 / MSYS2 CLANG64 / clang 22.1.4: - applies cleanly to x265 master 7b3d1f515318f73056abd9e99944e9f79db090bd with git am - git diff --check passes - checked/noasm CLI build passes - 16-frame Threaded ME smoke passes with --threaded-me --pools 32 --frame-threads 1 --no-wpp --- source/common/framedata.cpp | 10 +++ source/encoder/search.cpp | 145 ++++++++++++++++++---------------- source/encoder/threadedme.cpp | 2 +- source/encoder/threadedme.h | 2 +- 4 files changed, 91 insertions(+), 68 deletions(-) diff --git a/source/common/framedata.cpp b/source/common/framedata.cpp index aa317dfe8..611f40f9d 100644 --- a/source/common/framedata.cpp +++ b/source/common/framedata.cpp @@ -90,6 +90,16 @@ void FrameData::reinit(const SPS& sps) { memset(m_cuStat, 0, sps.numCUsInFrame * sizeof(*m_cuStat)); memset(m_rowStat, 0, sps.numCuInHeight * sizeof(*m_rowStat)); + if (m_param->bThreadedME) + { + uint32_t totalPUs = sps.numCuInWidth * sps.numCuInHeight * MAX_NUM_PUS_PER_CTU; + memset(m_slice->m_ctuMV, 0, totalPUs * sizeof(*m_slice->m_ctuMV)); + for (uint32_t i = 0; i < totalPUs; i++) + { + m_slice->m_ctuMV[i].ref[0] = REF_NOT_VALID; + m_slice->m_ctuMV[i].ref[1] = REF_NOT_VALID; + } + } if (m_param->bDynamicRefine) { memset(m_picCTU->m_collectCURd, 0, MAX_NUM_DYN_REFINE * sps.numCUsInFrame * sizeof(uint64_t)); diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp index 2fcbbd411..e922097ad 100644 --- a/source/encoder/search.cpp +++ b/source/encoder/search.cpp @@ -278,6 +278,11 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData int mvpIdx = 0; bool bLowresMVP = false; + PicYuv* recon = slice->m_mref[list][ref].reconPic; + int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) - recon->getLumaAddr(0); + + m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine); + if (!isMVP) { for(int dir = MD_LEFT; dir <= MD_ABOVE_LEFT ; dir++) @@ -345,16 +350,12 @@ void Search::puMotionEstimation(const Slice* slice, const CUGeom& cuGeom, CUData } } - PicYuv* recon = slice->m_mref[list][ref].reconPic; - int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + pu.puAbsPartIdx) - recon->getLumaAddr(0); - if (m_param->searchMethod == X265_SEA) { for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++) m_me.integral[planes] = slice->m_refFrameList[list][ref]->m_encData->m_meIntegral[planes] + offset; } - m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine); setSearchRange(cu, mvp, searchRange, mvmin, mvmax); if (isMVP) @@ -2600,8 +2601,59 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, lastMode, m_listSelBits); bool bDoUnidir = true; + bool useThreadedME = false; + bool threadedBidir = false; + bool threadedUniL0 = false; + bool threadedUniL1 = false; + MEData threadedMEData; cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours); + if (m_param->bThreadedME) + { + int cuSize = 1 << cu.m_log2CUSize[0]; + int lookupWidth = pu.width; + int lookupHeight = pu.height; + bool isAmp = cu.m_partSize[0] >= SIZE_2NxnU; + + if (isAmp) + { + if (cu.m_partSize[0] == SIZE_2NxnU || cu.m_partSize[0] == SIZE_2NxnD) + lookupHeight = puIdx ? (pu.width - pu.height) : pu.height; + else + lookupWidth = puIdx ? (pu.height - pu.width) : pu.width; + } + + if (lookupWidth + lookupHeight <= 2 * MAX_CU_SIZE) + { + int startIdx = g_puStartIdx[lookupWidth + lookupHeight][static_cast<int>(cu.m_partSize[0])]; + int alignWidth = isAmp ? cuSize : pu.width; + int alignHeight = isAmp ? cuSize : pu.height; + int numPUX = m_param->maxCUSize / alignWidth; + int numPUY = m_param->maxCUSize / alignHeight; + int puOffset = isAmp ? (puIdx * numPUX * numPUY) : (cu.m_partSize[0] == SIZE_2NxN ? (puIdx * numPUX) : puIdx); + int relX = (cu.m_cuPelX / alignWidth) % numPUX; + int relY = (cu.m_cuPelY / alignHeight) % numPUY; + int index = startIdx + (relY * numPUX + relX) + puOffset; + + if (index >= 0 && index < MAX_NUM_PUS_PER_CTU) + { + int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth; + int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth; + int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row; + + threadedMEData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + index]; + + bool validL0 = threadedMEData.ref[0] >= 0 && threadedMEData.ref[0] < numRefIdx[0]; + bool validL1 = numPredDir > 1 && threadedMEData.ref[1] >= 0 && threadedMEData.ref[1] < numRefIdx[1]; + + threadedBidir = validL0 && validL1; + threadedUniL0 = validL0 && threadedMEData.ref[1] == REF_NOT_VALID; + threadedUniL1 = validL1 && threadedMEData.ref[0] == REF_NOT_VALID; + useThreadedME = threadedBidir || threadedUniL0 || threadedUniL1; + } + } + } + /* Uni-directional prediction */ if ((m_param->analysisLoadReuseLevel > 1 && m_param->analysisLoadReuseLevel != 10) || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || (m_param->bAnalysisType == AVC_INFO) || (useAsMVP)) @@ -2773,7 +2825,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma /* if no peer threads were bonded, fall back to doing unidirectional * searches ourselves without overhead of singleMotionEstimation() */ } - if (bDoUnidir && !m_param->bThreadedME) + if (bDoUnidir && (!m_param->bThreadedME || !useThreadedME)) { interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = -1; uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : (uint32_t)-1; @@ -2885,7 +2937,7 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma if (slice->isInterB() && !cu.isBipredRestriction() && /* biprediction is possible for this PU */ cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N && /* 2Nx2N biprediction is handled elsewhere */ - bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && !m_param->bThreadedME) + bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && (!m_param->bThreadedME || !useThreadedME)) { bidir[0] = bestME[0]; bidir[1] = bestME[1]; @@ -2991,65 +3043,28 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma bool uniL0 = false; bool uniL1 = false; - if (m_param->bThreadedME) + if (useThreadedME) { - int cuSize = 1 << cu.m_log2CUSize[0]; - - int lookupWidth = pu.width; - int lookupHeight = pu.height; + bestME[0].ref = threadedMEData.ref[0]; + bestME[1].ref = threadedMEData.ref[1]; - bool isAmp = cu.m_partSize[0] >= SIZE_2NxnU; + isBidir = threadedBidir; + uniL0 = threadedUniL0; + uniL1 = threadedUniL1; - if (isAmp) - { - if (cu.m_partSize[0] == SIZE_2NxnU || cu.m_partSize[0] == SIZE_2NxnD) - lookupHeight = (puIdx) ? (pu.width - pu.height) : pu.height; - else - lookupWidth = (puIdx) ? (pu.height - pu.width) : pu.width; - } - - int startIdx = g_puStartIdx[lookupWidth + lookupHeight][static_cast<int>(cu.m_partSize[0])]; - - int alignWidth = isAmp ? cuSize : pu.width; - int alignHeight = isAmp ? cuSize : pu.height; - - int numPUX = m_param->maxCUSize / alignWidth; - int numPUY = m_param->maxCUSize / alignHeight; - - int puOffset = isAmp ? (puIdx * numPUX * numPUY) : (cu.m_partSize[0] == SIZE_2NxN ? (puIdx * numPUX) : puIdx); - - int relX = (cu.m_cuPelX / alignWidth) % numPUX; - int relY = (cu.m_cuPelY / alignHeight) % numPUY; - - int index = startIdx + (relY * numPUX + relX) + puOffset; - - int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth; - int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth; - - int slotIdx = (col % m_slice->m_sps->numCuInWidth) * m_slice->m_sps->numCuInHeight + row; - - MEData meData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + index]; - - bestME[0].ref = meData.ref[0]; - bestME[1].ref = meData.ref[1]; - - isBidir = (bestME[0].ref >= 0 && bestME[1].ref >= 0); - uniL0 = (bestME[0].ref >= 0 && bestME[1].ref == REF_NOT_VALID); - uniL1 = (bestME[1].ref >= 0 && bestME[0].ref == REF_NOT_VALID); - - if(isBidir) + if (isBidir) { cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, interMode.amvpCand[0][bestME[0].ref], mvc); cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, interMode.amvpCand[1][bestME[1].ref], mvc); - bidir[0].mv = meData.mv[0]; - bidir[1].mv = meData.mv[1]; + bidir[0].mv = threadedMEData.mv[0]; + bidir[1].mv = threadedMEData.mv[1]; bidir[0].mvp = interMode.amvpCand[0][bestME[0].ref][0]; bidir[1].mvp = interMode.amvpCand[1][bestME[1].ref][0]; - bidir[0].mvCost = meData.mvCost[0]; - bidir[1].mvCost = meData.mvCost[1]; - bidirCost = meData.cost; - bidirBits = meData.bits; + bidir[0].mvCost = threadedMEData.mvCost[0]; + bidir[1].mvCost = threadedMEData.mvCost[1]; + bidirCost = threadedMEData.cost; + bidirBits = threadedMEData.bits; bestCost = bidirCost; } @@ -3057,11 +3072,11 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma { cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, interMode.amvpCand[0][bestME[0].ref], mvc); - bestME[0].mv = meData.mv[0]; + bestME[0].mv = threadedMEData.mv[0]; bestME[0].mvp = interMode.amvpCand[0][bestME[0].ref][0]; - bestME[0].mvCost = meData.mvCost[0]; - bestME[0].cost = meData.cost; - bestME[0].bits = meData.bits; + bestME[0].mvCost = threadedMEData.mvCost[0]; + bestME[0].cost = threadedMEData.cost; + bestME[0].bits = threadedMEData.bits; bestCost = bestME[0].cost; } @@ -3069,16 +3084,14 @@ void Search::predInterSearch(Mode& interMode, const CUGeom& cuGeom, bool bChroma { cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, interMode.amvpCand[1][bestME[1].ref], mvc); - bestME[1].mv = meData.mv[1]; + bestME[1].mv = threadedMEData.mv[1]; bestME[1].mvp = interMode.amvpCand[1][bestME[1].ref][0]; - bestME[1].mvCost = meData.mvCost[1]; - bestME[1].cost = meData.cost; - bestME[1].bits = meData.bits; + bestME[1].mvCost = threadedMEData.mvCost[1]; + bestME[1].cost = threadedMEData.cost; + bestME[1].bits = threadedMEData.bits; bestCost = bestME[1].cost; } - else - x265_log(NULL, X265_LOG_ERROR, "Invalid ME mode"); if (mrgCost < bestCost) isMerge = true; diff --git a/source/encoder/threadedme.cpp b/source/encoder/threadedme.cpp index 1028beb92..197f471de 100644 --- a/source/encoder/threadedme.cpp +++ b/source/encoder/threadedme.cpp @@ -30,7 +30,7 @@ #include <sstream> namespace X265_NS { -int g_puStartIdx[128][8] = {0}; +int g_puStartIdx[2 * MAX_CU_SIZE + 1][NUM_PART_SIZES] = {{0}}; bool ThreadedME::create() { diff --git a/source/encoder/threadedme.h b/source/encoder/threadedme.h index 5e5fc4878..1adb545f5 100644 --- a/source/encoder/threadedme.h +++ b/source/encoder/threadedme.h @@ -40,7 +40,7 @@ namespace X265_NS { -extern int g_puStartIdx[128][8]; +extern int g_puStartIdx[2 * MAX_CU_SIZE + 1][NUM_PART_SIZES]; class Encoder; class Analysis; -- 2.54.0.windows.1 _______________________________________________ x265-devel mailing list [email protected] https://ind01.safelinks.protection.outlook.com/?url=https%3A%2F%2Fmailman.videolan.org%2Flistinfo%2Fx265-devel&data=05%7C02%7Ckirithika%40multicorewareinc.com%7C4125244062c5451956c708deab86df8f%7Cffc5e88b3fa24d69a468344b6b766e7d%7C0%7C0%7C639136792994180708%7CUnknown%7CTWFpbGZsb3d8eyJFbXB0eU1hcGkiOnRydWUsIlYiOiIwLjAuMDAwMCIsIlAiOiJXaW4zMiIsIkFOIjoiTWFpbCIsIldUIjoyfQ%3D%3D%7C0%7C%7C%7C&sdata=CGwmnekFyEjZ8t1JsTaPNdPtWvV38hJvRypc9Lt7OvE%3D&reserved=0<https://mailman.videolan.org/listinfo/x265-devel> ⚠️ External Email Warning: This message originated from outside the organization (McW). Be cautious - Never share passwords or click unexpected links/attachments.
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
