From: neil1123 <[email protected]>

Threaded ME can leave stale or invalid data in Slice::m_ctuMV. When the
main encoder later consumes that data, invalid ref indices or stale motion
vectors can make predInterSearch() skip the regular ME path and use bad
motion data instead.

Fix this by clearing m_ctuMV when FrameData is reused and initializing both
refs to REF_NOT_VALID. Validate the computed PU table index and L0/L1 ref
indices before consuming a threaded-ME slot; if the slot is missing, stale,
out of range, or does not describe a valid uni- or bi-pred result, fall
back to the regular ME path.

Also resize g_puStartIdx from [128][8] to
[2 * MAX_CU_SIZE + 1][NUM_PART_SIZES]. g_puLookup includes a 64x64 PU, so
width + height can be 128; the old table only had valid row indices
0..127. Tying the second dimension to NUM_PART_SIZES also avoids another
hard-coded partition-count assumption.

Finally, move setSourcePU() before selectMVP() in puMotionEstimation().
selectMVP() calls m_me.bufSAD(), which reads the cached source PU prepared
by setSourcePU(); calling setSourcePU() afterwards allowed MVP selection to
use stale source pixels from a previous PU.

Validation on Windows 11 / MSYS2 CLANG64 / clang 22.1.4:
- applies cleanly to x265 master 7b3d1f515318f73056abd9e99944e9f79db090bd with 
git am
- git diff --check passes
- checked/noasm CLI build passes
- 16-frame Threaded ME smoke passes with --threaded-me --pools 32 
--frame-threads 1 --no-wpp
---
 source/common/framedata.cpp   |  10 +++
 source/encoder/search.cpp     | 145 ++++++++++++++++++----------------
 source/encoder/threadedme.cpp |   2 +-
 source/encoder/threadedme.h   |   2 +-
 4 files changed, 91 insertions(+), 68 deletions(-)

diff --git a/source/common/framedata.cpp b/source/common/framedata.cpp
index aa317dfe8..611f40f9d 100644
--- a/source/common/framedata.cpp
+++ b/source/common/framedata.cpp
@@ -90,6 +90,16 @@ void FrameData::reinit(const SPS& sps)
 {
     memset(m_cuStat, 0, sps.numCUsInFrame * sizeof(*m_cuStat));
     memset(m_rowStat, 0, sps.numCuInHeight * sizeof(*m_rowStat));
+    if (m_param->bThreadedME)
+    {
+        uint32_t totalPUs = sps.numCuInWidth * sps.numCuInHeight * 
MAX_NUM_PUS_PER_CTU;
+        memset(m_slice->m_ctuMV, 0, totalPUs * sizeof(*m_slice->m_ctuMV));
+        for (uint32_t i = 0; i < totalPUs; i++)
+        {
+            m_slice->m_ctuMV[i].ref[0] = REF_NOT_VALID;
+            m_slice->m_ctuMV[i].ref[1] = REF_NOT_VALID;
+        }
+    }
     if (m_param->bDynamicRefine)
     {
         memset(m_picCTU->m_collectCURd, 0, MAX_NUM_DYN_REFINE * 
sps.numCUsInFrame * sizeof(uint64_t));
diff --git a/source/encoder/search.cpp b/source/encoder/search.cpp
index 2fcbbd411..e922097ad 100644
--- a/source/encoder/search.cpp
+++ b/source/encoder/search.cpp
@@ -278,6 +278,11 @@ void Search::puMotionEstimation(const Slice* slice, const 
CUGeom& cuGeom, CUData
                 int mvpIdx = 0;
 
                 bool bLowresMVP = false;
+                PicYuv* recon = slice->m_mref[list][ref].reconPic;
+                int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + 
pu.puAbsPartIdx) - recon->getLumaAddr(0);
+
+                m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, 
offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine);
+
                 if (!isMVP)
                 {
                     for(int dir = MD_LEFT; dir <= MD_ABOVE_LEFT ; dir++)
@@ -345,16 +350,12 @@ void Search::puMotionEstimation(const Slice* slice, const 
CUGeom& cuGeom, CUData
                     }
                 }
 
-                PicYuv* recon = slice->m_mref[list][ref].reconPic;
-                int offset = recon->getLumaAddr(cu.m_cuAddr, pu.cuAbsPartIdx + 
pu.puAbsPartIdx) - recon->getLumaAddr(0);
-
                 if (m_param->searchMethod == X265_SEA)
                 {
                     for (int planes = 0; planes < INTEGRAL_PLANE_NUM; planes++)
                         m_me.integral[planes] = 
slice->m_refFrameList[list][ref]->m_encData->m_meIntegral[planes] + offset;
                 }
 
-                m_me.setSourcePU(fencPic->m_picOrg[0], fencPic->m_stride, 
offset, pu.width, pu.height, m_param->searchMethod, m_param->subpelRefine);
                 setSearchRange(cu, mvp, searchRange, mvmin, mvmax);
 
                 if (isMVP)
@@ -2600,8 +2601,59 @@ void Search::predInterSearch(Mode& interMode, const 
CUGeom& cuGeom, bool bChroma
 
         getBlkBits((PartSize)cu.m_partSize[0], slice->isInterP(), puIdx, 
lastMode, m_listSelBits);
         bool bDoUnidir = true;
+        bool useThreadedME = false;
+        bool threadedBidir = false;
+        bool threadedUniL0 = false;
+        bool threadedUniL1 = false;
+        MEData threadedMEData;
 
         cu.getNeighbourMV(puIdx, pu.puAbsPartIdx, interMode.interNeighbours);
+        if (m_param->bThreadedME)
+        {
+            int cuSize = 1 << cu.m_log2CUSize[0];
+            int lookupWidth = pu.width;
+            int lookupHeight = pu.height;
+            bool isAmp = cu.m_partSize[0] >= SIZE_2NxnU;
+
+            if (isAmp)
+            {
+                if (cu.m_partSize[0] == SIZE_2NxnU || cu.m_partSize[0] == 
SIZE_2NxnD)
+                    lookupHeight = puIdx ? (pu.width - pu.height) : pu.height;
+                else
+                    lookupWidth = puIdx ? (pu.height - pu.width) : pu.width;
+            }
+
+            if (lookupWidth + lookupHeight <= 2 * MAX_CU_SIZE)
+            {
+                int startIdx = g_puStartIdx[lookupWidth + 
lookupHeight][static_cast<int>(cu.m_partSize[0])];
+                int alignWidth = isAmp ? cuSize : pu.width;
+                int alignHeight = isAmp ? cuSize : pu.height;
+                int numPUX = m_param->maxCUSize / alignWidth;
+                int numPUY = m_param->maxCUSize / alignHeight;
+                int puOffset = isAmp ? (puIdx * numPUX * numPUY) : 
(cu.m_partSize[0] == SIZE_2NxN ? (puIdx * numPUX) : puIdx);
+                int relX = (cu.m_cuPelX / alignWidth) % numPUX;
+                int relY = (cu.m_cuPelY / alignHeight) % numPUY;
+                int index = startIdx + (relY * numPUX + relX) + puOffset;
+
+                if (index >= 0 && index < MAX_NUM_PUS_PER_CTU)
+                {
+                    int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
+                    int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
+                    int slotIdx = (col % m_slice->m_sps->numCuInWidth) * 
m_slice->m_sps->numCuInHeight + row;
+
+                    threadedMEData = slice->m_ctuMV[slotIdx * 
MAX_NUM_PUS_PER_CTU + index];
+
+                    bool validL0 = threadedMEData.ref[0] >= 0 && 
threadedMEData.ref[0] < numRefIdx[0];
+                    bool validL1 = numPredDir > 1 && threadedMEData.ref[1] >= 
0 && threadedMEData.ref[1] < numRefIdx[1];
+
+                    threadedBidir = validL0 && validL1;
+                    threadedUniL0 = validL0 && threadedMEData.ref[1] == 
REF_NOT_VALID;
+                    threadedUniL1 = validL1 && threadedMEData.ref[0] == 
REF_NOT_VALID;
+                    useThreadedME = threadedBidir || threadedUniL0 || 
threadedUniL1;
+                }
+            }
+        }
+
         /* Uni-directional prediction */
         if ((m_param->analysisLoadReuseLevel > 1 && 
m_param->analysisLoadReuseLevel != 10)
             || (m_param->analysisMultiPassRefine && m_param->rc.bStatRead) || 
(m_param->bAnalysisType == AVC_INFO) || (useAsMVP))
@@ -2773,7 +2825,7 @@ void Search::predInterSearch(Mode& interMode, const 
CUGeom& cuGeom, bool bChroma
             /* if no peer threads were bonded, fall back to doing 
unidirectional
              * searches ourselves without overhead of singleMotionEstimation() 
*/
         }
-        if (bDoUnidir && !m_param->bThreadedME)
+        if (bDoUnidir && (!m_param->bThreadedME || !useThreadedME))
         {
             interMode.bestME[puIdx][0].ref = interMode.bestME[puIdx][1].ref = 
-1;
             uint32_t refMask = refMasks[puIdx] ? refMasks[puIdx] : 
(uint32_t)-1;
@@ -2885,7 +2937,7 @@ void Search::predInterSearch(Mode& interMode, const 
CUGeom& cuGeom, bool bChroma
 
         if (slice->isInterB() && !cu.isBipredRestriction() &&  /* biprediction 
is possible for this PU */
             cu.m_partSize[pu.puAbsPartIdx] != SIZE_2Nx2N &&    /* 2Nx2N 
biprediction is handled elsewhere */
-            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && 
!m_param->bThreadedME)
+            bestME[0].cost != MAX_UINT && bestME[1].cost != MAX_UINT && 
(!m_param->bThreadedME || !useThreadedME))
         {
             bidir[0] = bestME[0];
             bidir[1] = bestME[1];
@@ -2991,65 +3043,28 @@ void Search::predInterSearch(Mode& interMode, const 
CUGeom& cuGeom, bool bChroma
         bool uniL0 = false;
         bool uniL1 = false;
 
-        if (m_param->bThreadedME)
+        if (useThreadedME)
         {
-            int cuSize = 1 << cu.m_log2CUSize[0];
-
-            int lookupWidth = pu.width;
-            int lookupHeight = pu.height;
+            bestME[0].ref = threadedMEData.ref[0];
+            bestME[1].ref = threadedMEData.ref[1];
 
-            bool isAmp = cu.m_partSize[0] >= SIZE_2NxnU;
+            isBidir = threadedBidir;
+            uniL0 = threadedUniL0;
+            uniL1 = threadedUniL1;
 
-            if (isAmp)
-            {
-                if (cu.m_partSize[0] == SIZE_2NxnU || cu.m_partSize[0] == 
SIZE_2NxnD)
-                    lookupHeight = (puIdx) ? (pu.width - pu.height) : 
pu.height;
-                else
-                    lookupWidth = (puIdx) ? (pu.height - pu.width) : pu.width;
-            }
-
-            int startIdx = g_puStartIdx[lookupWidth + 
lookupHeight][static_cast<int>(cu.m_partSize[0])];
-
-            int alignWidth = isAmp ? cuSize : pu.width;
-            int alignHeight = isAmp ? cuSize : pu.height;
-
-            int numPUX = m_param->maxCUSize / alignWidth;
-            int numPUY = m_param->maxCUSize / alignHeight;
-
-            int puOffset = isAmp ? (puIdx * numPUX * numPUY) : 
(cu.m_partSize[0] == SIZE_2NxN ? (puIdx * numPUX) : puIdx);
- 
-            int relX = (cu.m_cuPelX / alignWidth) % numPUX;
-            int relY = (cu.m_cuPelY / alignHeight) % numPUY;
-
-            int index = startIdx + (relY * numPUX + relX) + puOffset;
-
-            int row = cu.m_cuAddr / m_slice->m_sps->numCuInWidth;
-            int col = cu.m_cuAddr % m_slice->m_sps->numCuInWidth;
-
-            int slotIdx = (col % m_slice->m_sps->numCuInWidth) * 
m_slice->m_sps->numCuInHeight + row;
-
-            MEData meData = slice->m_ctuMV[slotIdx * MAX_NUM_PUS_PER_CTU + 
index];
-
-            bestME[0].ref = meData.ref[0];
-            bestME[1].ref = meData.ref[1];
-
-            isBidir = (bestME[0].ref >= 0 && bestME[1].ref >= 0);
-            uniL0 = (bestME[0].ref >= 0 && bestME[1].ref == REF_NOT_VALID);
-            uniL1 = (bestME[1].ref >= 0 && bestME[0].ref == REF_NOT_VALID);
-
-            if(isBidir)
+            if (isBidir)
             {
                 cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, 
interMode.amvpCand[0][bestME[0].ref], mvc);
                 cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, 
interMode.amvpCand[1][bestME[1].ref], mvc);
 
-                bidir[0].mv = meData.mv[0];
-                bidir[1].mv = meData.mv[1];
+                bidir[0].mv = threadedMEData.mv[0];
+                bidir[1].mv = threadedMEData.mv[1];
                 bidir[0].mvp = interMode.amvpCand[0][bestME[0].ref][0];
                 bidir[1].mvp = interMode.amvpCand[1][bestME[1].ref][0];
-                bidir[0].mvCost = meData.mvCost[0];
-                bidir[1].mvCost = meData.mvCost[1];
-                bidirCost = meData.cost;
-                bidirBits = meData.bits;
+                bidir[0].mvCost = threadedMEData.mvCost[0];
+                bidir[1].mvCost = threadedMEData.mvCost[1];
+                bidirCost = threadedMEData.cost;
+                bidirBits = threadedMEData.bits;
 
                 bestCost = bidirCost;
             }
@@ -3057,11 +3072,11 @@ void Search::predInterSearch(Mode& interMode, const 
CUGeom& cuGeom, bool bChroma
             {
                 cu.getPMV(interMode.interNeighbours, 0, bestME[0].ref, 
interMode.amvpCand[0][bestME[0].ref], mvc);
 
-                bestME[0].mv = meData.mv[0];
+                bestME[0].mv = threadedMEData.mv[0];
                 bestME[0].mvp = interMode.amvpCand[0][bestME[0].ref][0];
-                bestME[0].mvCost = meData.mvCost[0];
-                bestME[0].cost = meData.cost;
-                bestME[0].bits = meData.bits;
+                bestME[0].mvCost = threadedMEData.mvCost[0];
+                bestME[0].cost = threadedMEData.cost;
+                bestME[0].bits = threadedMEData.bits;
 
                 bestCost = bestME[0].cost;
             }
@@ -3069,16 +3084,14 @@ void Search::predInterSearch(Mode& interMode, const 
CUGeom& cuGeom, bool bChroma
             {
                 cu.getPMV(interMode.interNeighbours, 1, bestME[1].ref, 
interMode.amvpCand[1][bestME[1].ref], mvc);
 
-                bestME[1].mv = meData.mv[1];
+                bestME[1].mv = threadedMEData.mv[1];
                 bestME[1].mvp = interMode.amvpCand[1][bestME[1].ref][0];
-                bestME[1].mvCost = meData.mvCost[1];
-                bestME[1].cost = meData.cost;
-                bestME[1].bits = meData.bits;
+                bestME[1].mvCost = threadedMEData.mvCost[1];
+                bestME[1].cost = threadedMEData.cost;
+                bestME[1].bits = threadedMEData.bits;
 
                 bestCost = bestME[1].cost;
             }
-            else
-                x265_log(NULL, X265_LOG_ERROR, "Invalid ME mode");
 
             if (mrgCost < bestCost)
                 isMerge = true;
diff --git a/source/encoder/threadedme.cpp b/source/encoder/threadedme.cpp
index 1028beb92..197f471de 100644
--- a/source/encoder/threadedme.cpp
+++ b/source/encoder/threadedme.cpp
@@ -30,7 +30,7 @@
 #include <sstream>
 
 namespace X265_NS {
-int g_puStartIdx[128][8] = {0};
+int g_puStartIdx[2 * MAX_CU_SIZE + 1][NUM_PART_SIZES] = {{0}};
 
 bool ThreadedME::create()
 {
diff --git a/source/encoder/threadedme.h b/source/encoder/threadedme.h
index 5e5fc4878..1adb545f5 100644
--- a/source/encoder/threadedme.h
+++ b/source/encoder/threadedme.h
@@ -40,7 +40,7 @@
 
 namespace X265_NS {
 
-extern int g_puStartIdx[128][8];
+extern int g_puStartIdx[2 * MAX_CU_SIZE + 1][NUM_PART_SIZES];
 
 class Encoder;
 class Analysis;
-- 
2.54.0.windows.1

_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to