# HG changeset patch # User Bhavna Hariharan <bha...@multicorewareinc.com> # Date 1520595579 -19800 # Fri Mar 09 17:09:39 2018 +0530 # Node ID e5425bd33176d6366f34d93e80f9cb1c9c4ebe6f # Parent d292dacb81d8607ce0b2fb106b7383b360863e9d dynamic-refine: enable switching between inter refinement levels 0-3 based on the content and the encoder properties.
The algorithm has 2 parts 1) Training - Encode frames with refine-inter 3 and calculate corresponding feature values until saturation of values. The training restarts when a scenecut is encountered. 2) Classification - Based on the prior probability calculated from the training data and the feature metric of the current CU an appropriate refine-inter level is chosen for the CU. diff -r d292dacb81d8 -r e5425bd33176 source/common/cudata.cpp --- a/source/common/cudata.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/cudata.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -317,6 +317,16 @@ m_cuAboveLeft = (m_cuLeft && m_cuAbove) ? m_encData->getPicCTU(m_cuAddr - widthInCU - 1) : NULL; m_cuAboveRight = (m_cuAbove && ((m_cuAddr % widthInCU) < (widthInCU - 1))) ? m_encData->getPicCTU(m_cuAddr - widthInCU + 1) : NULL; memset(m_distortion, 0, m_numPartitions * sizeof(sse_t)); + + if (m_encData->m_param->bDynamicRefine) + { + int size = m_encData->m_param->maxCUDepth * X265_REFINE_INTER_LEVELS; + CHECKED_MALLOC_ZERO(m_collectCURd, uint64_t, size); + CHECKED_MALLOC_ZERO(m_collectCUVariance, uint32_t, size); + CHECKED_MALLOC_ZERO(m_collectCUCount, uint32_t, size); + } +fail: + return; } // initialize Sub partition diff -r d292dacb81d8 -r e5425bd33176 source/common/cudata.h --- a/source/common/cudata.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/cudata.h Fri Mar 09 17:09:39 2018 +0530 @@ -224,6 +224,11 @@ uint64_t m_fAc_den[3]; uint64_t m_fDc_den[3]; + /* Feature values per CTU for dynamic refinement */ + uint64_t* m_collectCURd; + uint32_t* m_collectCUVariance; + uint32_t* m_collectCUCount; + CUData(); void initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance); diff -r d292dacb81d8 -r e5425bd33176 source/common/frame.cpp --- a/source/common/frame.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/frame.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -53,6 +53,7 @@ m_addOnDepth = NULL; m_addOnCtuInfo = NULL; m_addOnPrevChange = NULL; + 
m_classifyFrame = false; } bool Frame::create(x265_param *param, float* quantOffsets) @@ -85,6 +86,14 @@ m_analysis2Pass.analysisFramedata = NULL; } + if (param->bDynamicRefine) + { + int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS; + CHECKED_MALLOC_ZERO(m_classifyRd, uint64_t, size); + CHECKED_MALLOC_ZERO(m_classifyVariance, uint64_t, size); + CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size); + } + if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize)) { X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized"); @@ -226,4 +235,11 @@ } m_lowres.destroy(); X265_FREE(m_rcData); + + if (m_param->bDynamicRefine) + { + X265_FREE_ZERO(m_classifyRd); + X265_FREE_ZERO(m_classifyVariance); + X265_FREE_ZERO(m_classifyCount); + } } diff -r d292dacb81d8 -r e5425bd33176 source/common/frame.h --- a/source/common/frame.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/frame.h Fri Mar 09 17:09:39 2018 +0530 @@ -122,6 +122,14 @@ uint8_t** m_addOnDepth; uint8_t** m_addOnCtuInfo; int** m_addOnPrevChange; + + /* Average feature values of frames being considered for classification */ + uint64_t* m_classifyRd; + uint64_t* m_classifyVariance; + uint32_t* m_classifyCount; + + bool m_classifyFrame; + Frame(); bool create(x265_param *param, float* quantOffsets); diff -r d292dacb81d8 -r e5425bd33176 source/common/lowres.cpp --- a/source/common/lowres.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/lowres.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -59,10 +59,12 @@ CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes); CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes); CHECKED_MALLOC_ZERO(qpCuTreeOffset, double, cuCountFullRes); - CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes); if (qgSize == 8) CHECKED_MALLOC_ZERO(invQscaleFactor8x8, int, cuCount); } + + if (origPic->m_param->bDynamicRefine) + 
CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes); CHECKED_MALLOC(propagateCost, uint16_t, cuCount); /* allocate lowres buffers */ diff -r d292dacb81d8 -r e5425bd33176 source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/analysis.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -1184,7 +1184,7 @@ if (m_evaluateInter) { - if (m_param->interRefine == 2) + if (m_refineLevel == 2) { if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP) skipModes = true; @@ -1307,7 +1307,7 @@ md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); if (m_param->rdLevel) - skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) + skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth } if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bMVType && m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]))) @@ -1874,7 +1874,7 @@ if (m_evaluateInter) { - if (m_param->interRefine == 2) + if (m_refineLevel == 2) { if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP) skipModes = true; @@ -2004,7 +2004,7 @@ md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); - skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) && + skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) && md.bestMode && !md.bestMode->cu.getQtRootCbf(0); refMasks[0] = allSplitRefs; md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); @@ -2413,7 +2413,16 @@ bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; - int split = (m_param->interRefine && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 
1) && bDecidedDepth); + TrainingData td; + td.init(parentCTU, cuGeom); + + if (!m_param->bDynamicRefine) + m_refineLevel = m_param->interRefine; + else + m_refineLevel = m_frame->m_classifyFrame ? 0 : 3; + + int split = (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1) && bDecidedDepth); + td.split = split; if (bDecidedDepth) { @@ -2423,7 +2432,7 @@ md.bestMode = &mode; mode.cu.initSubCU(parentCTU, cuGeom, qp); PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]; - if (parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2) + if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2) { if (m_param->intraRefine == 4) compressIntraCU(parentCTU, cuGeom, qp); @@ -2439,7 +2448,7 @@ checkIntra(mode, cuGeom, size); } } - else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2) + else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2) { mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false); uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx); @@ -2501,7 +2510,7 @@ } motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)); } - if (!m_param->interRefine && parentCTU.isSkipped(cuGeom.absPartIdx)) + if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx)) encodeResAndCalcRdSkipCU(mode); else encodeResAndCalcRdInterCU(mode, cuGeom); @@ -2512,7 +2521,7 @@ checkDQP(mode, cuGeom); } - if (m_param->interRefine < 2) + if (m_refineLevel < 2) { if (m_bTryLossless) tryLossless(cuGeom); @@ -2540,7 +2549,10 @@ } } - if (m_param->interRefine > 1 || (m_param->interRefine && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0))) + if (m_param->bDynamicRefine) + classifyCU(parentCTU,cuGeom, *md.bestMode, td); + + if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0))) { m_evaluateInter = 
1; m_param->rdLevel > 4 ? compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp); @@ -2599,7 +2611,7 @@ else updateModeCost(*splitPred); - if (m_param->interRefine) + if (m_refineLevel) { if (m_param->rdLevel > 1) checkBestMode(*splitPred, cuGeom.depth); @@ -2613,6 +2625,83 @@ md.bestMode->cu.copyToPic(depth); md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx); } + if (m_param->bDynamicRefine && bDecidedDepth) + trainCU(parentCTU, cuGeom, *md.bestMode, td); +} + +void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData) +{ + uint32_t depth = cuGeom.depth; + trainData.cuVariance = calculateCUVariance(ctu, cuGeom); + if (m_frame->m_classifyFrame) + { + uint64_t diffRefine[X265_REFINE_INTER_LEVELS]; + uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS]; + float probRefine[X265_REFINE_INTER_LEVELS] = { 0 }; + uint8_t varRefineLevel = 0; + uint8_t rdRefineLevel = 0; + uint64_t cuCost = bestMode.rdCost; + + int offset = (depth * X265_REFINE_INTER_LEVELS) + 1; + if (cuCost < m_frame->m_classifyRd[offset]) + m_refineLevel = 1; + else + { + uint64_t trainingCount = 0; + for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++) + { + offset = (depth * X265_REFINE_INTER_LEVELS) + i; + trainingCount += m_frame->m_classifyCount[offset]; + } + for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++) + { + offset = (depth * X265_REFINE_INTER_LEVELS) + i; + /* Calculate distance values */ + diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset])); + diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset])); + + /* Calculate prior probability - ranges between 0 and 1 */ + if (trainingCount) + probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount); + + /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c) + P(c|x) is the posterior probability of class given predictor. 
+ P(c) is the prior probability of class. + P(x|c) is the likelihood which is the probability of predictor given class. + P(x) is the prior probability of predictor.*/ + if ((diffRefine[i] * probRefine[m_refineLevel]) < (diffRefine[m_refineLevel] * probRefine[i])) + varRefineLevel = i; + if ((diffRefineRd[i] * probRefine[m_refineLevel]) < (diffRefineRd[m_refineLevel] * probRefine[i])) + rdRefineLevel = i; + } + m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel); + } + } +} + +void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData) +{ + uint32_t depth = cuGeom.depth; + int classify = 0; + if (!m_frame->m_classifyFrame) + { + if (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] + && trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]) + classify = 0; + else if ((depth == m_param->maxCUDepth - 1) && trainData.split) + classify = 1; + else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx]) + classify = 2; + else + classify = 3; + } + else + classify = m_refineLevel; + uint64_t cuCost = bestMode.rdCost; + int offset = (depth * X265_REFINE_INTER_LEVELS) + classify; + ctu.m_collectCURd[offset] += cuCost; + ctu.m_collectCUVariance[offset] += trainData.cuVariance; + ctu.m_collectCUCount[offset]++; } /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ @@ -3414,6 +3503,33 @@ return false; } +uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom) +{ + uint32_t cuVariance = 0; + uint32_t *blockVariance = m_frame->m_lowres.blockVariance; + int loopIncr = (m_param->rc.qgSize == 8) ? 
8 : 16; + + uint32_t width = m_frame->m_fencPic->m_picWidth; + uint32_t height = m_frame->m_fencPic->m_picHeight; + uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]; + uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]; + uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr; + uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth; + uint32_t cnt = 0; + + for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr) + { + for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr) + { + uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr); + cuVariance += blockVariance[idx]; + cnt++; + } + } + + return cuVariance / cnt; +} + int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp) { FrameData& curEncData = *m_frame->m_encData; diff -r d292dacb81d8 -r e5425bd33176 source/encoder/analysis.h --- a/source/encoder/analysis.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/analysis.h Fri Mar 09 17:09:39 2018 +0530 @@ -142,8 +142,29 @@ uint8_t* m_multipassModes; uint8_t m_evaluateInter; + int32_t m_refineLevel; + uint8_t* m_additionalCtuInfo; int* m_prevCtuInfoChange; + + struct TrainingData + { + uint32_t cuVariance; + uint8_t predMode; + uint8_t partSize; + uint8_t mergeFlag; + int split; + + void init(const CUData& parentCTU, const CUGeom& cuGeom) + { + cuVariance = 0; + predMode = parentCTU.m_predMode[cuGeom.absPartIdx]; + partSize = parentCTU.m_partSize[cuGeom.absPartIdx]; + mergeFlag = parentCTU.m_mergeFlag[cuGeom.absPartIdx]; + split = 0; + } + }; + /* refine RD based on QP for rd-levels 5 and 6 */ void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp); @@ -182,6 +203,10 @@ void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom); int calculateQpforCuSize(const CUData& ctu, const CUGeom& 
cuGeom, int32_t complexCheck = 0, double baseQP = -1); + uint32_t calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom); + + void classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData); + void trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData); void calculateNormFactor(CUData& ctu, int qp); void normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype); diff -r d292dacb81d8 -r e5425bd33176 source/encoder/encoder.cpp --- a/source/encoder/encoder.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/encoder.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -96,6 +96,7 @@ #endif m_prevTonemapPayload.payload = NULL; + m_startPoint = 0; } inline char *strcatFilename(const char *input, const char *suffix) { @@ -412,6 +413,17 @@ if (m_bToneMap) m_numCimInfo = m_hdr10plus_api->hdr10plus_json_to_movie_cim(m_param->toneMapFile, m_cim); #endif + + if (m_param->bDynamicRefine) + { + int size = m_param->totalFrames * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS; + CHECKED_MALLOC_ZERO(m_variance, uint64_t, size); + CHECKED_MALLOC_ZERO(m_rdCost, uint64_t, size); + CHECKED_MALLOC_ZERO(m_trainingCount, uint32_t, size); + return; + fail: + m_aborted = true; + } } void Encoder::stopJobs() @@ -697,7 +709,13 @@ if (m_bToneMap) m_hdr10plus_api->hdr10plus_clear_movie(m_cim, m_numCimInfo); #endif - + + if (m_param->bDynamicRefine) + { + X265_FREE(m_variance); + X265_FREE(m_rdCost); + X265_FREE(m_trainingCount); + } if (m_exportedPic) { ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); diff -r d292dacb81d8 -r e5425bd33176 source/encoder/encoder.h --- a/source/encoder/encoder.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/encoder.h Fri Mar 09 17:09:39 2018 +0530 @@ -221,6 +221,13 @@ x265_sei_payload m_prevTonemapPayload; + /* Collect frame level feature data */ + uint64_t* m_rdCost; + uint64_t* m_variance; + uint32_t* m_trainingCount; + int32_t m_startPoint; + Lock 
m_dynamicRefineLock; + Encoder(); ~Encoder() { diff -r d292dacb81d8 -r e5425bd33176 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/frameencoder.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -736,6 +736,9 @@ m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames } + if (m_param->bDynamicRefine) + computeAvgTrainingData(); + /* Analyze CTU rows, most of the hard work is done here. Frame is * compressed in a wave-front pattern if WPP is enabled. Row based loop * filters runs behind the CTU compression and reconstruction */ @@ -1457,6 +1460,30 @@ // Does all the CU analysis, returns best top level mode decision Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder); + if (m_param->bDynamicRefine) + { + { + ScopedLock dynLock(m_top->m_dynamicRefineLock); + for (uint32_t i = 0; i < X265_REFINE_INTER_LEVELS; i++) + { + for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++) + { + int offset = (depth * X265_REFINE_INTER_LEVELS) + i; + int index = (m_frame->m_encodeOrder * X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset; + if (ctu->m_collectCUCount[offset]) + { + m_top->m_variance[index] += ctu->m_collectCUVariance[offset]; + m_top->m_rdCost[index] += ctu->m_collectCURd[offset]; + m_top->m_trainingCount[index] += ctu->m_collectCUCount[offset]; + } + } + } + } + X265_FREE_ZERO(ctu->m_collectCUVariance); + X265_FREE_ZERO(ctu->m_collectCURd); + X265_FREE_ZERO(ctu->m_collectCUCount); + } + // take a sample of the current active worker count ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount); ATOMIC_INC(&m_activeWorkerCountSamples); @@ -1839,6 +1866,58 @@ m_completionEvent.trigger(); } +void FrameEncoder::computeAvgTrainingData() +{ + if (m_frame->m_lowres.bScenecut) + m_top->m_startPoint = m_frame->m_encodeOrder; + + if (m_frame->m_encodeOrder - m_top->m_startPoint < 2 * m_param->frameNumThreads) + 
m_frame->m_classifyFrame = false; + else + m_frame->m_classifyFrame = true; + + int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS; + memset(m_frame->m_classifyRd, 0, size * sizeof(uint64_t)); + memset(m_frame->m_classifyVariance, 0, size * sizeof(uint64_t)); + memset(m_frame->m_classifyCount, 0, size * sizeof(uint32_t)); + + if (m_frame->m_classifyFrame) + { + uint32_t limit = m_frame->m_encodeOrder - m_param->frameNumThreads - 1; + for (uint32_t i = m_top->m_startPoint + 1; i < limit; i++) + { + for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++) + { + for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++) + { + int offset = (depth * X265_REFINE_INTER_LEVELS) + j; + int index = (i* X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset; + if (m_top->m_trainingCount[index]) + { + m_frame->m_classifyRd[offset] += m_top->m_rdCost[index] / m_top->m_trainingCount[index]; + m_frame->m_classifyVariance[offset] += m_top->m_variance[index] / m_top->m_trainingCount[index]; + m_frame->m_classifyCount[offset] += m_top->m_trainingCount[index]; + } + } + } + } + /* Calculates the average feature values of historic frames that are being considered for the current frame */ + int historyCount = m_frame->m_encodeOrder - m_param->frameNumThreads - m_top->m_startPoint - 1; + if (historyCount) + { + for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++) + { + for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++) + { + int offset = (depth * X265_REFINE_INTER_LEVELS) + j; + m_frame->m_classifyRd[offset] /= historyCount; + m_frame->m_classifyVariance[offset] /= historyCount; + } + } + } + } +} + /* collect statistics about CU coding decisions, return total QP */ int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log) { diff -r d292dacb81d8 -r e5425bd33176 source/encoder/frameencoder.h --- a/source/encoder/frameencoder.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/frameencoder.h Fri Mar 09 17:09:39 2018 +0530 @@ -230,6 +230,7 @@ 
void threadMain(); int collectCTUStatistics(const CUData& ctu, FrameStats* frameLog); void noiseReductionUpdate(); + void computeAvgTrainingData(); /* Called by WaveFront::findJob() */ virtual void processRow(int row, int threadId); diff -r d292dacb81d8 -r e5425bd33176 source/encoder/slicetype.cpp --- a/source/encoder/slicetype.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/slicetype.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -178,12 +178,12 @@ } } - /* Need variance data for weighted prediction */ + /* Need variance data for weighted prediction and dynamic refinement*/ if (param->bEnableWeightedPred || param->bEnableWeightedBiPred) - { + { for (blockY = 0; blockY < maxRow; blockY += loopIncr) - for (blockX = 0; blockX < maxCol; blockX += loopIncr) - acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize); + for (blockX = 0; blockX < maxCol; blockX += loopIncr) + acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize); } } else @@ -240,7 +240,7 @@ else { uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp,param->rc.qgSize); - qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8))); + qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8))); } if (param->bHDROpt) @@ -308,6 +308,17 @@ curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]); } } + + if (param->bDynamicRefine) + { + blockXY = 0; + for (blockY = 0; blockY < maxRow; blockY += loopIncr) + for (blockX = 0; blockX < maxCol; blockX += loopIncr) + { + curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize); + blockXY++; + } + } } void LookaheadTLD::lowresIntraEstimate(Lowres& fenc, uint32_t qgSize)
# HG changeset patch # User Bhavna Hariharan <bha...@multicorewareinc.com> # Date 1520595579 -19800 # Fri Mar 09 17:09:39 2018 +0530 # Node ID e5425bd33176d6366f34d93e80f9cb1c9c4ebe6f # Parent d292dacb81d8607ce0b2fb106b7383b360863e9d dynamic-refine: enable switching between inter refinement levels 0-3 based on the content and the encoder properties. The algorithm has 2 parts 1) Training - Encode frames with refine-inter 3 and calculate corresponding feature values until saturation of values. The training restarts when a scenecut is encountered. 2) Classification - Based on the prior probability calculated from the training data and the feature metric of the current CU an appropriate refine-inter level is chosen for the CU. diff -r d292dacb81d8 -r e5425bd33176 source/common/cudata.cpp --- a/source/common/cudata.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/cudata.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -317,6 +317,16 @@ m_cuAboveLeft = (m_cuLeft && m_cuAbove) ? m_encData->getPicCTU(m_cuAddr - widthInCU - 1) : NULL; m_cuAboveRight = (m_cuAbove && ((m_cuAddr % widthInCU) < (widthInCU - 1))) ? 
m_encData->getPicCTU(m_cuAddr - widthInCU + 1) : NULL; memset(m_distortion, 0, m_numPartitions * sizeof(sse_t)); + + if (m_encData->m_param->bDynamicRefine) + { + int size = m_encData->m_param->maxCUDepth * X265_REFINE_INTER_LEVELS; + CHECKED_MALLOC_ZERO(m_collectCURd, uint64_t, size); + CHECKED_MALLOC_ZERO(m_collectCUVariance, uint32_t, size); + CHECKED_MALLOC_ZERO(m_collectCUCount, uint32_t, size); + } +fail: + return; } // initialize Sub partition diff -r d292dacb81d8 -r e5425bd33176 source/common/cudata.h --- a/source/common/cudata.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/cudata.h Fri Mar 09 17:09:39 2018 +0530 @@ -224,6 +224,11 @@ uint64_t m_fAc_den[3]; uint64_t m_fDc_den[3]; + /* Feature values per CTU for dynamic refinement */ + uint64_t* m_collectCURd; + uint32_t* m_collectCUVariance; + uint32_t* m_collectCUCount; + CUData(); void initialize(const CUDataMemPool& dataPool, uint32_t depth, const x265_param& param, int instance); diff -r d292dacb81d8 -r e5425bd33176 source/common/frame.cpp --- a/source/common/frame.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/frame.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -53,6 +53,7 @@ m_addOnDepth = NULL; m_addOnCtuInfo = NULL; m_addOnPrevChange = NULL; + m_classifyFrame = false; } bool Frame::create(x265_param *param, float* quantOffsets) @@ -85,6 +86,14 @@ m_analysis2Pass.analysisFramedata = NULL; } + if (param->bDynamicRefine) + { + int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS; + CHECKED_MALLOC_ZERO(m_classifyRd, uint64_t, size); + CHECKED_MALLOC_ZERO(m_classifyVariance, uint64_t, size); + CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size); + } + if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(m_fencPic, param->bframes, !!param->rc.aqMode || !!param->bAQMotion, param->rc.qgSize)) { X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized"); @@ -226,4 +235,11 @@ } m_lowres.destroy(); X265_FREE(m_rcData); + + if (m_param->bDynamicRefine) + { + 
X265_FREE_ZERO(m_classifyRd); + X265_FREE_ZERO(m_classifyVariance); + X265_FREE_ZERO(m_classifyCount); + } } diff -r d292dacb81d8 -r e5425bd33176 source/common/frame.h --- a/source/common/frame.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/frame.h Fri Mar 09 17:09:39 2018 +0530 @@ -122,6 +122,14 @@ uint8_t** m_addOnDepth; uint8_t** m_addOnCtuInfo; int** m_addOnPrevChange; + + /* Average feature values of frames being considered for classification */ + uint64_t* m_classifyRd; + uint64_t* m_classifyVariance; + uint32_t* m_classifyCount; + + bool m_classifyFrame; + Frame(); bool create(x265_param *param, float* quantOffsets); diff -r d292dacb81d8 -r e5425bd33176 source/common/lowres.cpp --- a/source/common/lowres.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/common/lowres.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -59,10 +59,12 @@ CHECKED_MALLOC_ZERO(qpAqMotionOffset, double, cuCountFullRes); CHECKED_MALLOC_ZERO(invQscaleFactor, int, cuCountFullRes); CHECKED_MALLOC_ZERO(qpCuTreeOffset, double, cuCountFullRes); - CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes); if (qgSize == 8) CHECKED_MALLOC_ZERO(invQscaleFactor8x8, int, cuCount); } + + if (origPic->m_param->bDynamicRefine) + CHECKED_MALLOC_ZERO(blockVariance, uint32_t, cuCountFullRes); CHECKED_MALLOC(propagateCost, uint16_t, cuCount); /* allocate lowres buffers */ diff -r d292dacb81d8 -r e5425bd33176 source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/analysis.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -1184,7 +1184,7 @@ if (m_evaluateInter) { - if (m_param->interRefine == 2) + if (m_refineLevel == 2) { if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP) skipModes = true; @@ -1307,7 +1307,7 @@ md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); checkMerge2Nx2N_rd0_4(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); if (m_param->rdLevel) - skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) + skipModes = 
(m_param->bEnableEarlySkip || m_refineLevel == 2) && md.bestMode && md.bestMode->cu.isSkipped(0); // TODO: sa8d threshold per depth } if (md.bestMode && m_param->bEnableRecursionSkip && !bCtuInfoCheck && !(m_param->bMVType && m_param->analysisReuseLevel == 7 && (m_modeFlag[0] || m_modeFlag[1]))) @@ -1874,7 +1874,7 @@ if (m_evaluateInter) { - if (m_param->interRefine == 2) + if (m_refineLevel == 2) { if (parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP) skipModes = true; @@ -2004,7 +2004,7 @@ md.pred[PRED_SKIP].cu.initSubCU(parentCTU, cuGeom, qp); md.pred[PRED_MERGE].cu.initSubCU(parentCTU, cuGeom, qp); checkMerge2Nx2N_rd5_6(md.pred[PRED_SKIP], md.pred[PRED_MERGE], cuGeom); - skipModes = (m_param->bEnableEarlySkip || m_param->interRefine == 2) && + skipModes = (m_param->bEnableEarlySkip || m_refineLevel == 2) && md.bestMode && !md.bestMode->cu.getQtRootCbf(0); refMasks[0] = allSplitRefs; md.pred[PRED_2Nx2N].cu.initSubCU(parentCTU, cuGeom, qp); @@ -2413,7 +2413,16 @@ bool mightNotSplit = !(cuGeom.flags & CUGeom::SPLIT_MANDATORY); bool bDecidedDepth = parentCTU.m_cuDepth[cuGeom.absPartIdx] == depth; - int split = (m_param->interRefine && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1) && bDecidedDepth); + TrainingData td; + td.init(parentCTU, cuGeom); + + if (!m_param->bDynamicRefine) + m_refineLevel = m_param->interRefine; + else + m_refineLevel = m_frame->m_classifyFrame ? 
0 : 3; + + int split = (m_refineLevel && cuGeom.log2CUSize == (uint32_t)(g_log2Size[m_param->minCUSize] + 1) && bDecidedDepth); + td.split = split; if (bDecidedDepth) { @@ -2423,7 +2432,7 @@ md.bestMode = &mode; mode.cu.initSubCU(parentCTU, cuGeom, qp); PartSize size = (PartSize)parentCTU.m_partSize[cuGeom.absPartIdx]; - if (parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2) + if (parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2) { if (m_param->intraRefine == 4) compressIntraCU(parentCTU, cuGeom, qp); @@ -2439,7 +2448,7 @@ checkIntra(mode, cuGeom, size); } } - else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_param->interRefine < 2) + else if (!parentCTU.isIntra(cuGeom.absPartIdx) && m_refineLevel < 2) { mode.cu.copyFromPic(parentCTU, cuGeom, m_csp, false); uint32_t numPU = parentCTU.getNumPartInter(cuGeom.absPartIdx); @@ -2501,7 +2510,7 @@ } motionCompensation(mode.cu, pu, mode.predYuv, true, (m_csp != X265_CSP_I400 && m_frame->m_fencPic->m_picCsp != X265_CSP_I400)); } - if (!m_param->interRefine && parentCTU.isSkipped(cuGeom.absPartIdx)) + if (!m_param->interRefine && !m_param->bDynamicRefine && parentCTU.isSkipped(cuGeom.absPartIdx)) encodeResAndCalcRdSkipCU(mode); else encodeResAndCalcRdInterCU(mode, cuGeom); @@ -2512,7 +2521,7 @@ checkDQP(mode, cuGeom); } - if (m_param->interRefine < 2) + if (m_refineLevel < 2) { if (m_bTryLossless) tryLossless(cuGeom); @@ -2540,7 +2549,10 @@ } } - if (m_param->interRefine > 1 || (m_param->interRefine && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0))) + if (m_param->bDynamicRefine) + classifyCU(parentCTU,cuGeom, *md.bestMode, td); + + if (m_refineLevel > 1 || (m_refineLevel && parentCTU.m_predMode[cuGeom.absPartIdx] == MODE_SKIP && !mode.cu.isSkipped(0))) { m_evaluateInter = 1; m_param->rdLevel > 4 ? 
compressInterCU_rd5_6(parentCTU, cuGeom, qp) : compressInterCU_rd0_4(parentCTU, cuGeom, qp); @@ -2599,7 +2611,7 @@ else updateModeCost(*splitPred); - if (m_param->interRefine) + if (m_refineLevel) { if (m_param->rdLevel > 1) checkBestMode(*splitPred, cuGeom.depth); @@ -2613,6 +2625,83 @@ md.bestMode->cu.copyToPic(depth); md.bestMode->reconYuv.copyToPicYuv(*m_frame->m_reconPic, parentCTU.m_cuAddr, cuGeom.absPartIdx); } + if (m_param->bDynamicRefine && bDecidedDepth) + trainCU(parentCTU, cuGeom, *md.bestMode, td); +} + +void Analysis::classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData) +{ + uint32_t depth = cuGeom.depth; + trainData.cuVariance = calculateCUVariance(ctu, cuGeom); + if (m_frame->m_classifyFrame) + { + uint64_t diffRefine[X265_REFINE_INTER_LEVELS]; + uint64_t diffRefineRd[X265_REFINE_INTER_LEVELS]; + float probRefine[X265_REFINE_INTER_LEVELS] = { 0 }; + uint8_t varRefineLevel = 0; + uint8_t rdRefineLevel = 0; + uint64_t cuCost = bestMode.rdCost; + + int offset = (depth * X265_REFINE_INTER_LEVELS) + 1; + if (cuCost < m_frame->m_classifyRd[offset]) + m_refineLevel = 1; + else + { + uint64_t trainingCount = 0; + for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++) + { + offset = (depth * X265_REFINE_INTER_LEVELS) + i; + trainingCount += m_frame->m_classifyCount[offset]; + } + for (uint8_t i = 0; i < X265_REFINE_INTER_LEVELS; i++) + { + offset = (depth * X265_REFINE_INTER_LEVELS) + i; + /* Calculate distance values */ + diffRefine[i] = abs((int64_t)(trainData.cuVariance - m_frame->m_classifyVariance[offset])); + diffRefineRd[i] = abs((int64_t)(cuCost - m_frame->m_classifyRd[offset])); + + /* Calculate prior probability - ranges between 0 and 1 */ + if (trainingCount) + probRefine[i] = ((float)m_frame->m_classifyCount[offset] / (float)trainingCount); + + /* Bayesian classification - P(c|x)P(x) = P(x|c)P(c) + P(c|x) is the posterior probability of class given predictor. + P(c) is the prior probability of class. 
+ P(x|c) is the likelihood which is the probability of predictor given class. + P(x) is the prior probability of predictor.*/ + if ((diffRefine[i] * probRefine[m_refineLevel]) < (diffRefine[m_refineLevel] * probRefine[i])) + varRefineLevel = i; + if ((diffRefineRd[i] * probRefine[m_refineLevel]) < (diffRefineRd[m_refineLevel] * probRefine[i])) + rdRefineLevel = i; + } + m_refineLevel = X265_MAX(varRefineLevel, rdRefineLevel); + } + } +} + +void Analysis::trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData) +{ + uint32_t depth = cuGeom.depth; + int classify = 0; + if (!m_frame->m_classifyFrame) + { + if (trainData.predMode == ctu.m_predMode[cuGeom.absPartIdx] && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx] + && trainData.mergeFlag == ctu.m_mergeFlag[cuGeom.absPartIdx]) + classify = 0; + else if ((depth == m_param->maxCUDepth - 1) && trainData.split) + classify = 1; + else if (trainData.partSize == SIZE_2Nx2N && trainData.partSize == ctu.m_partSize[cuGeom.absPartIdx]) + classify = 2; + else + classify = 3; + } + else + classify = m_refineLevel; + uint64_t cuCost = bestMode.rdCost; + int offset = (depth * X265_REFINE_INTER_LEVELS) + classify; + ctu.m_collectCURd[offset] += cuCost; + ctu.m_collectCUVariance[offset] += trainData.cuVariance; + ctu.m_collectCUCount[offset]++; } /* sets md.bestMode if a valid merge candidate is found, else leaves it NULL */ @@ -3414,6 +3503,33 @@ return false; } +uint32_t Analysis::calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom) +{ + uint32_t cuVariance = 0; + uint32_t *blockVariance = m_frame->m_lowres.blockVariance; + int loopIncr = (m_param->rc.qgSize == 8) ? 
8 : 16; + + uint32_t width = m_frame->m_fencPic->m_picWidth; + uint32_t height = m_frame->m_fencPic->m_picHeight; + uint32_t block_x = ctu.m_cuPelX + g_zscanToPelX[cuGeom.absPartIdx]; + uint32_t block_y = ctu.m_cuPelY + g_zscanToPelY[cuGeom.absPartIdx]; + uint32_t maxCols = (m_frame->m_fencPic->m_picWidth + (loopIncr - 1)) / loopIncr; + uint32_t blockSize = m_param->maxCUSize >> cuGeom.depth; + uint32_t cnt = 0; + + for (uint32_t block_yy = block_y; block_yy < block_y + blockSize && block_yy < height; block_yy += loopIncr) + { + for (uint32_t block_xx = block_x; block_xx < block_x + blockSize && block_xx < width; block_xx += loopIncr) + { + uint32_t idx = ((block_yy / loopIncr) * (maxCols)) + (block_xx / loopIncr); + cuVariance += blockVariance[idx]; + cnt++; + } + } + + return cuVariance / cnt; +} + int Analysis::calculateQpforCuSize(const CUData& ctu, const CUGeom& cuGeom, int32_t complexCheck, double baseQp) { FrameData& curEncData = *m_frame->m_encData; diff -r d292dacb81d8 -r e5425bd33176 source/encoder/analysis.h --- a/source/encoder/analysis.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/analysis.h Fri Mar 09 17:09:39 2018 +0530 @@ -142,8 +142,29 @@ uint8_t* m_multipassModes; uint8_t m_evaluateInter; + int32_t m_refineLevel; + uint8_t* m_additionalCtuInfo; int* m_prevCtuInfoChange; + + struct TrainingData + { + uint32_t cuVariance; + uint8_t predMode; + uint8_t partSize; + uint8_t mergeFlag; + int split; + + void init(const CUData& parentCTU, const CUGeom& cuGeom) + { + cuVariance = 0; + predMode = parentCTU.m_predMode[cuGeom.absPartIdx]; + partSize = parentCTU.m_partSize[cuGeom.absPartIdx]; + mergeFlag = parentCTU.m_mergeFlag[cuGeom.absPartIdx]; + split = 0; + } + }; + /* refine RD based on QP for rd-levels 5 and 6 */ void qprdRefine(const CUData& parentCTU, const CUGeom& cuGeom, int32_t qp, int32_t lqp); @@ -182,6 +203,10 @@ void encodeResidue(const CUData& parentCTU, const CUGeom& cuGeom); int calculateQpforCuSize(const CUData& ctu, const CUGeom& 
cuGeom, int32_t complexCheck = 0, double baseQP = -1); + uint32_t calculateCUVariance(const CUData& ctu, const CUGeom& cuGeom); + + void classifyCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData); + void trainCU(const CUData& ctu, const CUGeom& cuGeom, const Mode& bestMode, TrainingData& trainData); void calculateNormFactor(CUData& ctu, int qp); void normFactor(const pixel* src, uint32_t blockSize, CUData& ctu, int qp, TextType ttype); diff -r d292dacb81d8 -r e5425bd33176 source/encoder/encoder.cpp --- a/source/encoder/encoder.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/encoder.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -96,6 +96,7 @@ #endif m_prevTonemapPayload.payload = NULL; + m_startPoint = 0; } inline char *strcatFilename(const char *input, const char *suffix) { @@ -412,6 +413,17 @@ if (m_bToneMap) m_numCimInfo = m_hdr10plus_api->hdr10plus_json_to_movie_cim(m_param->toneMapFile, m_cim); #endif + + if (m_param->bDynamicRefine) + { + int size = m_param->totalFrames * m_param->maxCUDepth * X265_REFINE_INTER_LEVELS; + CHECKED_MALLOC_ZERO(m_variance, uint64_t, size); + CHECKED_MALLOC_ZERO(m_rdCost, uint64_t, size); + CHECKED_MALLOC_ZERO(m_trainingCount, uint32_t, size); + return; + fail: + m_aborted = true; + } } void Encoder::stopJobs() @@ -697,7 +709,13 @@ if (m_bToneMap) m_hdr10plus_api->hdr10plus_clear_movie(m_cim, m_numCimInfo); #endif - + + if (m_param->bDynamicRefine) + { + X265_FREE(m_variance); + X265_FREE(m_rdCost); + X265_FREE(m_trainingCount); + } if (m_exportedPic) { ATOMIC_DEC(&m_exportedPic->m_countRefEncoders); diff -r d292dacb81d8 -r e5425bd33176 source/encoder/encoder.h --- a/source/encoder/encoder.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/encoder.h Fri Mar 09 17:09:39 2018 +0530 @@ -221,6 +221,13 @@ x265_sei_payload m_prevTonemapPayload; + /* Collect frame level feature data */ + uint64_t* m_rdCost; + uint64_t* m_variance; + uint32_t* m_trainingCount; + int32_t m_startPoint; + Lock 
m_dynamicRefineLock; + Encoder(); ~Encoder() { diff -r d292dacb81d8 -r e5425bd33176 source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/frameencoder.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -736,6 +736,9 @@ m_top->m_rateControl->m_startEndOrder.incr(); // faked rateControlEnd calls for negative frames } + if (m_param->bDynamicRefine) + computeAvgTrainingData(); + /* Analyze CTU rows, most of the hard work is done here. Frame is * compressed in a wave-front pattern if WPP is enabled. Row based loop * filters runs behind the CTU compression and reconstruction */ @@ -1457,6 +1460,30 @@ // Does all the CU analysis, returns best top level mode decision Mode& best = tld.analysis.compressCTU(*ctu, *m_frame, m_cuGeoms[m_ctuGeomMap[cuAddr]], rowCoder); + if (m_param->bDynamicRefine) + { + { + ScopedLock dynLock(m_top->m_dynamicRefineLock); + for (uint32_t i = 0; i < X265_REFINE_INTER_LEVELS; i++) + { + for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++) + { + int offset = (depth * X265_REFINE_INTER_LEVELS) + i; + int index = (m_frame->m_encodeOrder * X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset; + if (ctu->m_collectCUCount[offset]) + { + m_top->m_variance[index] += ctu->m_collectCUVariance[offset]; + m_top->m_rdCost[index] += ctu->m_collectCURd[offset]; + m_top->m_trainingCount[index] += ctu->m_collectCUCount[offset]; + } + } + } + } + X265_FREE_ZERO(ctu->m_collectCUVariance); + X265_FREE_ZERO(ctu->m_collectCURd); + X265_FREE_ZERO(ctu->m_collectCUCount); + } + // take a sample of the current active worker count ATOMIC_ADD(&m_totalActiveWorkerCount, m_activeWorkerCount); ATOMIC_INC(&m_activeWorkerCountSamples); @@ -1839,6 +1866,58 @@ m_completionEvent.trigger(); } +void FrameEncoder::computeAvgTrainingData() +{ + if (m_frame->m_lowres.bScenecut) + m_top->m_startPoint = m_frame->m_encodeOrder; + + if (m_frame->m_encodeOrder - m_top->m_startPoint < 2 * m_param->frameNumThreads) + 
m_frame->m_classifyFrame = false; + else + m_frame->m_classifyFrame = true; + + int size = m_param->maxCUDepth * X265_REFINE_INTER_LEVELS; + memset(m_frame->m_classifyRd, 0, size * sizeof(uint64_t)); + memset(m_frame->m_classifyVariance, 0, size * sizeof(uint64_t)); + memset(m_frame->m_classifyCount, 0, size * sizeof(uint32_t)); + + if (m_frame->m_classifyFrame) + { + uint32_t limit = m_frame->m_encodeOrder - m_param->frameNumThreads - 1; + for (uint32_t i = m_top->m_startPoint + 1; i < limit; i++) + { + for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++) + { + for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++) + { + int offset = (depth * X265_REFINE_INTER_LEVELS) + j; + int index = (i* X265_REFINE_INTER_LEVELS * m_param->maxCUDepth) + offset; + if (m_top->m_trainingCount[index]) + { + m_frame->m_classifyRd[offset] += m_top->m_rdCost[index] / m_top->m_trainingCount[index]; + m_frame->m_classifyVariance[offset] += m_top->m_variance[index] / m_top->m_trainingCount[index]; + m_frame->m_classifyCount[offset] += m_top->m_trainingCount[index]; + } + } + } + } + /* Calculates the average feature values of historic frames that are being considered for the current frame */ + int historyCount = m_frame->m_encodeOrder - m_param->frameNumThreads - m_top->m_startPoint - 1; + if (historyCount) + { + for (uint32_t j = 0; j < X265_REFINE_INTER_LEVELS; j++) + { + for (uint32_t depth = 0; depth < m_param->maxCUDepth; depth++) + { + int offset = (depth * X265_REFINE_INTER_LEVELS) + j; + m_frame->m_classifyRd[offset] /= historyCount; + m_frame->m_classifyVariance[offset] /= historyCount; + } + } + } + } +} + /* collect statistics about CU coding decisions, return total QP */ int FrameEncoder::collectCTUStatistics(const CUData& ctu, FrameStats* log) { diff -r d292dacb81d8 -r e5425bd33176 source/encoder/frameencoder.h --- a/source/encoder/frameencoder.h Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/frameencoder.h Fri Mar 09 17:09:39 2018 +0530 @@ -230,6 +230,7 @@ 
void threadMain(); int collectCTUStatistics(const CUData& ctu, FrameStats* frameLog); void noiseReductionUpdate(); + void computeAvgTrainingData(); /* Called by WaveFront::findJob() */ virtual void processRow(int row, int threadId); diff -r d292dacb81d8 -r e5425bd33176 source/encoder/slicetype.cpp --- a/source/encoder/slicetype.cpp Fri Mar 09 09:44:53 2018 +0530 +++ b/source/encoder/slicetype.cpp Fri Mar 09 17:09:39 2018 +0530 @@ -178,12 +178,12 @@ } } - /* Need variance data for weighted prediction */ + /* Need variance data for weighted prediction and dynamic refinement*/ if (param->bEnableWeightedPred || param->bEnableWeightedBiPred) - { + { for (blockY = 0; blockY < maxRow; blockY += loopIncr) - for (blockX = 0; blockX < maxCol; blockX += loopIncr) - acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize); + for (blockX = 0; blockX < maxCol; blockX += loopIncr) + acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize); } } else @@ -240,7 +240,7 @@ else { uint32_t energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp,param->rc.qgSize); - qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8))); + qp_adj = strength * (X265_LOG2(X265_MAX(energy, 1)) - (modeOneConst + 2 * (X265_DEPTH - 8))); } if (param->bHDROpt) @@ -308,6 +308,17 @@ curFrame->m_lowres.wp_ssd[i] = ssd - (sum * sum + (width[i] * height[i]) / 2) / (width[i] * height[i]); } } + + if (param->bDynamicRefine) + { + blockXY = 0; + for (blockY = 0; blockY < maxRow; blockY += loopIncr) + for (blockX = 0; blockX < maxCol; blockX += loopIncr) + { + curFrame->m_lowres.blockVariance[blockXY] = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize); + blockXY++; + } + } } void LookaheadTLD::lowresIntraEstimate(Lowres& fenc, uint32_t qgSize)
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel