The concept of your algorithm is to keep the best 4 candidates for the HPEL/QPEL
search. It may improve compression performance in some cases, but at 4x (or more)
the compute cost, so I suggest you make it a new search mode for your special case.
The code also has some problems that need improvement; my comments are inline below.
At 2016-07-28 01:17:43,[email protected] wrote:
># HG changeset patch
># User N Vijay Anand <[email protected]>
># Date 1469638855 -19800
>#      Wed Jul 27 22:30:55 2016 +0530
># Node ID 837738a747ead31f905421c3ff413e36f9022ab9
># Parent  5a0e139e29386ecebafc9c555aedcd3e0f61c70c
>Compression Gains improved by ~10%.
>PSNR values need to be reconfirmed.
>Subjective Quality same as x265.
>
>diff -r 5a0e139e2938 -r 837738a747ea source/encoder/motion.cpp
>--- a/source/encoder/motion.cpp        Fri Jul 22 13:13:42 2016 +0530
>+++ b/source/encoder/motion.cpp        Wed Jul 27 22:30:55 2016 +0530
>@@ -99,6 +99,27 @@
> 
> }
> 
>+inline void PushToBMVStack(MV  *bStack, MV & bv, int *bCostStack, int bcost)
>+{
>+    for (int i=0; i<4; i++)
[MC] code style mistake

>+    {
>+        if((bCostStack[i] == bcost) && (bv == bStack[i]))
>+            break;
>+        if((bCostStack[i] > bcost) && (bv != bStack[i]))
>+        {
>+            for (int j=3; j>i; j--)
>+            {
>+                bStack[j] = bStack[j-1];
>+                bCostStack[j] = bCostStack[j-1];
>+            }
[MC] This loop is equivalent to a memmove.

>+            bStack[i] = bv;
>+            bCostStack[i] = bcost;
>+            break;
>+        }
>+    }
>+    return;
>+}
>+
> MotionEstimate::MotionEstimate()
> {
>     ctuAddr = -1;
>@@ -223,6 +244,7 @@
>             bmv = tmv; \
>             bPointNr = point; \
>             bDistance = dist; \
>+            PushToBMVStack(bmvStack, tmv, bmvCostStack, cost); \
>         } \
>     } while (0)
> 
>@@ -232,6 +254,7 @@
>         int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride, 
> stride); \
>         cost += mvcost(MV(mx, my) << 2); \
>         COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
>+        PushToBMVStack(bmvStack, MV(mx,my), bmvCostStack, cost); \
>     } while (0)
> 
> #define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
>@@ -245,6 +268,9 @@
>         (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
>         (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
>         (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
>+        PushToBMVStack(bmvStack, bmv+MV(m0x,m0y), bmvCostStack, (costs)[0]); \
>+        PushToBMVStack(bmvStack, bmv+MV(m1x,m1y), bmvCostStack, (costs)[1]); \
>+        PushToBMVStack(bmvStack, bmv+MV(m2x,m2y), bmvCostStack, (costs)[2]); \
>     }
> 
> #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2, 
> d2, m3x, m3y, p3, d3) \
>@@ -260,9 +286,13 @@
>         (costs)[2] += mvcost(MV(m2x, m2y) << 2); \
>         (costs)[3] += mvcost(MV(m3x, m3y) << 2); \
>         COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0, 
> bDistance, d0); \
>+        PushToBMVStack(bmvStack, MV(m0x,m0y), bmvCostStack, (costs)[0]); \
>         COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1, 
> bDistance, d1); \
>+        PushToBMVStack(bmvStack, MV(m1x,m1y), bmvCostStack, (costs)[1]); \
>         COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2, 
> bDistance, d2); \
>+        PushToBMVStack(bmvStack, MV(m2x,m2y), bmvCostStack, (costs)[2]); \
>         COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3, 
> bDistance, d3); \
>+        PushToBMVStack(bmvStack, MV(m3x,m3y), bmvCostStack, (costs)[3]); \
>     }
> 
> #define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
>@@ -279,9 +309,13 @@
>         costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
>         costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
>         COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
>+        PushToBMVStack(bmvStack, omv+MV(m0x,m0y), bmvCostStack, (costs)[0]); \
>         COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
>+        PushToBMVStack(bmvStack, omv+MV(m1x,m1y), bmvCostStack, (costs)[1]); \
>         COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
>+        PushToBMVStack(bmvStack, omv+MV(m2x,m2y), bmvCostStack, (costs)[2]); \
>         COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
>+        PushToBMVStack(bmvStack, omv+MV(m3x,m3y), bmvCostStack, (costs)[3]); \
>     }
> 
> #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
>@@ -297,6 +331,10 @@
>         (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
>         (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
>         (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
>+        PushToBMVStack(bmvStack, bmv+MV(m0x,m0y), bmvCostStack, (costs)[0]); \
>+        PushToBMVStack(bmvStack, bmv+MV(m1x,m1y), bmvCostStack, (costs)[1]); \
>+        PushToBMVStack(bmvStack, bmv+MV(m2x,m2y), bmvCostStack, (costs)[2]); \
>+        PushToBMVStack(bmvStack, bmv+MV(m3x,m3y), bmvCostStack, (costs)[3]); \
>     }
> 
> #define DIA1_ITER(mx, my) \
>@@ -336,6 +374,8 @@
>                                        const MV &       mvmax,
>                                        MV &             bmv,
>                                        int &            bcost,
>+                                       MV              *bmvStack,
>+                                       int             *bmvCostStack,
>                                        int &            bPointNr,
>                                        int &            bDistance,
>                                        int              earlyExitIters,
>@@ -614,6 +654,8 @@
> 
>     /* re-measure full pel rounded MVP with SAD as search start point */
>     MV bmv = pmv.roundToFPel();
>+    MV bmvStack[4] = {bmv, bmv, bmv, bmv};
[MC] This initialization is unnecessary, since the costs are set to the maximum SAD value below.

>+    int bmvCostStack[4] = {0x7fff, 0x7fff, 0x7fff, 0x7fff};
[MC] We can't assume that the SAD fits in 16 bits.
>     int bcost = bprecost;
>     if (pmv.isSubpel())
>         bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride) 
> + mvcost(bmv << 2);
>@@ -755,6 +797,8 @@
>         /* refine predictors */
>         omv = bmv;
>         ucost1 = bcost;
>+        bmvStack[0] = bmv;
>+        bmvCostStack[0] = bcost;
>         DIA1_ITER(pmv.x, pmv.y);
>         if (pmv.notZero())
>             DIA1_ITER(0, 0);
>@@ -878,7 +922,12 @@
>            stride, costs + 4 * k); \
>     fref_base += 2 * dy;
> #define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] + 
> p_cost_omvy[y * 4 * i]
>-#define MIN_MV(k, x, y)     COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 
>15))
>+#define MIN_MV(k, x, y)  \
>+    do \
>+    {  \
>+        COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15)); \
>+        PushToBMVStack(bmvStack, bmv+MV(x*i,y*i), bmvCostStack, costs[k]); \
>+    } while (0)
> 
>                 SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
>                 SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
>@@ -916,6 +965,7 @@
>                 MIN_MV(13, 4, 2);
>                 MIN_MV(14, -2, 3);
>                 MIN_MV(15, 2, 3);
>+
> #undef SADS
> #undef ADD_MVCOST
> #undef MIN_MV
>@@ -938,7 +988,8 @@
>         int bDistance = 0;
> 
>         const int EarlyExitIters = 3;
>-        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance, 
>EarlyExitIters, merange);
>+        StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bmvStack, 
>bmvCostStack, bPointNr, bDistance, EarlyExitIters, merange);
>+
[MC] I also suggest using a struct to package both your MV and cost, to reduce
memory access cost; x64 passes only up to 6 parameters in registers.


>         if (bDistance == 1) >         { >             // if best distance was 
> only 1, check two missing points.  If no new point is found, stop >@@ -989,15 
> +1040,19 @@ >                                stride, costs); >                
>          costs[0] += mvcost(tmv << 2); >                         
> COPY2_IF_LT(bcost, costs[0], bmv, tmv); >+                        
> PushToBMVStack(bmvStack, tmv, bmvCostStack, (costs)[0]); >                    
>      tmv.x += RasterDistance; >                         costs[1] += 
> mvcost(tmv << 2); >                         COPY2_IF_LT(bcost, costs[1], bmv, 
> tmv); >+                        PushToBMVStack(bmvStack, tmv, bmvCostStack, 
> (costs)[1]); >                         tmv.x += RasterDistance; >             
>             costs[2] += mvcost(tmv << 2); >                         
> COPY2_IF_LT(bcost, costs[2], bmv, tmv); >+                        
> PushToBMVStack(bmvStack, tmv, bmvCostStack, (costs)[2]); >                    
>      tmv.x += RasterDistance; >                         costs[3] += 
> mvcost(tmv << 3); >                         COPY2_IF_LT(bcost, costs[3], bmv, 
> tmv); >+                        PushToBMVStack(bmvStack, tmv, bmvCostStack, 
> (costs)[3]); >                     } >                     else >             
>             COST_MV(tmv.x, tmv.y); >@@ -1011,7 +1066,7 @@ >             
> bDistance = 0; >             bPointNr = 0; >             const int MaxIters = 
> 32; >-            StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, 
> bDistance, MaxIters, merange); >+            StarPatternSearch(ref, mvmin, 
> mvmax, bmv, bcost, bmvStack, bmvCostStack, bPointNr, bDistance, MaxIters, 
> merange); >  >             if (bDistance == 1) >             { >@@ -1061,15 
> +1116,19 @@ >                            stride, costs); >                    
>  costs[0] += mvcost(tmv << 2); >                     COPY2_IF_LT(bcost, 
> costs[0], bmv, tmv); >+                    PushToBMVStack(bmvStack, tmv, 
> bmvCostStack, (costs)[0]); >                     tmv.x++; >                   
>   costs[1] += mvcost(tmv << 2); >                     COPY2_IF_LT(bcost, 
> costs[1], bmv, tmv); >+                    PushToBMVStack(bmvStack, tmv, 
> bmvCostStack, (costs)[1]); >                     tmv.x++; >                   
>   costs[2] += mvcost(tmv << 2); >                     COPY2_IF_LT(bcost, 
> costs[2], bmv, tmv); >+                    PushToBMVStack(bmvStack, tmv, 
> bmvCostStack, (costs)[2]); >                     tmv.x++; >                   
>   costs[3] += mvcost(tmv << 2); >                     COPY2_IF_LT(bcost, 
> costs[3], bmv, tmv); >+                    PushToBMVStack(bmvStack, tmv, 
> bmvCostStack, (costs)[3]); >                 } >                 else >       
>               COST_MV(tmv.x, tmv.y); >@@ -1090,7 +1149,13 @@ >         bcost 
> = bprecost; >     } >     else >+    { >         bmv = bmv.toQPel(); // 
> promote search bmv to qpel >+        for (int i=0; i<4; i++) >+        { >+   
>        bmvStack[i] = bmvStack[i].toQPel(); >+        } >+    } >  >     const 
> SubpelWorkload& wl = workload[this->subpelRefine]; >  >@@ -1103,72 +1168,111 
> @@ >     else if (ref->isLowres) >     { >         int bdir = 0; >-        
> for (int i = 1; i <= wl.hpel_dirs; i++) >+        for (int nBmv=0; nBmv<4; 
> nBmv++) >         { >-            MV qmv = bmv + square1[i] * 2; >-           
>  int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); 
> >-            COPY2_IF_LT(bcost, cost, bdir, i); >+          bdir = 0; >+     
>      bmv =  bmvStack[nBmv]; >+          bcost = bmvCostStack[nBmv]; >+ >+     
>      for (int i = 1; i <= wl.hpel_dirs; i++) >+          { >+              MV 
> qmv = bmv + square1[i] * 2; >+              int cost = 
> ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); >+            
>   COPY2_IF_LT(bcost, cost, bdir, i); >+          } >+ >+          bmv += 
> square1[bdir] * 2; >+          bcost = ref->lowresQPelCost(fenc, blockOffset, 
> bmv, satd) + mvcost(bmv); >+ >+          bdir = 0; >+          for (int i = 
> 1; i <= wl.qpel_dirs; i++) >+          { >+              MV qmv = bmv + 
> square1[i]; >+              int cost = ref->lowresQPelCost(fenc, blockOffset, 
> qmv, satd) + mvcost(qmv); >+              COPY2_IF_LT(bcost, cost, bdir, i); 
> >+          } >+ >+          bmv += square1[bdir]; >+          bmvStack[nBmv] 
> = bmv; >+          bmvCostStack[nBmv] = bcost; >         } >  >-        bmv 
> += square1[bdir] * 2; >-        bcost = ref->lowresQPelCost(fenc, 
> blockOffset, bmv, satd) + mvcost(bmv); >- >-        bdir = 0; >-        for 
> (int i = 1; i <= wl.qpel_dirs; i++) >+        bmv = bmvStack[0]; >+        
> bcost = bmvCostStack[0]; >+        for (int i=1; i<4; i++) >         { >-     
>        MV qmv = bmv + square1[i]; >-            int cost = 
> ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv); >-           
>  COPY2_IF_LT(bcost, cost, bdir, i); >+          if (bmvCostStack[i]<bcost) >+ 
>          { >+            bmv = bmvStack[i]; >+            bcost = 
> bmvCostStack[i]; >+          } >         } >- >-        bmv += square1[bdir]; 
> >     } >     else >     { >         pixelcmp_t hpelcomp; >  >-        if 
> (wl.hpel_satd) >-        { >-            bcost = subpelCompare(ref, bmv, 
> satd) + mvcost(bmv); >-            hpelcomp = satd; >-        } >-        
> else >-            hpelcomp = sad; >- >-        for (int iter = 0; iter < 
> wl.hpel_iters; iter++) >+        for (int nBmv=0; nBmv<4; nBmv++) >         { 
> >-            int bdir = 0; >-            for (int i = 1; i <= wl.hpel_dirs; 
> i++) >-            { >-                MV qmv = bmv + square1[i] * 2; >-      
>           int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); >-      
>           COPY2_IF_LT(bcost, cost, bdir, i); >-            } >+          bmv 
> =  bmvStack[nBmv]; >  >-            if (bdir) >-                bmv += 
> square1[bdir] * 2; >-            else >-                break; >+          if 
> (wl.hpel_satd) >+          { >+              bcost = subpelCompare(ref, bmv, 
> satd) + mvcost(bmv); >+              hpelcomp = satd; >+          } >+        
>   else >+              hpelcomp = sad; >+ >+          for (int iter = 0; iter 
> < wl.hpel_iters; iter++) >+          { >+              int bdir = 0; >+       
>        for (int i = 1; i <= wl.hpel_dirs; i++) >+              { >+           
>        MV qmv = bmv + square1[i] * 2; >+                  int cost = 
> subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); >+                  
> COPY2_IF_LT(bcost, cost, bdir, i); >+              } >+ >+              if 
> (bdir) >+                  bmv += square1[bdir] * 2; >+              else >+  
>                 break; >+          } >+ >+          /* if HPEL search used 
> SAD, remeasure with SATD before QPEL */ >+          if (!wl.hpel_satd) >+     
>          bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); >+ >+          
> for (int iter = 0; iter < wl.qpel_iters; iter++) >+          { >+             
>  int bdir = 0; >+              for (int i = 1; i <= wl.qpel_dirs; i++) >+     
>          { >+                  MV qmv = bmv + square1[i]; >+                  
> int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); >+                  
> COPY2_IF_LT(bcost, cost, bdir, i); >+              } >+ >+              if 
> (bdir) >+                  bmv += square1[bdir]; >+              else >+      
>             break; >+          } >+ >+          bmvStack[nBmv] = bmv; >+      
>     bmvCostStack[nBmv] = bcost; >         } >  >-        /* if HPEL search 
> used SAD, remeasure with SATD before QPEL */ >-        if (!wl.hpel_satd) >-  
>           bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); >- >-        
> for (int iter = 0; iter < wl.qpel_iters; iter++) >+        bmv = bmvStack[0]; 
> >+        bcost = bmvCostStack[0]; >+        for (int i=1; i<4; i++) >        
>  { >-            int bdir = 0; >-            for (int i = 1; i <= 
> wl.qpel_dirs; i++) >-            { >-                MV qmv = bmv + 
> square1[i]; >-                int cost = subpelCompare(ref, qmv, satd) + 
> mvcost(qmv); >-                COPY2_IF_LT(bcost, cost, bdir, i); >-          
>   } >- >-            if (bdir) >-                bmv += square1[bdir]; >-     
>        else >-                break; >+          if (bmvCostStack[i]<bcost) 
> >+          { >+            bmv = bmvStack[i]; >+            bcost = 
> bmvCostStack[i]; >+          } >         } >     }
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to