Your algorithm's concept is to keep the best 4 candidates for HPEL/QPEL. It may
improve compression performance in some cases, but it spends 4x (or more) the
compute cost, so I suggest you add a new search mode for your special case.
The code also has some problems that need to be improved; I have put my
comments inline below.
At 2016-07-28 01:17:43,[email protected] wrote:
># HG changeset patch
># User N Vijay Anand <[email protected]>
># Date 1469638855 -19800
># Wed Jul 27 22:30:55 2016 +0530
># Node ID 837738a747ead31f905421c3ff413e36f9022ab9
># Parent 5a0e139e29386ecebafc9c555aedcd3e0f61c70c
>Compression Gains improved by ~10%.
>PSNR values need to be reconfirmed.
>Subjective Quality same as x265.
>
>diff -r 5a0e139e2938 -r 837738a747ea source/encoder/motion.cpp
>--- a/source/encoder/motion.cpp Fri Jul 22 13:13:42 2016 +0530
>+++ b/source/encoder/motion.cpp Wed Jul 27 22:30:55 2016 +0530
>@@ -99,6 +99,27 @@
>
> }
>
>+inline void PushToBMVStack(MV *bStack, MV & bv, int *bCostStack, int bcost)
>+{
>+ for (int i=0; i<4; i++)
[MC] Code style mistake: missing spaces around the operators in this line.
>+ {
>+ if((bCostStack[i] == bcost) && (bv == bStack[i]))
>+ break;
>+ if((bCostStack[i] > bcost) && (bv != bStack[i]))
>+ {
>+ for (int j=3; j>i; j--)
>+ {
>+ bStack[j] = bStack[j-1];
>+ bCostStack[j] = bCostStack[j-1];
>+ }
[MC] This loop is equivalent to a memmove.
>+ bStack[i] = bv;
>+ bCostStack[i] = bcost;
>+ break;
>+ }
>+ }
>+ return;
>+}
>+
> MotionEstimate::MotionEstimate()
> {
> ctuAddr = -1;
>@@ -223,6 +244,7 @@
> bmv = tmv; \
> bPointNr = point; \
> bDistance = dist; \
>+ PushToBMVStack(bmvStack, tmv, bmvCostStack, cost); \
> } \
> } while (0)
>
>@@ -232,6 +254,7 @@
> int cost = sad(fenc, FENC_STRIDE, fref + (mx) + (my) * stride,
> stride); \
> cost += mvcost(MV(mx, my) << 2); \
> COPY2_IF_LT(bcost, cost, bmv, MV(mx, my)); \
>+ PushToBMVStack(bmvStack, MV(mx,my), bmvCostStack, cost); \
> } while (0)
>
> #define COST_MV_X3_DIR(m0x, m0y, m1x, m1y, m2x, m2y, costs) \
>@@ -245,6 +268,9 @@
> (costs)[0] += mvcost((bmv + MV(m0x, m0y)) << 2); \
> (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
> (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
>+ PushToBMVStack(bmvStack, bmv+MV(m0x,m0y), bmvCostStack, (costs)[0]); \
>+ PushToBMVStack(bmvStack, bmv+MV(m1x,m1y), bmvCostStack, (costs)[1]); \
>+ PushToBMVStack(bmvStack, bmv+MV(m2x,m2y), bmvCostStack, (costs)[2]); \
> }
>
> #define COST_MV_PT_DIST_X4(m0x, m0y, p0, d0, m1x, m1y, p1, d1, m2x, m2y, p2,
> d2, m3x, m3y, p3, d3) \
>@@ -260,9 +286,13 @@
> (costs)[2] += mvcost(MV(m2x, m2y) << 2); \
> (costs)[3] += mvcost(MV(m3x, m3y) << 2); \
> COPY4_IF_LT(bcost, costs[0], bmv, MV(m0x, m0y), bPointNr, p0,
> bDistance, d0); \
>+ PushToBMVStack(bmvStack, MV(m0x,m0y), bmvCostStack, (costs)[0]); \
> COPY4_IF_LT(bcost, costs[1], bmv, MV(m1x, m1y), bPointNr, p1,
> bDistance, d1); \
>+ PushToBMVStack(bmvStack, MV(m1x,m1y), bmvCostStack, (costs)[1]); \
> COPY4_IF_LT(bcost, costs[2], bmv, MV(m2x, m2y), bPointNr, p2,
> bDistance, d2); \
>+ PushToBMVStack(bmvStack, MV(m2x,m2y), bmvCostStack, (costs)[2]); \
> COPY4_IF_LT(bcost, costs[3], bmv, MV(m3x, m3y), bPointNr, p3,
> bDistance, d3); \
>+ PushToBMVStack(bmvStack, MV(m3x,m3y), bmvCostStack, (costs)[3]); \
> }
>
> #define COST_MV_X4(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y) \
>@@ -279,9 +309,13 @@
> costs[2] += mvcost((omv + MV(m2x, m2y)) << 2); \
> costs[3] += mvcost((omv + MV(m3x, m3y)) << 2); \
> COPY2_IF_LT(bcost, costs[0], bmv, omv + MV(m0x, m0y)); \
>+ PushToBMVStack(bmvStack, omv+MV(m0x,m0y), bmvCostStack, (costs)[0]); \
> COPY2_IF_LT(bcost, costs[1], bmv, omv + MV(m1x, m1y)); \
>+ PushToBMVStack(bmvStack, omv+MV(m1x,m1y), bmvCostStack, (costs)[1]); \
> COPY2_IF_LT(bcost, costs[2], bmv, omv + MV(m2x, m2y)); \
>+ PushToBMVStack(bmvStack, omv+MV(m2x,m2y), bmvCostStack, (costs)[2]); \
> COPY2_IF_LT(bcost, costs[3], bmv, omv + MV(m3x, m3y)); \
>+ PushToBMVStack(bmvStack, omv+MV(m3x,m3y), bmvCostStack, (costs)[3]); \
> }
>
> #define COST_MV_X4_DIR(m0x, m0y, m1x, m1y, m2x, m2y, m3x, m3y, costs) \
>@@ -297,6 +331,10 @@
> (costs)[1] += mvcost((bmv + MV(m1x, m1y)) << 2); \
> (costs)[2] += mvcost((bmv + MV(m2x, m2y)) << 2); \
> (costs)[3] += mvcost((bmv + MV(m3x, m3y)) << 2); \
>+ PushToBMVStack(bmvStack, bmv+MV(m0x,m0y), bmvCostStack, (costs)[0]); \
>+ PushToBMVStack(bmvStack, bmv+MV(m1x,m1y), bmvCostStack, (costs)[1]); \
>+ PushToBMVStack(bmvStack, bmv+MV(m2x,m2y), bmvCostStack, (costs)[2]); \
>+ PushToBMVStack(bmvStack, bmv+MV(m3x,m3y), bmvCostStack, (costs)[3]); \
> }
>
> #define DIA1_ITER(mx, my) \
>@@ -336,6 +374,8 @@
> const MV & mvmax,
> MV & bmv,
> int & bcost,
>+ MV *bmvStack,
>+ int *bmvCostStack,
> int & bPointNr,
> int & bDistance,
> int earlyExitIters,
>@@ -614,6 +654,8 @@
>
> /* re-measure full pel rounded MVP with SAD as search start point */
> MV bmv = pmv.roundToFPel();
>+ MV bmvStack[4] = {bmv, bmv, bmv, bmv};
[MC] This initialization is unnecessary, since the costs are set to the maximum SAD value below.
>+ int bmvCostStack[4] = {0x7fff, 0x7fff, 0x7fff, 0x7fff};
[MC] We can't assume the SAD is less than 16 bits wide.
> int bcost = bprecost;
> if (pmv.isSubpel())
> bcost = sad(fenc, FENC_STRIDE, fref + bmv.x + bmv.y * stride, stride)
> + mvcost(bmv << 2);
>@@ -755,6 +797,8 @@
> /* refine predictors */
> omv = bmv;
> ucost1 = bcost;
>+ bmvStack[0] = bmv;
>+ bmvCostStack[0] = bcost;
> DIA1_ITER(pmv.x, pmv.y);
> if (pmv.notZero())
> DIA1_ITER(0, 0);
>@@ -878,7 +922,12 @@
> stride, costs + 4 * k); \
> fref_base += 2 * dy;
> #define ADD_MVCOST(k, x, y) costs[k] += p_cost_omvx[x * 4 * i] +
> p_cost_omvy[y * 4 * i]
>-#define MIN_MV(k, x, y) COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y &
>15))
>+#define MIN_MV(k, x, y) \
>+ do \
>+ { \
>+ COPY2_IF_LT(bcost, costs[k], dir, x * 16 + (y & 15)); \
>+ PushToBMVStack(bmvStack, bmv+MV(x*i,y*i), bmvCostStack, costs[k]); \
>+ } while (0)
>
> SADS(0, +0, -4, +0, +4, -2, -3, +2, -3);
> SADS(1, -4, -2, +4, -2, -4, -1, +4, -1);
>@@ -916,6 +965,7 @@
> MIN_MV(13, 4, 2);
> MIN_MV(14, -2, 3);
> MIN_MV(15, 2, 3);
>+
> #undef SADS
> #undef ADD_MVCOST
> #undef MIN_MV
>@@ -938,7 +988,8 @@
> int bDistance = 0;
>
> const int EarlyExitIters = 3;
>- StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr, bDistance,
>EarlyExitIters, merange);
>+ StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bmvStack,
>bmvCostStack, bPointNr, bDistance, EarlyExitIters, merange);
>+
[MC] I also suggest using a struct to package both your MV and its cost, to
reduce memory access cost; x64 passes only up to 6 parameters in registers.
> if (bDistance == 1) > { > // if best distance was
> only 1, check two missing points. If no new point is found, stop >@@ -989,15
> +1040,19 @@ > stride, costs); >
> costs[0] += mvcost(tmv << 2); >
> COPY2_IF_LT(bcost, costs[0], bmv, tmv); >+
> PushToBMVStack(bmvStack, tmv, bmvCostStack, (costs)[0]); >
> tmv.x += RasterDistance; > costs[1] +=
> mvcost(tmv << 2); > COPY2_IF_LT(bcost, costs[1], bmv,
> tmv); >+ PushToBMVStack(bmvStack, tmv, bmvCostStack,
> (costs)[1]); > tmv.x += RasterDistance; >
> costs[2] += mvcost(tmv << 2); >
> COPY2_IF_LT(bcost, costs[2], bmv, tmv); >+
> PushToBMVStack(bmvStack, tmv, bmvCostStack, (costs)[2]); >
> tmv.x += RasterDistance; > costs[3] +=
> mvcost(tmv << 3); > COPY2_IF_LT(bcost, costs[3], bmv,
> tmv); >+ PushToBMVStack(bmvStack, tmv, bmvCostStack,
> (costs)[3]); > } > else >
> COST_MV(tmv.x, tmv.y); >@@ -1011,7 +1066,7 @@ >
> bDistance = 0; > bPointNr = 0; > const int MaxIters =
> 32; >- StarPatternSearch(ref, mvmin, mvmax, bmv, bcost, bPointNr,
> bDistance, MaxIters, merange); >+ StarPatternSearch(ref, mvmin,
> mvmax, bmv, bcost, bmvStack, bmvCostStack, bPointNr, bDistance, MaxIters,
> merange); > > if (bDistance == 1) > { >@@ -1061,15
> +1116,19 @@ > stride, costs); >
> costs[0] += mvcost(tmv << 2); > COPY2_IF_LT(bcost,
> costs[0], bmv, tmv); >+ PushToBMVStack(bmvStack, tmv,
> bmvCostStack, (costs)[0]); > tmv.x++; >
> costs[1] += mvcost(tmv << 2); > COPY2_IF_LT(bcost,
> costs[1], bmv, tmv); >+ PushToBMVStack(bmvStack, tmv,
> bmvCostStack, (costs)[1]); > tmv.x++; >
> costs[2] += mvcost(tmv << 2); > COPY2_IF_LT(bcost,
> costs[2], bmv, tmv); >+ PushToBMVStack(bmvStack, tmv,
> bmvCostStack, (costs)[2]); > tmv.x++; >
> costs[3] += mvcost(tmv << 2); > COPY2_IF_LT(bcost,
> costs[3], bmv, tmv); >+ PushToBMVStack(bmvStack, tmv,
> bmvCostStack, (costs)[3]); > } > else >
> COST_MV(tmv.x, tmv.y); >@@ -1090,7 +1149,13 @@ > bcost
> = bprecost; > } > else >+ { > bmv = bmv.toQPel(); //
> promote search bmv to qpel >+ for (int i=0; i<4; i++) >+ { >+
> bmvStack[i] = bmvStack[i].toQPel(); >+ } >+ } > > const
> SubpelWorkload& wl = workload[this->subpelRefine]; > >@@ -1103,72 +1168,111
> @@ > else if (ref->isLowres) > { > int bdir = 0; >-
> for (int i = 1; i <= wl.hpel_dirs; i++) >+ for (int nBmv=0; nBmv<4;
> nBmv++) > { >- MV qmv = bmv + square1[i] * 2; >-
> int cost = ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv);
> >- COPY2_IF_LT(bcost, cost, bdir, i); >+ bdir = 0; >+
> bmv = bmvStack[nBmv]; >+ bcost = bmvCostStack[nBmv]; >+ >+
> for (int i = 1; i <= wl.hpel_dirs; i++) >+ { >+ MV
> qmv = bmv + square1[i] * 2; >+ int cost =
> ref->lowresQPelCost(fenc, blockOffset, qmv, sad) + mvcost(qmv); >+
> COPY2_IF_LT(bcost, cost, bdir, i); >+ } >+ >+ bmv +=
> square1[bdir] * 2; >+ bcost = ref->lowresQPelCost(fenc, blockOffset,
> bmv, satd) + mvcost(bmv); >+ >+ bdir = 0; >+ for (int i =
> 1; i <= wl.qpel_dirs; i++) >+ { >+ MV qmv = bmv +
> square1[i]; >+ int cost = ref->lowresQPelCost(fenc, blockOffset,
> qmv, satd) + mvcost(qmv); >+ COPY2_IF_LT(bcost, cost, bdir, i);
> >+ } >+ >+ bmv += square1[bdir]; >+ bmvStack[nBmv]
> = bmv; >+ bmvCostStack[nBmv] = bcost; > } > >- bmv
> += square1[bdir] * 2; >- bcost = ref->lowresQPelCost(fenc,
> blockOffset, bmv, satd) + mvcost(bmv); >- >- bdir = 0; >- for
> (int i = 1; i <= wl.qpel_dirs; i++) >+ bmv = bmvStack[0]; >+
> bcost = bmvCostStack[0]; >+ for (int i=1; i<4; i++) > { >-
> MV qmv = bmv + square1[i]; >- int cost =
> ref->lowresQPelCost(fenc, blockOffset, qmv, satd) + mvcost(qmv); >-
> COPY2_IF_LT(bcost, cost, bdir, i); >+ if (bmvCostStack[i]<bcost) >+
> { >+ bmv = bmvStack[i]; >+ bcost =
> bmvCostStack[i]; >+ } > } >- >- bmv += square1[bdir];
> > } > else > { > pixelcmp_t hpelcomp; > >- if
> (wl.hpel_satd) >- { >- bcost = subpelCompare(ref, bmv,
> satd) + mvcost(bmv); >- hpelcomp = satd; >- } >-
> else >- hpelcomp = sad; >- >- for (int iter = 0; iter <
> wl.hpel_iters; iter++) >+ for (int nBmv=0; nBmv<4; nBmv++) > {
> >- int bdir = 0; >- for (int i = 1; i <= wl.hpel_dirs;
> i++) >- { >- MV qmv = bmv + square1[i] * 2; >-
> int cost = subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); >-
> COPY2_IF_LT(bcost, cost, bdir, i); >- } >+ bmv
> = bmvStack[nBmv]; > >- if (bdir) >- bmv +=
> square1[bdir] * 2; >- else >- break; >+ if
> (wl.hpel_satd) >+ { >+ bcost = subpelCompare(ref, bmv,
> satd) + mvcost(bmv); >+ hpelcomp = satd; >+ } >+
> else >+ hpelcomp = sad; >+ >+ for (int iter = 0; iter
> < wl.hpel_iters; iter++) >+ { >+ int bdir = 0; >+
> for (int i = 1; i <= wl.hpel_dirs; i++) >+ { >+
> MV qmv = bmv + square1[i] * 2; >+ int cost =
> subpelCompare(ref, qmv, hpelcomp) + mvcost(qmv); >+
> COPY2_IF_LT(bcost, cost, bdir, i); >+ } >+ >+ if
> (bdir) >+ bmv += square1[bdir] * 2; >+ else >+
> break; >+ } >+ >+ /* if HPEL search used
> SAD, remeasure with SATD before QPEL */ >+ if (!wl.hpel_satd) >+
> bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); >+ >+
> for (int iter = 0; iter < wl.qpel_iters; iter++) >+ { >+
> int bdir = 0; >+ for (int i = 1; i <= wl.qpel_dirs; i++) >+
> { >+ MV qmv = bmv + square1[i]; >+
> int cost = subpelCompare(ref, qmv, satd) + mvcost(qmv); >+
> COPY2_IF_LT(bcost, cost, bdir, i); >+ } >+ >+ if
> (bdir) >+ bmv += square1[bdir]; >+ else >+
> break; >+ } >+ >+ bmvStack[nBmv] = bmv; >+
> bmvCostStack[nBmv] = bcost; > } > >- /* if HPEL search
> used SAD, remeasure with SATD before QPEL */ >- if (!wl.hpel_satd) >-
> bcost = subpelCompare(ref, bmv, satd) + mvcost(bmv); >- >-
> for (int iter = 0; iter < wl.qpel_iters; iter++) >+ bmv = bmvStack[0];
> >+ bcost = bmvCostStack[0]; >+ for (int i=1; i<4; i++) >
> { >- int bdir = 0; >- for (int i = 1; i <=
> wl.qpel_dirs; i++) >- { >- MV qmv = bmv +
> square1[i]; >- int cost = subpelCompare(ref, qmv, satd) +
> mvcost(qmv); >- COPY2_IF_LT(bcost, cost, bdir, i); >-
> } >- >- if (bdir) >- bmv += square1[bdir]; >-
> else >- break; >+ if (bmvCostStack[i]<bcost)
> >+ { >+ bmv = bmvStack[i]; >+ bcost =
> bmvCostStack[i]; >+ } > } > }_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel