Re: [x265] [x265 patch] Fix: Performance drop in aq-mode 4

2019-10-14 Thread Aruna Matheswaran
Pushed to Release_3.2.

On Fri, Oct 11, 2019 at 1:41 PM Akil  wrote:

> This should work.
>
> # HG changeset patch
> # User Akil Ayyappan
> # Date 1570778152 -19800
> #  Fri Oct 11 12:45:52 2019 +0530
> # Branch Release_3.2
> # Node ID efe5ac3c25dac009efbffaf5ed5e54734a02f812
> # Parent  377cb2b0c3698342008a9304e8e7f5bedcf3f1f4
> Fix: Performance drop in aq-mode 4
>
> This patch moves the memory handling for the edge information required
> for aq-mode 4
> to the Frame class level, so that it can be reused by the threads.
>
> diff -r 377cb2b0c369 -r efe5ac3c25da source/common/frame.cpp
> --- a/source/common/frame.cpp Tue Sep 24 15:02:05 2019 +0530
> +++ b/source/common/frame.cpp Fri Oct 11 12:45:52 2019 +0530
> @@ -57,6 +57,9 @@
>  m_addOnPrevChange = NULL;
>  m_classifyFrame = false;
>  m_fieldNum = 0;
> +m_edgePic = NULL;
> +m_gaussianPic = NULL;
> +m_thetaPic = NULL;
>  }
>
>  bool Frame::create(x265_param *param, float* quantOffsets)
> @@ -97,6 +100,20 @@
>  CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size);
>  }
>
> +if (param->rc.aqMode == X265_AQ_EDGE || (param->rc.zonefileCount &&
> param->rc.aqMode != 0))
> +{
> +uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize -
> 1) / param->maxCUSize;
> +uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize
> - 1) / param->maxCUSize;
> +uint32_t m_lumaMarginX = param->maxCUSize + 32; // search margin
> and 8-tap filter half-length, padded for 32-byte alignment
> +uint32_t m_lumaMarginY = param->maxCUSize + 16; // margin for
> 8-tap filter and infinite padding
> +intptr_t m_stride = (numCuInWidth * param->maxCUSize) +
> (m_lumaMarginX << 1);
> +int maxHeight = numCuInHeight * param->maxCUSize;
> +
> +m_edgePic = X265_MALLOC(pixel, m_stride * (maxHeight +
> (m_lumaMarginY * 2)));
> +m_gaussianPic = X265_MALLOC(pixel, m_stride * (maxHeight +
> (m_lumaMarginY * 2)));
> +m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight +
> (m_lumaMarginY * 2)));
> +}
> +
>  if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) &&
> m_lowres.create(param, m_fencPic, param->rc.qgSize))
>  {
>  X265_CHECK((m_reconColCount == NULL), "m_reconColCount was
> initialized");
> @@ -242,4 +259,11 @@
>  X265_FREE_ZERO(m_classifyVariance);
>  X265_FREE_ZERO(m_classifyCount);
>  }
> +
> +if (m_param->rc.aqMode == X265_AQ_EDGE || (m_param->rc.zonefileCount
> && m_param->rc.aqMode != 0))
> +{
> +X265_FREE(m_edgePic);
> +X265_FREE(m_gaussianPic);
> +X265_FREE(m_thetaPic);
> +}
>  }
> diff -r 377cb2b0c369 -r efe5ac3c25da source/common/frame.h
> --- a/source/common/frame.h Tue Sep 24 15:02:05 2019 +0530
> +++ b/source/common/frame.h Fri Oct 11 12:45:52 2019 +0530
> @@ -131,6 +131,11 @@
>  bool   m_classifyFrame;
>  intm_fieldNum;
>
> +/* aq-mode 4 : Gaussian, edge and theta frames for edge information */
> +pixel* m_edgePic;
> +pixel* m_gaussianPic;
> +pixel* m_thetaPic;
> +
>  Frame();
>
>  bool create(x265_param *param, float* quantOffsets);
> diff -r 377cb2b0c369 -r efe5ac3c25da source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp Tue Sep 24 15:02:05 2019 +0530
> +++ b/source/encoder/slicetype.cpp Fri Oct 11 12:45:52 2019 +0530
> @@ -85,12 +85,22 @@
>
>  } // end anonymous namespace
>
> -void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3,
> intptr_t stride, int height, int width)
> +void edgeFilter(Frame *curFrame, x265_param* param)
>  {
> +int height = curFrame->m_fencPic->m_picHeight;
> +int width = curFrame->m_fencPic->m_picWidth;
> +intptr_t stride = curFrame->m_fencPic->m_stride;
> +uint32_t numCuInHeight = (height + param->maxCUSize - 1) /
> param->maxCUSize;
> +int maxHeight = numCuInHeight * param->maxCUSize;
> +
> +memset(curFrame->m_edgePic, 0, stride * (maxHeight +
> (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
> +memset(curFrame->m_gaussianPic, 0, stride * (maxHeight +
> (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
> +memset(curFrame->m_thetaPic, 0, stride * (maxHeight +
> (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
> +
>  pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
> -pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride +
> curFrame->m_fencPic->m_lumaMarginX;
> -pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride +
> curFrame->m_fencPic->m_lumaMarginX;
> -pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride
> + curFrame->m_fencPic->m_lumaMarginX;
> +pixel *edgePic = curFrame->m_edgePic +
> curFrame->m_fencPic->m_lumaMarginY * stride +
> curFrame->m_fencPic->m_lumaMarginX;
> +pixel *refPic = curFrame->m_gaussianPic +
> 

Re: [x265] [x265 patch] Fix: Performance drop in aq-mode 4

2019-10-11 Thread Akil
This should work.

# HG changeset patch
# User Akil Ayyappan
# Date 1570778152 -19800
#  Fri Oct 11 12:45:52 2019 +0530
# Branch Release_3.2
# Node ID efe5ac3c25dac009efbffaf5ed5e54734a02f812
# Parent  377cb2b0c3698342008a9304e8e7f5bedcf3f1f4
Fix: Performance drop in aq-mode 4

This patch moves the memory handling for the edge information required
for aq-mode 4
to the Frame class level, so that it can be reused by the threads.

diff -r 377cb2b0c369 -r efe5ac3c25da source/common/frame.cpp
--- a/source/common/frame.cpp Tue Sep 24 15:02:05 2019 +0530
+++ b/source/common/frame.cpp Fri Oct 11 12:45:52 2019 +0530
@@ -57,6 +57,9 @@
 m_addOnPrevChange = NULL;
 m_classifyFrame = false;
 m_fieldNum = 0;
+m_edgePic = NULL;
+m_gaussianPic = NULL;
+m_thetaPic = NULL;
 }

 bool Frame::create(x265_param *param, float* quantOffsets)
@@ -97,6 +100,20 @@
 CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size);
 }

+if (param->rc.aqMode == X265_AQ_EDGE || (param->rc.zonefileCount &&
param->rc.aqMode != 0))
+{
+uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize -
1) / param->maxCUSize;
+uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize -
1) / param->maxCUSize;
+uint32_t m_lumaMarginX = param->maxCUSize + 32; // search margin
and 8-tap filter half-length, padded for 32-byte alignment
+uint32_t m_lumaMarginY = param->maxCUSize + 16; // margin for
8-tap filter and infinite padding
+intptr_t m_stride = (numCuInWidth * param->maxCUSize) +
(m_lumaMarginX << 1);
+int maxHeight = numCuInHeight * param->maxCUSize;
+
+m_edgePic = X265_MALLOC(pixel, m_stride * (maxHeight +
(m_lumaMarginY * 2)));
+m_gaussianPic = X265_MALLOC(pixel, m_stride * (maxHeight +
(m_lumaMarginY * 2)));
+m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight +
(m_lumaMarginY * 2)));
+}
+
 if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) &&
m_lowres.create(param, m_fencPic, param->rc.qgSize))
 {
 X265_CHECK((m_reconColCount == NULL), "m_reconColCount was
initialized");
@@ -242,4 +259,11 @@
 X265_FREE_ZERO(m_classifyVariance);
 X265_FREE_ZERO(m_classifyCount);
 }
+
+if (m_param->rc.aqMode == X265_AQ_EDGE || (m_param->rc.zonefileCount
&& m_param->rc.aqMode != 0))
+{
+X265_FREE(m_edgePic);
+X265_FREE(m_gaussianPic);
+X265_FREE(m_thetaPic);
+}
 }
diff -r 377cb2b0c369 -r efe5ac3c25da source/common/frame.h
--- a/source/common/frame.h Tue Sep 24 15:02:05 2019 +0530
+++ b/source/common/frame.h Fri Oct 11 12:45:52 2019 +0530
@@ -131,6 +131,11 @@
 bool   m_classifyFrame;
 intm_fieldNum;

+/* aq-mode 4 : Gaussian, edge and theta frames for edge information */
+pixel* m_edgePic;
+pixel* m_gaussianPic;
+pixel* m_thetaPic;
+
 Frame();

 bool create(x265_param *param, float* quantOffsets);
diff -r 377cb2b0c369 -r efe5ac3c25da source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Tue Sep 24 15:02:05 2019 +0530
+++ b/source/encoder/slicetype.cpp Fri Oct 11 12:45:52 2019 +0530
@@ -85,12 +85,22 @@

 } // end anonymous namespace

-void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3,
intptr_t stride, int height, int width)
+void edgeFilter(Frame *curFrame, x265_param* param)
 {
+int height = curFrame->m_fencPic->m_picHeight;
+int width = curFrame->m_fencPic->m_picWidth;
+intptr_t stride = curFrame->m_fencPic->m_stride;
+uint32_t numCuInHeight = (height + param->maxCUSize - 1) /
param->maxCUSize;
+int maxHeight = numCuInHeight * param->maxCUSize;
+
+memset(curFrame->m_edgePic, 0, stride * (maxHeight +
(curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+memset(curFrame->m_gaussianPic, 0, stride * (maxHeight +
(curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+memset(curFrame->m_thetaPic, 0, stride * (maxHeight +
(curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
+
 pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
-pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride +
curFrame->m_fencPic->m_lumaMarginX;
-pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride +
curFrame->m_fencPic->m_lumaMarginX;
-pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride
+ curFrame->m_fencPic->m_lumaMarginX;
+pixel *edgePic = curFrame->m_edgePic +
curFrame->m_fencPic->m_lumaMarginY * stride +
curFrame->m_fencPic->m_lumaMarginX;
+pixel *refPic = curFrame->m_gaussianPic +
curFrame->m_fencPic->m_lumaMarginY * stride +
curFrame->m_fencPic->m_lumaMarginX;
+pixel *edgeTheta = curFrame->m_thetaPic +
curFrame->m_fencPic->m_lumaMarginY * stride +
curFrame->m_fencPic->m_lumaMarginX;

 for (int i = 0; i < height; i++)
 {
@@ -103,7 +113,7 @@

 //Applying Gaussian filter on the picture
 src = 

Re: [x265] [x265 patch] Fix: Performance drop in aq-mode 4

2019-10-10 Thread Aruna Matheswaran
The patch does not apply on Release_3.2. Please rebase and resend the patch.

On Thu, Oct 10, 2019 at 2:31 PM Akil  wrote:

> # HG changeset patch
> # User Akil Ayyappan
> # Date 1570594514 -19800
> #  Wed Oct 09 09:45:14 2019 +0530
> # Node ID b66d88859a528ae80f6f19eae7553fe7fcdb88e6
> # Parent  354901970679c787efdfdcc6577228e9c06785cf
> Fix: Performance drop in aq-mode 4
>
> This patch moves the memory allocation for the edge information
> required for aq-mode 4
> to the Frame class level, so that it can be reused by the threads.
>
> diff -r 354901970679 -r b66d88859a52 source/common/frame.cpp
> --- a/source/common/frame.cpp Fri Sep 13 15:57:26 2019 +0530
> +++ b/source/common/frame.cpp Wed Oct 09 09:45:14 2019 +0530
> @@ -58,6 +58,9 @@
>  m_classifyFrame = false;
>  m_fieldNum = 0;
>  m_picStruct = 0;
> +m_edgePic = NULL;
> +m_gaussianPic = NULL;
> +m_thetaPic = NULL;
>  }
>
>  bool Frame::create(x265_param *param, float* quantOffsets)
> @@ -98,6 +101,20 @@
>  CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size);
>  }
>
> +if (param->rc.aqMode == X265_AQ_EDGE || (param->rc.zonefileCount &&
> param->rc.aqMode != 0))
> +{
> +uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize -
> 1) / param->maxCUSize;
> +uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize
> - 1) / param->maxCUSize;
> +uint32_t m_lumaMarginX = param->maxCUSize + 32; // search margin
> and 8-tap filter half-length, padded for 32-byte alignment
> +uint32_t m_lumaMarginY = param->maxCUSize + 16; // margin for
> 8-tap filter and infinite padding
> +intptr_t m_stride = (numCuInWidth * param->maxCUSize) +
> (m_lumaMarginX << 1);
> +int maxHeight = numCuInHeight * param->maxCUSize;
> +
> +m_edgePic = X265_MALLOC(pixel, m_stride * (maxHeight +
> (m_lumaMarginY * 2)));
> +m_gaussianPic = X265_MALLOC(pixel, m_stride * (maxHeight +
> (m_lumaMarginY * 2)));
> +m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight +
> (m_lumaMarginY * 2)));
> +}
> +
>  if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) &&
> m_lowres.create(param, m_fencPic, param->rc.qgSize))
>  {
>  X265_CHECK((m_reconColCount == NULL), "m_reconColCount was
> initialized");
> @@ -243,4 +260,11 @@
>  X265_FREE_ZERO(m_classifyVariance);
>  X265_FREE_ZERO(m_classifyCount);
>  }
> +
> +if (m_param->rc.aqMode == X265_AQ_EDGE || (m_param->rc.zonefileCount
> && m_param->rc.aqMode != 0))
> +{
> +X265_FREE(m_edgePic);
> +X265_FREE(m_gaussianPic);
> +X265_FREE(m_thetaPic);
> +}
>  }
> diff -r 354901970679 -r b66d88859a52 source/common/frame.h
> --- a/source/common/frame.h Fri Sep 13 15:57:26 2019 +0530
> +++ b/source/common/frame.h Wed Oct 09 09:45:14 2019 +0530
> @@ -132,6 +132,11 @@
>  bool   m_classifyFrame;
>  intm_fieldNum;
>
> +/* aq-mode 4 : Gaussian, edge and theta frames for edge information */
> +pixel* m_edgePic;
> +pixel* m_gaussianPic;
> +pixel* m_thetaPic;
> +
>  Frame();
>
>  bool create(x265_param *param, float* quantOffsets);
> diff -r 354901970679 -r b66d88859a52 source/encoder/slicetype.cpp
> --- a/source/encoder/slicetype.cpp Fri Sep 13 15:57:26 2019 +0530
> +++ b/source/encoder/slicetype.cpp Wed Oct 09 09:45:14 2019 +0530
> @@ -85,12 +85,22 @@
>
>  } // end anonymous namespace
>
> -void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3,
> intptr_t stride, int height, int width)
> +void edgeFilter(Frame *curFrame, x265_param* param)
>  {
> +int height = curFrame->m_fencPic->m_picHeight;
> +int width = curFrame->m_fencPic->m_picWidth;
> +intptr_t stride = curFrame->m_fencPic->m_stride;
> +uint32_t numCuInHeight = (height + param->maxCUSize - 1) /
> param->maxCUSize;
> +int maxHeight = numCuInHeight * param->maxCUSize;
> +
> +memset(curFrame->m_edgePic, 0, stride * (maxHeight +
> (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
> +memset(curFrame->m_gaussianPic, 0, stride * (maxHeight +
> (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
> +memset(curFrame->m_thetaPic, 0, stride * (maxHeight +
> (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel));
> +
>  pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0];
> -pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride +
> curFrame->m_fencPic->m_lumaMarginX;
> -pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride +
> curFrame->m_fencPic->m_lumaMarginX;
> -pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride
> + curFrame->m_fencPic->m_lumaMarginX;
> +pixel *edgePic = curFrame->m_edgePic +
> curFrame->m_fencPic->m_lumaMarginY * stride +
> curFrame->m_fencPic->m_lumaMarginX;
> +pixel *refPic = curFrame->m_gaussianPic +
>