Re: [x265] [x265 patch] Fix: Performance drop in aq-mode 4
Pushed to Release_3.2. On Fri, Oct 11, 2019 at 1:41 PM Akil wrote: > This should work. > > # HG changeset patch > # User Akil Ayyappan > # Date 1570778152 -19800 > # Fri Oct 11 12:45:52 2019 +0530 > # Branch Release_3.2 > # Node ID efe5ac3c25dac009efbffaf5ed5e54734a02f812 > # Parent 377cb2b0c3698342008a9304e8e7f5bedcf3f1f4 > Fix: Performance drop in aq-mode 4 > > This patch moves the memory handling part of the edge information required > for aq-mode 4 > to the Frame class-level in that way it can be reused by the threads. > > diff -r 377cb2b0c369 -r efe5ac3c25da source/common/frame.cpp > --- a/source/common/frame.cpp Tue Sep 24 15:02:05 2019 +0530 > +++ b/source/common/frame.cpp Fri Oct 11 12:45:52 2019 +0530 > @@ -57,6 +57,9 @@ > m_addOnPrevChange = NULL; > m_classifyFrame = false; > m_fieldNum = 0; > +m_edgePic = NULL; > +m_gaussianPic = NULL; > +m_thetaPic = NULL; > } > > bool Frame::create(x265_param *param, float* quantOffsets) > @@ -97,6 +100,20 @@ > CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size); > } > > +if (param->rc.aqMode == X265_AQ_EDGE || (param->rc.zonefileCount && > param->rc.aqMode != 0)) > +{ > +uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize - > 1) / param->maxCUSize; > +uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize > - 1) / param->maxCUSize; > +uint32_t m_lumaMarginX = param->maxCUSize + 32; // search margin > and 8-tap filter half-length, padded for 32-byte alignment > +uint32_t m_lumaMarginY = param->maxCUSize + 16; // margin for > 8-tap filter and infinite padding > +intptr_t m_stride = (numCuInWidth * param->maxCUSize) + > (m_lumaMarginX << 1); > +int maxHeight = numCuInHeight * param->maxCUSize; > + > +m_edgePic = X265_MALLOC(pixel, m_stride * (maxHeight + > (m_lumaMarginY * 2))); > +m_gaussianPic = X265_MALLOC(pixel, m_stride * (maxHeight + > (m_lumaMarginY * 2))); > +m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight + > (m_lumaMarginY * 2))); > +} > + > if (m_fencPic->create(param, 
!!m_param->bCopyPicToFrame) && > m_lowres.create(param, m_fencPic, param->rc.qgSize)) > { > X265_CHECK((m_reconColCount == NULL), "m_reconColCount was > initialized"); > @@ -242,4 +259,11 @@ > X265_FREE_ZERO(m_classifyVariance); > X265_FREE_ZERO(m_classifyCount); > } > + > +if (m_param->rc.aqMode == X265_AQ_EDGE || (m_param->rc.zonefileCount > && m_param->rc.aqMode != 0)) > +{ > +X265_FREE(m_edgePic); > +X265_FREE(m_gaussianPic); > +X265_FREE(m_thetaPic); > +} > } > diff -r 377cb2b0c369 -r efe5ac3c25da source/common/frame.h > --- a/source/common/frame.h Tue Sep 24 15:02:05 2019 +0530 > +++ b/source/common/frame.h Fri Oct 11 12:45:52 2019 +0530 > @@ -131,6 +131,11 @@ > bool m_classifyFrame; > int m_fieldNum; > > +/* aq-mode 4 : Gaussian, edge and theta frames for edge information */ > +pixel* m_edgePic; > +pixel* m_gaussianPic; > +pixel* m_thetaPic; > + > Frame(); > > bool create(x265_param *param, float* quantOffsets); > diff -r 377cb2b0c369 -r efe5ac3c25da source/encoder/slicetype.cpp > --- a/source/encoder/slicetype.cpp Tue Sep 24 15:02:05 2019 +0530 > +++ b/source/encoder/slicetype.cpp Fri Oct 11 12:45:52 2019 +0530 > @@ -85,12 +85,22 @@ > > } // end anonymous namespace > > -void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3, > intptr_t stride, int height, int width) > +void edgeFilter(Frame *curFrame, x265_param* param) > { > +int height = curFrame->m_fencPic->m_picHeight; > +int width = curFrame->m_fencPic->m_picWidth; > +intptr_t stride = curFrame->m_fencPic->m_stride; > +uint32_t numCuInHeight = (height + param->maxCUSize - 1) / > param->maxCUSize; > +int maxHeight = numCuInHeight * param->maxCUSize; > + > +memset(curFrame->m_edgePic, 0, stride * (maxHeight + > (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); > +memset(curFrame->m_gaussianPic, 0, stride * (maxHeight + > (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); > +memset(curFrame->m_thetaPic, 0, stride * (maxHeight + > (curFrame->m_fencPic->m_lumaMarginY * 2)) 
* sizeof(pixel)); > + > pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0]; > -pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + > curFrame->m_fencPic->m_lumaMarginX; > -pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + > curFrame->m_fencPic->m_lumaMarginX; > -pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride > + curFrame->m_fencPic->m_lumaMarginX; > +pixel *edgePic = curFrame->m_edgePic + > curFrame->m_fencPic->m_lumaMarginY * stride + > curFrame->m_fencPic->m_lumaMarginX; > +pixel *refPic = curFrame->m_gaussianPic + >
Re: [x265] [x265 patch] Fix: Performance drop in aq-mode 4
This should work. # HG changeset patch # User Akil Ayyappan # Date 1570778152 -19800 # Fri Oct 11 12:45:52 2019 +0530 # Branch Release_3.2 # Node ID efe5ac3c25dac009efbffaf5ed5e54734a02f812 # Parent 377cb2b0c3698342008a9304e8e7f5bedcf3f1f4 Fix: Performance drop in aq-mode 4 This patch moves the memory handling part of the edge information required for aq-mode 4 to the Frame class-level in that way it can be reused by the threads. diff -r 377cb2b0c369 -r efe5ac3c25da source/common/frame.cpp --- a/source/common/frame.cpp Tue Sep 24 15:02:05 2019 +0530 +++ b/source/common/frame.cpp Fri Oct 11 12:45:52 2019 +0530 @@ -57,6 +57,9 @@ m_addOnPrevChange = NULL; m_classifyFrame = false; m_fieldNum = 0; +m_edgePic = NULL; +m_gaussianPic = NULL; +m_thetaPic = NULL; } bool Frame::create(x265_param *param, float* quantOffsets) @@ -97,6 +100,20 @@ CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size); } +if (param->rc.aqMode == X265_AQ_EDGE || (param->rc.zonefileCount && param->rc.aqMode != 0)) +{ +uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize - 1) / param->maxCUSize; +uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize; +uint32_t m_lumaMarginX = param->maxCUSize + 32; // search margin and 8-tap filter half-length, padded for 32-byte alignment +uint32_t m_lumaMarginY = param->maxCUSize + 16; // margin for 8-tap filter and infinite padding +intptr_t m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1); +int maxHeight = numCuInHeight * param->maxCUSize; + +m_edgePic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); +m_gaussianPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); +m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); +} + if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize)) { X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized"); @@ -242,4 +259,11 @@ 
X265_FREE_ZERO(m_classifyVariance); X265_FREE_ZERO(m_classifyCount); } + +if (m_param->rc.aqMode == X265_AQ_EDGE || (m_param->rc.zonefileCount && m_param->rc.aqMode != 0)) +{ +X265_FREE(m_edgePic); +X265_FREE(m_gaussianPic); +X265_FREE(m_thetaPic); +} } diff -r 377cb2b0c369 -r efe5ac3c25da source/common/frame.h --- a/source/common/frame.h Tue Sep 24 15:02:05 2019 +0530 +++ b/source/common/frame.h Fri Oct 11 12:45:52 2019 +0530 @@ -131,6 +131,11 @@ bool m_classifyFrame; int m_fieldNum; +/* aq-mode 4 : Gaussian, edge and theta frames for edge information */ +pixel* m_edgePic; +pixel* m_gaussianPic; +pixel* m_thetaPic; + Frame(); bool create(x265_param *param, float* quantOffsets); diff -r 377cb2b0c369 -r efe5ac3c25da source/encoder/slicetype.cpp --- a/source/encoder/slicetype.cpp Tue Sep 24 15:02:05 2019 +0530 +++ b/source/encoder/slicetype.cpp Fri Oct 11 12:45:52 2019 +0530 @@ -85,12 +85,22 @@ } // end anonymous namespace -void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3, intptr_t stride, int height, int width) +void edgeFilter(Frame *curFrame, x265_param* param) { +int height = curFrame->m_fencPic->m_picHeight; +int width = curFrame->m_fencPic->m_picWidth; +intptr_t stride = curFrame->m_fencPic->m_stride; +uint32_t numCuInHeight = (height + param->maxCUSize - 1) / param->maxCUSize; +int maxHeight = numCuInHeight * param->maxCUSize; + +memset(curFrame->m_edgePic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); +memset(curFrame->m_gaussianPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); +memset(curFrame->m_thetaPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); + pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0]; -pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; -pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; 
-pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; +pixel *edgePic = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; +pixel *refPic = curFrame->m_gaussianPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; +pixel *edgeTheta = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; for (int i = 0; i < height; i++) { @@ -103,7 +113,7 @@ //Applying Gaussian filter on the picture src =
Re: [x265] [x265 patch] Fix: Performance drop in aq-mode 4
The patch is not applying on Release_3.2. Please rebase and send the patch. On Thu, Oct 10, 2019 at 2:31 PM Akil wrote: > # HG changeset patch > # User Akil Ayyappan > # Date 1570594514 -19800 > # Wed Oct 09 09:45:14 2019 +0530 > # Node ID b66d88859a528ae80f6f19eae7553fe7fcdb88e6 > # Parent 354901970679c787efdfdcc6577228e9c06785cf > Fix: Performance drop in aq-mode 4 > > This patch moves the memory allocation part of the edge information > required for aq-mode 4 > to the Frame class-level in that way it can be reused by the threads. > > diff -r 354901970679 -r b66d88859a52 source/common/frame.cpp > --- a/source/common/frame.cpp Fri Sep 13 15:57:26 2019 +0530 > +++ b/source/common/frame.cpp Wed Oct 09 09:45:14 2019 +0530 > @@ -58,6 +58,9 @@ > m_classifyFrame = false; > m_fieldNum = 0; > m_picStruct = 0; > +m_edgePic = NULL; > +m_gaussianPic = NULL; > +m_thetaPic = NULL; > } > > bool Frame::create(x265_param *param, float* quantOffsets) > @@ -98,6 +101,20 @@ > CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size); > } > > +if (param->rc.aqMode == X265_AQ_EDGE || (param->rc.zonefileCount && > param->rc.aqMode != 0)) > +{ > +uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize - > 1) / param->maxCUSize; > +uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize > - 1) / param->maxCUSize; > +uint32_t m_lumaMarginX = param->maxCUSize + 32; // search margin > and 8-tap filter half-length, padded for 32-byte alignment > +uint32_t m_lumaMarginY = param->maxCUSize + 16; // margin for > 8-tap filter and infinite padding > +intptr_t m_stride = (numCuInWidth * param->maxCUSize) + > (m_lumaMarginX << 1); > +int maxHeight = numCuInHeight * param->maxCUSize; > + > +m_edgePic = X265_MALLOC(pixel, m_stride * (maxHeight + > (m_lumaMarginY * 2))); > +m_gaussianPic = X265_MALLOC(pixel, m_stride * (maxHeight + > (m_lumaMarginY * 2))); > +m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight + > (m_lumaMarginY * 2))); > +} > + > if (m_fencPic->create(param, 
!!m_param->bCopyPicToFrame) && > m_lowres.create(param, m_fencPic, param->rc.qgSize)) > { > X265_CHECK((m_reconColCount == NULL), "m_reconColCount was > initialized"); > @@ -243,4 +260,11 @@ > X265_FREE_ZERO(m_classifyVariance); > X265_FREE_ZERO(m_classifyCount); > } > + > +if (m_param->rc.aqMode == X265_AQ_EDGE || (m_param->rc.zonefileCount > && m_param->rc.aqMode != 0)) > +{ > +X265_FREE(m_edgePic); > +X265_FREE(m_gaussianPic); > +X265_FREE(m_thetaPic); > +} > } > diff -r 354901970679 -r b66d88859a52 source/common/frame.h > --- a/source/common/frame.h Fri Sep 13 15:57:26 2019 +0530 > +++ b/source/common/frame.h Wed Oct 09 09:45:14 2019 +0530 > @@ -132,6 +132,11 @@ > bool m_classifyFrame; > int m_fieldNum; > > +/* aq-mode 4 : Gaussian, edge and theta frames for edge information */ > +pixel* m_edgePic; > +pixel* m_gaussianPic; > +pixel* m_thetaPic; > + > Frame(); > > bool create(x265_param *param, float* quantOffsets); > diff -r 354901970679 -r b66d88859a52 source/encoder/slicetype.cpp > --- a/source/encoder/slicetype.cpp Fri Sep 13 15:57:26 2019 +0530 > +++ b/source/encoder/slicetype.cpp Wed Oct 09 09:45:14 2019 +0530 > @@ -85,12 +85,22 @@ > > } // end anonymous namespace > > -void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3, > intptr_t stride, int height, int width) > +void edgeFilter(Frame *curFrame, x265_param* param) > { > +int height = curFrame->m_fencPic->m_picHeight; > +int width = curFrame->m_fencPic->m_picWidth; > +intptr_t stride = curFrame->m_fencPic->m_stride; > +uint32_t numCuInHeight = (height + param->maxCUSize - 1) / > param->maxCUSize; > +int maxHeight = numCuInHeight * param->maxCUSize; > + > +memset(curFrame->m_edgePic, 0, stride * (maxHeight + > (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); > +memset(curFrame->m_gaussianPic, 0, stride * (maxHeight + > (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); > +memset(curFrame->m_thetaPic, 0, stride * (maxHeight + > (curFrame->m_fencPic->m_lumaMarginY * 2)) 
* sizeof(pixel)); > + > pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0]; > -pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + > curFrame->m_fencPic->m_lumaMarginX; > -pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + > curFrame->m_fencPic->m_lumaMarginX; > -pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride > + curFrame->m_fencPic->m_lumaMarginX; > +pixel *edgePic = curFrame->m_edgePic + > curFrame->m_fencPic->m_lumaMarginY * stride + > curFrame->m_fencPic->m_lumaMarginX; > +pixel *refPic = curFrame->m_gaussianPic + >