# HG changeset patch # User Akil Ayyappan<a...@multicorewareinc.com> # Date 1570594514 -19800 # Wed Oct 09 09:45:14 2019 +0530 # Node ID b66d88859a528ae80f6f19eae7553fe7fcdb88e6 # Parent 354901970679c787efdfdcc6577228e9c06785cf Fix: Performance drop in aq-mode 4
This patch moves the memory allocation for the edge information required by aq-mode 4 to the Frame class level, so that the buffers are allocated once per Frame and can be reused by the threads. diff -r 354901970679 -r b66d88859a52 source/common/frame.cpp --- a/source/common/frame.cpp Fri Sep 13 15:57:26 2019 +0530 +++ b/source/common/frame.cpp Wed Oct 09 09:45:14 2019 +0530 @@ -58,6 +58,9 @@ m_classifyFrame = false; m_fieldNum = 0; m_picStruct = 0; + m_edgePic = NULL; + m_gaussianPic = NULL; + m_thetaPic = NULL; } bool Frame::create(x265_param *param, float* quantOffsets) @@ -98,6 +101,20 @@ CHECKED_MALLOC_ZERO(m_classifyCount, uint32_t, size); } + if (param->rc.aqMode == X265_AQ_EDGE || (param->rc.zonefileCount && param->rc.aqMode != 0)) + { + uint32_t numCuInWidth = (param->sourceWidth + param->maxCUSize - 1) / param->maxCUSize; + uint32_t numCuInHeight = (param->sourceHeight + param->maxCUSize - 1) / param->maxCUSize; + uint32_t m_lumaMarginX = param->maxCUSize + 32; // search margin and 8-tap filter half-length, padded for 32-byte alignment + uint32_t m_lumaMarginY = param->maxCUSize + 16; // margin for 8-tap filter and infinite padding + intptr_t m_stride = (numCuInWidth * param->maxCUSize) + (m_lumaMarginX << 1); + int maxHeight = numCuInHeight * param->maxCUSize; + + m_edgePic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); + m_gaussianPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); + m_thetaPic = X265_MALLOC(pixel, m_stride * (maxHeight + (m_lumaMarginY * 2))); + } + if (m_fencPic->create(param, !!m_param->bCopyPicToFrame) && m_lowres.create(param, m_fencPic, param->rc.qgSize)) { X265_CHECK((m_reconColCount == NULL), "m_reconColCount was initialized"); @@ -243,4 +260,11 @@ X265_FREE_ZERO(m_classifyVariance); X265_FREE_ZERO(m_classifyCount); } + + if (m_param->rc.aqMode == X265_AQ_EDGE || (m_param->rc.zonefileCount && m_param->rc.aqMode != 0)) + { + X265_FREE(m_edgePic); + X265_FREE(m_gaussianPic); + X265_FREE(m_thetaPic); + } } diff -r 
354901970679 -r b66d88859a52 source/common/frame.h --- a/source/common/frame.h Fri Sep 13 15:57:26 2019 +0530 +++ b/source/common/frame.h Wed Oct 09 09:45:14 2019 +0530 @@ -132,6 +132,11 @@ bool m_classifyFrame; int m_fieldNum; + /* aq-mode 4 : Gaussian, edge and theta frames for edge information */ + pixel* m_edgePic; + pixel* m_gaussianPic; + pixel* m_thetaPic; + Frame(); bool create(x265_param *param, float* quantOffsets); diff -r 354901970679 -r b66d88859a52 source/encoder/slicetype.cpp --- a/source/encoder/slicetype.cpp Fri Sep 13 15:57:26 2019 +0530 +++ b/source/encoder/slicetype.cpp Wed Oct 09 09:45:14 2019 +0530 @@ -85,12 +85,22 @@ } // end anonymous namespace -void edgeFilter(Frame *curFrame, pixel *pic1, pixel *pic2, pixel *pic3, intptr_t stride, int height, int width) +void edgeFilter(Frame *curFrame, x265_param* param) { + int height = curFrame->m_fencPic->m_picHeight; + int width = curFrame->m_fencPic->m_picWidth; + intptr_t stride = curFrame->m_fencPic->m_stride; + uint32_t numCuInHeight = (height + param->maxCUSize - 1) / param->maxCUSize; + int maxHeight = numCuInHeight * param->maxCUSize; + + memset(curFrame->m_edgePic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); + memset(curFrame->m_gaussianPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); + memset(curFrame->m_thetaPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); + pixel *src = (pixel*)curFrame->m_fencPic->m_picOrg[0]; - pixel *edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; - pixel *refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; - pixel *edgeTheta = pic3 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; + pixel *edgePic = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; + pixel *refPic = 
curFrame->m_gaussianPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; + pixel *edgeTheta = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; for (int i = 0; i < height; i++) { @@ -103,7 +113,7 @@ //Applying Gaussian filter on the picture src = (pixel*)curFrame->m_fencPic->m_picOrg[0]; - refPic = pic2 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; + refPic = curFrame->m_gaussianPic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; pixel pixelValue = 0; for (int rowNum = 0; rowNum < height; rowNum++) @@ -148,7 +158,7 @@ float gradientH = 0, gradientV = 0, radians = 0, theta = 0; float gradientMagnitude = 0; pixel blackPixel = 0; - edgePic = pic1 + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; + edgePic = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * stride + curFrame->m_fencPic->m_lumaMarginX; //Applying Sobel filter on the gaussian filtered picture for (int rowNum = 0; rowNum < height; rowNum++) { @@ -198,8 +208,10 @@ angle = sum / (size*size); } -uint32_t LookaheadTLD::edgeDensityCu(Frame* curFrame,pixel *edgeImage, pixel *edgeTheta, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize) +uint32_t LookaheadTLD::edgeDensityCu(Frame* curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize) { + pixel *edgeImage = curFrame->m_edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX; + pixel *edgeTheta = curFrame->m_thetaPic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX; intptr_t srcStride = curFrame->m_fencPic->m_stride; intptr_t blockOffsetLuma = blockX + (blockY * srcStride); int plane = 0; // Sobel filter is applied only on Y component @@ -478,31 +490,14 @@ } else { -#define AQ_EDGE_BIAS 0.5 -#define 
EDGE_INCLINATION 45 - - pixel *edgePic = NULL; - pixel *gaussianPic = NULL; - pixel *thetaPic = NULL; - - if (param->rc.aqMode == X265_AQ_EDGE) - { - uint32_t numCuInHeight = (maxRow + param->maxCUSize - 1) / param->maxCUSize; - int maxHeight = numCuInHeight * param->maxCUSize; - intptr_t stride = curFrame->m_fencPic->m_stride; - edgePic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2))); - gaussianPic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2))); - thetaPic = X265_MALLOC(pixel, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2))); - memset(edgePic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); - memset(gaussianPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); - memset(thetaPic, 0, stride * (maxHeight + (curFrame->m_fencPic->m_lumaMarginY * 2)) * sizeof(pixel)); - edgeFilter(curFrame, edgePic, gaussianPic, thetaPic, stride, maxRow, maxCol); - } - int blockXY = 0, inclinedEdge = 0; double avg_adj_pow2 = 0, avg_adj = 0, qp_adj = 0; double bias_strength = 0.f; double strength = 0.f; + + if (param->rc.aqMode == X265_AQ_EDGE) + edgeFilter(curFrame, param); + if (param->rc.aqMode == X265_AQ_AUTO_VARIANCE || param->rc.aqMode == X265_AQ_AUTO_VARIANCE_BIASED || param->rc.aqMode == X265_AQ_EDGE) { double bit_depth_correction = 1.f / (1 << (2 * (X265_DEPTH - 8))); @@ -514,9 +509,7 @@ energy = acEnergyCu(curFrame, blockX, blockY, param->internalCsp, param->rc.qgSize); if (param->rc.aqMode == X265_AQ_EDGE) { - pixel *edgeImage = edgePic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX; - pixel *edgeTheta = thetaPic + curFrame->m_fencPic->m_lumaMarginY * curFrame->m_fencPic->m_stride + curFrame->m_fencPic->m_lumaMarginX; - edgeDensity = edgeDensityCu(curFrame, edgeImage, edgeTheta, avgAngle, blockX, blockY, param->rc.qgSize); + edgeDensity = 
edgeDensityCu(curFrame, avgAngle, blockX, blockY, param->rc.qgSize); if (edgeDensity) { qp_adj = pow(edgeDensity * bit_depth_correction + 1, 0.1); @@ -549,13 +542,6 @@ else strength = param->rc.aqStrength * 1.0397f; - if (param->rc.aqMode == X265_AQ_EDGE) - { - X265_FREE(edgePic); - X265_FREE(gaussianPic); - X265_FREE(thetaPic); - } - blockXY = 0; for (int blockY = 0; blockY < maxRow; blockY += loopIncr) { diff -r 354901970679 -r b66d88859a52 source/encoder/slicetype.h --- a/source/encoder/slicetype.h Fri Sep 13 15:57:26 2019 +0530 +++ b/source/encoder/slicetype.h Wed Oct 09 09:45:14 2019 +0530 @@ -40,6 +40,8 @@ #define LOWRES_COST_MASK ((1 << 14) - 1) #define LOWRES_COST_SHIFT 14 +#define AQ_EDGE_BIAS 0.5 +#define EDGE_INCLINATION 45 /* Thread local data for lookahead tasks */ struct LookaheadTLD @@ -92,7 +94,7 @@ protected: uint32_t acEnergyCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, int csp, uint32_t qgSize); - uint32_t edgeDensityCu(Frame*curFrame, pixel *edgeImage, pixel *edgeTheta, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize); + uint32_t edgeDensityCu(Frame*curFrame, uint32_t &avgAngle, uint32_t blockX, uint32_t blockY, uint32_t qgSize); uint32_t lumaSumCu(Frame* curFrame, uint32_t blockX, uint32_t blockY, uint32_t qgSize); uint32_t weightCostLuma(Lowres& fenc, Lowres& ref, WeightParam& wp); bool allocWeightedRef(Lowres& fenc); diff -r 354901970679 -r b66d88859a52 source/test/regression-tests.txt --- a/source/test/regression-tests.txt Fri Sep 13 15:57:26 2019 +0530 +++ b/source/test/regression-tests.txt Wed Oct 09 09:45:14 2019 +0530 @@ -154,7 +154,7 @@ BasketballDrive_1920x1080_50.y4m, --preset medium --no-open-gop --keyint 50 --min-keyint 50 --radl 2 --vbv-maxrate 5000 --vbv-bufsize 5000 big_buck_bunny_360p24.y4m, --bitrate 500 --fades 720p50_parkrun_ter.y4m,--preset medium --bitrate 400 --hme -ducks_take_off_420_1_720p50.y4m,--preset medium --aq-mode 4 --crf 22 --no-cutree +ducks_take_off_420_720p50.y4m,--preset medium 
--aq-mode 4 --crf 22 --no-cutree ducks_take_off_420_1_720p50.y4m,--preset medium --selective-sao 4 --sao --crf 20 Traffic_4096x2048_30p.y4m, --preset medium --frame-dup --dup-threshold 60 --hrd --bitrate 10000 --vbv-bufsize 15000 --vbv-maxrate 12000 Thanks & Regards *Akil R* Video Codec Engineer Media & AI Analytics <https://multicorewareinc.com/>
AQ_4_fix_latest.patch
Description: Binary data
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel