[x265] refine partition size related
# HG changeset patch # User Satoshi Nakagawa nakagawa...@oki.com # Date 1406011990 -32400 # Tue Jul 22 15:53:10 2014 +0900 # Node ID b2ad081e4bfc20bbc84e8bfbab59ed52aeac2a73 # Parent d303b4d860e9f06396a156726dd518d0f41fe796 refine partition size related - reorder LumaPartitions to simplify partitionFromLog2Size() - remove unused diff -r d303b4d860e9 -r b2ad081e4bfc source/Lib/TLibCommon/TComYuv.cpp --- a/source/Lib/TLibCommon/TComYuv.cpp Mon Jul 21 22:43:38 2014 -0500 +++ b/source/Lib/TLibCommon/TComYuv.cpp Tue Jul 22 15:53:10 2014 +0900 @@ -127,6 +127,15 @@ primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], getCStride(), srcV, srcPicYuv-getCStride()); } +void TComYuv::copyFromYuv(TComYuv* srcYuv) +{ +X265_CHECK(m_width = srcYuv-m_width m_height = srcYuv-m_height, invalid size\n); + +primitives.luma_copy_pp[m_part](m_buf[0], m_width, srcYuv-m_buf[0], srcYuv-m_width); +primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_cwidth, srcYuv-m_buf[1], srcYuv-m_cwidth); +primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_cwidth, srcYuv-m_buf[2], srcYuv-m_cwidth); +} + void TComYuv::copyToPartYuv(TComYuv* dstPicYuv, uint32_t partIdx) { pixel* dstY = dstPicYuv-getLumaAddr(partIdx); @@ -156,50 +165,9 @@ primitives.chroma[m_csp].copy_pp[part](dstV, dstPicYuv-getCStride(), srcV, getCStride()); } -void TComYuv::copyPartToPartYuv(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma) +void TComYuv::addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t log2Size) { -int part = partitionFromSizes(width, height); - -X265_CHECK(width != 4 || height != 4, 4x4 partition detected\n); - -if (bLuma) -{ -pixel* src = getLumaAddr(partIdx); -pixel* dst = dstPicYuv-getLumaAddr(partIdx); - -uint32_t srcstride = getStride(); -uint32_t dststride = dstPicYuv-getStride(); - -primitives.luma_copy_pp[part](dst, dststride, src, srcstride); -} -if (bChroma) -{ -pixel* srcU = getCbAddr(partIdx); -pixel* srcV = getCrAddr(partIdx); -pixel* dstU = 
dstPicYuv-getCbAddr(partIdx); -pixel* dstV = dstPicYuv-getCrAddr(partIdx); - -uint32_t srcstride = getCStride(); -uint32_t dststride = dstPicYuv-getCStride(); - -primitives.chroma[m_csp].copy_pp[part](dstU, dststride, srcU, srcstride); -primitives.chroma[m_csp].copy_pp[part](dstV, dststride, srcV, srcstride); -} -} - -void TComYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, uint32_t lumaSize) -{ -int part = partitionFromSize(lumaSize); - -int16_t* dst = dstPicYuv-getLumaAddr(partIdx); -uint32_t dststride = dstPicYuv-m_width; - -primitives.luma_copy_ps[part](dst, dststride, getLumaAddr(partIdx), getStride()); -} - -void TComYuv::addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t partSize) -{ -int part = partitionFromSize(partSize); +int part = partitionFromLog2Size(log2Size); addClipLuma(srcYuv0, srcYuv1, part); addClipChroma(srcYuv0, srcYuv1, part); @@ -235,113 +203,32 @@ primitives.chroma[m_csp].add_ps[part](dstV, dststride, srcV0, srcV1, src0Stride, src1Stride); } -void TComYuv::addAvg(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t partUnitIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma) -{ -int x, y; -uint32_t src0Stride, src1Stride, dststride; -int shiftNum, offset; - -pixel* srcY0 = srcYuv0-getLumaAddr(partUnitIdx); -pixel* srcU0 = srcYuv0-getCbAddr(partUnitIdx); -pixel* srcV0 = srcYuv0-getCrAddr(partUnitIdx); - -pixel* srcY1 = srcYuv1-getLumaAddr(partUnitIdx); -pixel* srcU1 = srcYuv1-getCbAddr(partUnitIdx); -pixel* srcV1 = srcYuv1-getCrAddr(partUnitIdx); - -pixel* dstY = getLumaAddr(partUnitIdx); -pixel* dstU = getCbAddr(partUnitIdx); -pixel* dstV = getCrAddr(partUnitIdx); - -if (bLuma) -{ -src0Stride = srcYuv0-getStride(); -src1Stride = srcYuv1-getStride(); -dststride = getStride(); -shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH; -offset = (1 (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; - -for (y = 0; y height; y++) -{ -for (x = 0; x width; x += 4) -{ -dstY[x + 0] = Clip((srcY0[x + 0] + srcY1[x + 0] + offset) shiftNum); -dstY[x + 1] 
= Clip((srcY0[x + 1] + srcY1[x + 1] + offset) shiftNum); -dstY[x + 2] = Clip((srcY0[x + 2] + srcY1[x + 2] + offset) shiftNum); -dstY[x + 3] = Clip((srcY0[x + 3] + srcY1[x + 3] + offset) shiftNum); -} - -srcY0 += src0Stride; -srcY1 += src1Stride; -dstY += dststride; -} -} -if (bChroma) -{ -shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH; -offset = (1 (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS; - -src0Stride = srcYuv0-getCStride(); -src1Stride =
Re: [x265] Custom LowRes scale
I can do that :) Do you have a standard way to generate these figures? (Video, options?) Or shall I just generate a couple of figures to put in the commit? On 07/21/2014 06:16 PM, Deepthi Nandakumar wrote: Thanks, this is certainly an enhancement to x265 lookahead. We would be interested in this - especially if you can also include some efficiency (bitrate vs SSIM) metrics that describe the penalty moving from X265_LOWRES_SCALE of 4 to higher scales. On Mon, Jul 21, 2014 at 8:49 PM, Nicolas Morey-Chaisemartin nmo...@kalray.eu wrote: Hi, We recently profiled x265 pre-analysis to estimate what performance we could reach using our accelerator and I was quite disappointed by the performance. When running on a Core-i7 with AVX at roughly 2.7GHz, we barely reached the 30fps mark using ultrafast preset on a 4K video. After a little bit of browsing I realized that work in LowRes is always done at 1/4th of the final resolution which seems fair but requires a huge amount of work for 4K. It seemed straightforward enough to change the divider at LowRes initialization but it seems there are a lot of hard-coded values that depend both on the LowRes divider and the LowRes CU Size. Here's a patch (definitely not applicable like this but just to give an idea of where I'm going) that seems to fix most of the hard-coded values. It still works with a X265_LOWRES_SCALE of 4 and the perf is definitely improving (29fps => 40fps on a 2048x1024 medium preset on a E5504). Would you be interested in a clean version of this? At least the hard-coded CU_SIZE part? IMHO it would be better to have a dynamic value for LowRes depending on preset (or equivalent) and the input resolution... 1/4th is fast enough in HD not to be an issue but for RT stream in 4K or more, 1/16 will be compulsory. 
Nicolas --- x265/source/common/common.h | 1 + x265/source/common/lowres.cpp| 4 ++-- x265/source/encoder/frameencoder.cpp | 7 --- x265/source/encoder/ratecontrol.cpp | 16 x265/source/encoder/slicetype.cpp| 8 5 files changed, 19 insertions(+), 17 deletions(-) diff --git a/x265/source/common/common.h b/x265/source/common/common.h index 06f60e7..00e73fc 100644 --- a/x265/source/common/common.h +++ b/x265/source/common/common.h @@ -156,6 +156,7 @@ typedef int32_t coeff_t; // transform coefficient // high cost estimates (intra and inter both suffer) #define X265_LOWRES_CU_SIZE 8 #define X265_LOWRES_CU_BITS 3 +#define X265_LOWRES_SCALE 2 #define X265_MALLOC(type, count)(type*)x265_malloc(sizeof(type) * (count)) #define X265_FREE(ptr) x265_free(ptr) diff --git a/x265/source/common/lowres.cpp b/x265/source/common/lowres.cpp index 5fc2f6b..6138023 100644 --- a/x265/source/common/lowres.cpp +++ b/x265/source/common/lowres.cpp @@ -31,8 +31,8 @@ bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled) { isLowres = true; bframes = _bframes; -width = orig-getWidth() / 2; -lines = orig-getHeight() / 2; +width = orig-getWidth() / X265_LOWRES_SCALE; +lines = orig-getHeight() / X265_LOWRES_SCALE; lumaStride = width + 2 * orig-getLumaMarginX(); if (lumaStride 31) lumaStride += 32 - (lumaStride 31); diff --git a/x265/source/encoder/frameencoder.cpp b/x265/source/encoder/ frameencoder.cpp index 8c3ee26..7213f60 100644 --- a/x265/source/encoder/frameencoder.cpp +++ b/x265/source/encoder/frameencoder.cpp @@ -1300,9 +1300,10 @@ int FrameEncoder::calcQpForCu(uint32_t cuAddr, double baseQp) /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. 
*/ double qp_offset = 0; -int maxBlockCols = (m_frame-getPicYuvOrg()-getWidth() + (16 - 1)) / 16; -int maxBlockRows = (m_frame-getPicYuvOrg()-getHeight() + (16 - 1)) / 16; -int noOfBlocks = g_maxCUSize / 16; +int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE); +int maxBlockCols = (m_frame-getPicYuvOrg()-getWidth() + (lowResCu - 1)) / lowResCu; +int maxBlockRows = (m_frame-getPicYuvOrg()-getHeight() + (lowResCu - 1)) / lowResCu; +int noOfBlocks = g_maxCUSize / lowResCu; int block_y = (cuAddr / m_frame-getPicSym()-getFrameWidthInCU()) * noOfBlocks; int block_x = (cuAddr * noOfBlocks) - block_y * m_frame-getPicSym()- getFrameWidthInCU(); diff --git a/x265/source/encoder/ratecontrol.cpp b/x265/source/encoder/ ratecontrol.cpp index 4358994..5fcc27a 100644 --- a/x265/source/encoder/ratecontrol.cpp +++ b/x265/source/encoder/ratecontrol.cpp @@ -161,8 +161,8 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic) if (m_param-rc.aqMode == X265_AQ_NONE || m_param-rc.aqStrength == 0) { /* Need to init it anyways for CU tree */ -int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) X265_LOWRES_CU_BITS; -int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) X265_LOWRES_CU_BITS; +int cuWidth = ((maxCol / X265_LOWRES_SCALE) +
Re: [x265] refine partition size related
On 07/22, Satoshi Nakagawa wrote: # HG changeset patch # User Satoshi Nakagawa nakagawa...@oki.com # Date 1406011990 -32400 # Tue Jul 22 15:53:10 2014 +0900 # Node ID b2ad081e4bfc20bbc84e8bfbab59ed52aeac2a73 # Parent d303b4d860e9f06396a156726dd518d0f41fe796 refine partition size related - reorder LumaPartitions to simplify partitionFromLog2Size() - remove unused Queued for testing, thanks. One question below: diff -r d303b4d860e9 -r b2ad081e4bfc source/Lib/TLibCommon/TComYuv.cpp --- a/source/Lib/TLibCommon/TComYuv.cpp Mon Jul 21 22:43:38 2014 -0500 +++ b/source/Lib/TLibCommon/TComYuv.cpp Tue Jul 22 15:53:10 2014 +0900 @@ -127,6 +127,15 @@ snip diff -r d303b4d860e9 -r b2ad081e4bfc source/test/testbench.cpp --- a/source/test/testbench.cpp Mon Jul 21 22:43:38 2014 -0500 +++ b/source/test/testbench.cpp Tue Jul 22 15:53:10 2014 +0900 @@ -127,6 +127,7 @@ EncoderPrimitives cprim; memset(cprim, 0, sizeof(EncoderPrimitives)); Setup_C_Primitives(cprim); +Setup_Alias_Primitives(cprim); struct test_arch_t { @@ -186,6 +187,7 @@ memset(optprim, 0, sizeof(optprim)); Setup_Instrinsic_Primitives(optprim, cpuid); Setup_Assembly_Primitives(optprim, cpuid); +Setup_Alias_Primitives(optprim); is there a reason to test the aliased functions, since by their nature they should already be being tested via another function pointer? -- Steve Borho ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] Custom LowRes scale
On 07/21/2014 07:11 PM, Steve Borho wrote: Interesting. I imagine much 4k content would work decently well even with further downscaling of the lookahead pictures. The lowres motion vectors are used in weight analysis as well, so that file would need to be updated. I'll have a look at it. It doesn't seem as straightforward as the other files though. While we're talking about lowres MV: from what I could gather they are not used during the motionSearch on the full res picture. As a lot of time is spent finding those, wouldn't it be useful to add them as candidates in the fullres search? Nicolas ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] refine partition size related
To find non-optimized functions, and which functions can be aliased. I think many 4:2:2 functions can be aliased. -Original Message- From: x265-devel [mailto:x265-devel-boun...@videolan.org] On Behalf Of Steve Borho Sent: Tuesday, July 22, 2014 4:16 PM To: Development for x265 Subject: Re: [x265] refine partition size related On 07/22, Satoshi Nakagawa wrote: # HG changeset patch # User Satoshi Nakagawa nakagawa...@oki.com # Date 1406011990 -32400 # Tue Jul 22 15:53:10 2014 +0900 # Node ID b2ad081e4bfc20bbc84e8bfbab59ed52aeac2a73 # Parent d303b4d860e9f06396a156726dd518d0f41fe796 refine partition size related - reorder LumaPartitions to simplify partitionFromLog2Size() - remove unused Queued for testing, thanks. One question below: diff -r d303b4d860e9 -r b2ad081e4bfc source/Lib/TLibCommon/TComYuv.cpp --- a/source/Lib/TLibCommon/TComYuv.cpp Mon Jul 21 22:43:38 2014 -0500 +++ b/source/Lib/TLibCommon/TComYuv.cpp Tue Jul 22 15:53:10 2014 +0900 @@ -127,6 +127,15 @@ snip diff -r d303b4d860e9 -r b2ad081e4bfc source/test/testbench.cpp --- a/source/test/testbench.cpp Mon Jul 21 22:43:38 2014 -0500 +++ b/source/test/testbench.cpp Tue Jul 22 15:53:10 2014 +0900 @@ -127,6 +127,7 @@ EncoderPrimitives cprim; memset(cprim, 0, sizeof(EncoderPrimitives)); Setup_C_Primitives(cprim); +Setup_Alias_Primitives(cprim); struct test_arch_t { @@ -186,6 +187,7 @@ memset(optprim, 0, sizeof(optprim)); Setup_Instrinsic_Primitives(optprim, cpuid); Setup_Assembly_Primitives(optprim, cpuid); +Setup_Alias_Primitives(optprim); is there a reason to test the aliased functions, since by their nature they should already be being tested via another function pointer? -- Steve Borho ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] refine partition size related
On 07/22, Satoshi Nakagawa wrote: To find non-optimized functions, and which functions can be aliased. I think many 4:2:2 functions can be aliased. ok, fair enough. -- Steve Borho ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] Custom LowRes scale
On 07/22, Nicolas Morey-Chaisemartin wrote: On 07/21/2014 07:11 PM, Steve Borho wrote: Interesting. I imagine much 4k content would work decently well even with further downscaling of the lookahead pictures. The lowres motion vectors are used in weight analysis as well, so that file would need to be updated. I'll have a look at it. It doesn't seem as straightforward as the other files though. it is slightly more complicated; you'll want to scale up the block sizes used for motion-compensated weight analysis - up to 32x32 or 64x64 based on how much further you downscale the lowres in lookahead. While we're talking about lowres MV: from what I could gather they are not used during the motionSearch on the full res picture. As a lot of time is spent finding those, wouldn't it be useful to add them as candidates in the fullres search? This has been on my TODO list for ages; a couple of people have claimed they've tried it and it hasn't helped as much as you might think. But I haven't had a working patch in hand to verify it. The AMVP fixup after motion search, where we get to go shopping for a better MVP after the search, often makes extra motion candidates superfluous. -- Steve Borho ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] Custom LowRes scale
On 07/22/2014 10:08 AM, Steve Borho wrote: On 07/22, Nicolas Morey-Chaisemartin wrote: I'll have a look at it. It doesn't seem as straightforward as the other files though. it is slightly more complicated; you'll want to scale up the block sizes used for motion-compensated weight analysis - up to 32x32 or 64x64 based on how much further you downscale the lowres in lookahead. Is there a clean way to get a LUMA_NNxNN value from a block size? Should I handle blocks larger than 64x64 by looping on the 64x64 blocks? Or simply add a check at lowres init that the fullres CU size is <= 64? While we're talking about lowres MV: from what I could gather they are not used during the motionSearch on the full res picture. As a lot of time is spent finding those, wouldn't it be useful to add them as candidates in the fullres search? This has been on my TODO list for ages; a couple of people have claimed they've tried it and it hasn't helped as much as you might think. But I haven't had a working patch in hand to verify it. The AMVP fixup after motion search, where we get to go shopping for a better MVP after the search, often makes extra motion candidates superfluous. I started working on this yesterday for our accelerator but I got carried away on lowres scaling. I haven't any results yet but I'll post them as soon as I have some. By the way, lowres MV are in lowres luma pixels, right? So I'll need to scale the vector by 2 to get the full MV? Nicolas ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] lowres: Enhanced scaling
# HG changeset patch # User Nicolas Morey-Chaisemartin nmo...@kalray.eu # Date 1406020650 -7200 # Tue Jul 22 11:17:30 2014 +0200 # Node ID fc75f5f4f85e0d9441dc73b09ec60a36c20f # Parent 4c9ce4db74d1c9768abc61290bd1bda002b79f4e lowres: Enhanced scaling * Replace hard-coded values with X265_LOWRES_CU_SIZE * Add X265_LOWRES_SCALE define to tweak divider for LowRes Note: X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE must be less than or equal to 64 to be able to use standard filters for weight prediction Performance Impact: Command Line: ./x265/build/x265 --preset medium --accel=none red_kayak_1080p-420.y4m kayak.hevc --bitrate=$BITRATE --ssim - BITRATE=4000 X265_LOWRES_SCALE=2 encoded 570 frames in 105.56s (5.40 fps), 3334.27 kb/s, SSIM Mean Y: 0.8900527 ( 9.588 dB) - BITRATE=4000 X265_LOWRES_SCALE=4 encoded 570 frames in 87.11s (6.54 fps), 3398.38 kb/s, SSIM Mean Y: 0.8836753 ( 9.343 dB) - BITRATE=4000 X265_LOWRES_SCALE=8 encoded 570 frames in 79.71s (7.15 fps), 3437.19 kb/s, SSIM Mean Y: 0.8765783 ( 9.086 dB) - BITRATE=9000 X265_LOWRES_SCALE=2 encoded 570 frames in 115.32s (4.94 fps), 7263.50 kb/s, SSIM Mean Y: 0.9272905 (11.384 dB) - BITRATE=9000 X265_LOWRES_SCALE=4 encoded 570 frames in 101.53s (5.61 fps), 7439.24 kb/s, SSIM Mean Y: 0.9209998 (11.024 dB) - BITRATE=9000 X265_LOWRES_SCALE=8 encoded 570 frames in 92.98s (6.13 fps), 7549.41 kb/s, SSIM Mean Y: 0.9160721 (10.761 dB) diff --git a/source/common/common.h b/source/common/common.h --- a/source/common/common.h +++ b/source/common/common.h @@ -153,16 +153,17 @@ typedef int32_t coeff_t; // transf // arbitrary, but low because SATD scores are 1/4 normal #define X265_LOOKAHEAD_QP (12 + QP_BD_OFFSET) #define X265_LOOKAHEAD_MAX 250 // Use the same size blocks as x264. 
Using larger blocks seems to give artificially // high cost estimates (intra and inter both suffer) #define X265_LOWRES_CU_SIZE 8 #define X265_LOWRES_CU_BITS 3 +#define X265_LOWRES_SCALE 2 #define X265_MALLOC(type, count)(type*)x265_malloc(sizeof(type) * (count)) #define X265_FREE(ptr) x265_free(ptr) #define CHECKED_MALLOC(var, type, count) \ { \ var = (type*)x265_malloc(sizeof(type) * (count)); \ if (!var) \ { \ diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp --- a/source/common/lowres.cpp +++ b/source/common/lowres.cpp @@ -24,20 +24,21 @@ #include TLibCommon/TComPicYuv.h #include lowres.h #include mv.h using namespace x265; bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled) { + X265_CHECK(X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE = 64, Invalid LowRes scaling\n); isLowres = true; bframes = _bframes; -width = orig-getWidth() / 2; -lines = orig-getHeight() / 2; +width = orig-getWidth() / X265_LOWRES_SCALE; +lines = orig-getHeight() / X265_LOWRES_SCALE; lumaStride = width + 2 * orig-getLumaMarginX(); if (lumaStride 31) lumaStride += 32 - (lumaStride 31); int cuWidth = (width + X265_LOWRES_CU_SIZE - 1) X265_LOWRES_CU_BITS; int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1) X265_LOWRES_CU_BITS; int cuCount = cuWidth * cuHeight; /* rounding the width to multiple of lowres CU size */ diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp --- a/source/encoder/frameencoder.cpp +++ b/source/encoder/frameencoder.cpp @@ -984,19 +984,20 @@ int FrameEncoder::calcQpForCu(uint32_t c if (bIsVbv) { m_frame-m_cuCostsForVbv[cuAddr] = 0; m_frame-m_intraCuCostsForVbv[cuAddr] = 0; } /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. 
*/ double qp_offset = 0; -int maxBlockCols = (m_frame-getPicYuvOrg()-getWidth() + (16 - 1)) / 16; -int maxBlockRows = (m_frame-getPicYuvOrg()-getHeight() + (16 - 1)) / 16; -int noOfBlocks = g_maxCUSize / 16; +int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE); +int maxBlockCols = (m_frame-getPicYuvOrg()-getWidth() + (lowResCu - 1)) / lowResCu; +int maxBlockRows = (m_frame-getPicYuvOrg()-getHeight() + (lowResCu - 1)) / lowResCu; +int noOfBlocks = g_maxCUSize / lowResCu; int block_y = (cuAddr / m_frame-getPicSym()-getFrameWidthInCU()) * noOfBlocks; int block_x = (cuAddr * noOfBlocks) - block_y * m_frame-getPicSym()-getFrameWidthInCU(); /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */ double *qpoffs = (m_isReferenced m_param-rc.cuTree) ? m_frame-m_lowres.qpCuTreeOffset : m_frame-m_lowres.qpAqOffset; int cnt = 0, idx = 0; for (int h = 0; h noOfBlocks block_y maxBlockRows; h++, block_y++) diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp --- a/source/encoder/ratecontrol.cpp +++ b/source/encoder/ratecontrol.cpp @@ -198,18 +198,18 @@ void RateControl::calcAdaptiveQuantFrame /* Calculate Qp offset for each 16x16 block in
[x265] Brief speed test with x265 in all presets and different versions
Dear x265 developers. I'd like to share with you a result of a brief speed test, comparing x265 in different versions (early 1.1, late 1.1, and current 1.2) in all presets with the small foreman clip. Its meaning is certainly limited, so I don't feel like sharing it in a public forum (may cause too much FUD), and I don't want to interpret it too much, except for one general result: There are changes which increased the speed during the development of v1.1, but with only an AMD Phenom-II X4, it is not yet very obvious. And the promised increase for AMD with the HADDD macro didn't happen to me, rather the opposite. Results will probably be different for FX+ CPUs. CLI: -o foreman_cif_placebo.hevc --preset %preset% --aq-mode 2 --aq-strength 1.5 --psy-rd 0.5 foreman_cif.y4m And no, I will not insist on slowing down preset medium, just for the sake of the curve's beauty. ;-) -- Fun and success! Mario *LigH* Rohkrämer mailto:cont...@ligh.de ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH] psyrdoq: implementation of psyrdoq
# HG changeset patch # User Sumalatha Polureddysumala...@multicorewareinc.com # Date 1406032149 -19800 # Node ID 37e03dcd2e4f0b5894880ff8c097bd6e11590459 # Parent d303b4d860e9f06396a156726dd518d0f41fe796 psyrdoq: implementation of psyrdoq diff -r d303b4d860e9 -r 37e03dcd2e4f source/Lib/TLibCommon/TComTrQuant.cpp --- a/source/Lib/TLibCommon/TComTrQuant.cpp Mon Jul 21 22:43:38 2014 -0500 +++ b/source/Lib/TLibCommon/TComTrQuant.cpp Tue Jul 22 17:59:09 2014 +0530 @@ -64,6 +64,8 @@ return y + ((x - y) ((x - y) (sizeof(int) * CHAR_BIT - 1))); // min(x, y) } +#define SIGN(x,y) ((x^(y 31))-(y 31)) + // // TComTrQuant class member functions // @@ -307,6 +309,8 @@ } uint32_t TComTrQuant::transformNxN(TComDataCU* cu, + pixel* fenc, + uint32_tfencStride, int16_t*residual, uint32_tstride, coeff_t*coeff, @@ -316,10 +320,10 @@ booluseTransformSkip, boolcurUseRDOQ) { +int trSize = 1 log2TrSize; if (cu-getCUTransquantBypass(absPartIdx)) { uint32_t numSig = 0; -int trSize = 1 log2TrSize; for (int k = 0; k trSize; k++) { for (int j = 0; j trSize; j++) @@ -339,6 +343,12 @@ const uint32_t sizeIdx = log2TrSize - 2; int useDST = (sizeIdx == 0 ttype == TEXT_LUMA cu-getPredictionMode(absPartIdx) == MODE_INTRA); int index = DCT_4x4 + sizeIdx - useDST; +if (psyRdoqEnabled()) +{ +// converting pixel to int and putting in separate buffer to take dct +primitives.square_copy_ps[sizeIdx](m_tmpfencBuf, MAX_CU_SIZE, fenc, fencStride); +primitives.dct[index](m_tmpfencBuf, m_tmpfencCoeff, stride); +} primitives.dct[index](residual, m_tmpCoeff, stride); if (m_nr-bNoiseReduction) { @@ -356,7 +366,7 @@ if (m_useRDOQ curUseRDOQ) { -return xRateDistOptQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx); +return xRateDistOptQuant(cu, m_tmpfencCoeff, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx); } return xQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx); } @@ -505,7 +515,7 @@ * Rate distortion optimized quantization for entropy * coding engines using probability models like CABAC */ 
-uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize, +uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* fencCoeff, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx) { uint32_t trSize = 1 log2TrSize; @@ -614,7 +624,7 @@ { level = xGetCodedLevel(costCoeff[scanPos], curCostSig, costSig[scanPos], levelDouble, maxAbsLevel, baseLevel, greaterOneBits, levelAbsBits, goRiceParam, - c1c2Idx, qbits, scaleFactor, 1); + c1c2Idx, qbits, scaleFactor, 1, srcCoeff[blkPos], fencCoeff[blkPos]); sigRateDelta[blkPos] = 0; } else @@ -631,7 +641,7 @@ curCostSig = xGetRateSigCoef(1, ctxSig); level = xGetCodedLevel(costCoeff[scanPos], curCostSig, costSig[scanPos], levelDouble, maxAbsLevel, baseLevel, greaterOneBits, levelAbsBits, goRiceParam, - c1c2Idx, qbits, scaleFactor, 0); + c1c2Idx, qbits, scaleFactor, 0, srcCoeff[blkPos], fencCoeff[blkPos]); } else { @@ -1126,7 +1136,9 @@ uint32_t c1c2Idx, int qbits, double scaleFactor, -bool last) const +bool last, +int signCoef, +int origCoef) const { uint32_t bestAbsLevel = 0; @@ -1155,7 +1167,18 @@ for (int absLevel = maxAbsLevel; absLevel = minAbsLevel; absLevel--) { X265_CHECK(fabs((double)err2 - double(levelDouble - (absLevel
[x265] [PATCH 0 of 3 ] Remove TComDataCU dependencies from prediction/MC
This will help pave the way for a much better designed Analysis and Search structure ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 1 of 3] TComPrediction: remove TComDataCU as pointer to private functions
# HG changeset patch # User Deepthi Nandakumar deep...@multicorewareinc.com # Date 1405938991 -19800 # Mon Jul 21 16:06:31 2014 +0530 # Node ID d8d26a695cf6734ad2180c4694360ef6e71ead81 # Parent e3ad03b7c4854be40730645d4fe25e56a93f3f94 TComPrediction: remove TComDataCU as pointer to private functions diff -r e3ad03b7c485 -r d8d26a695cf6 source/Lib/TLibCommon/TComPrediction.cpp --- a/source/Lib/TLibCommon/TComPrediction.cpp Tue Jul 22 13:28:54 2014 -0500 +++ b/source/Lib/TLibCommon/TComPrediction.cpp Mon Jul 21 16:06:31 2014 +0530 @@ -85,6 +85,7 @@ void TComPrediction::initTempBuff(int csp) { +m_csp = csp; m_hChromaShift = CHROMA_H_SHIFT(csp); m_vChromaShift = CHROMA_V_SHIFT(csp); @@ -262,9 +263,11 @@ MV mv = cu-getCUMvField(list)-getMv(partAddr); cu-clipMv(mv); if (bLuma) -xPredInterLumaBlk(cu, cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), partAddr, mv, width, height, shortYuv); + xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, shortYuv); if (bChroma) -xPredInterChromaBlk(cu, cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), partAddr, mv, width, height, shortYuv); + xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, shortYuv); xWeightedPredictionUni(cu, shortYuv, partAddr, width, height, list, predYuv, -1, bLuma, bChroma); } @@ -291,10 +294,12 @@ cu-clipMv(mv); if (bLuma) -xPredInterLumaBlk(cu, cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, height, outPredYuv); + xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, outPredYuv); if (bChroma) -xPredInterChromaBlk(cu, cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, height, outPredYuv); + xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), cu-getAddr(), 
cu-getZorderIdxInCU(), +partAddr, mv, width, height, outPredYuv); } void TComPrediction::xPredInterUni(TComDataCU* cu, uint32_t partAddr, int width, int height, int list, ShortYuv* outPredYuv, bool bLuma, bool bChroma) @@ -307,9 +312,11 @@ cu-clipMv(mv); if (bLuma) -xPredInterLumaBlk(cu, cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, height, outPredYuv); + xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, outPredYuv); if (bChroma) -xPredInterChromaBlk(cu, cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, height, outPredYuv); + xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, outPredYuv); } void TComPrediction::xPredInterBi(TComDataCU* cu, uint32_t partAddr, int width, int height, TComYuv* outPredYuv, bool bLuma, bool bChroma) @@ -378,7 +385,7 @@ * \param height Height of block * \param dstPic Pointer to destination picture */ -void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic) +void TComPrediction::xPredInterLumaBlk(TComPicYuv *refPic, uint32_t cuAddr, uint32_t zOrderIdxinCU, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic) { int dstStride = dstPic-getStride(); pixel *dst= dstPic-getLumaAddr(partAddr); @@ -386,7 +393,7 @@ int srcStride = refPic-getStride(); int srcOffset = (mv-x 2) + (mv-y 2) * srcStride; int partEnum = partitionFromSizes(width, height); -pixel* src = refPic-getLumaAddr(cu-getAddr(), cu-getZorderIdxInCU() + partAddr) + srcOffset; +pixel* src = refPic-getLumaAddr(cuAddr, zOrderIdxinCU + partAddr) + srcOffset; int xFrac = mv-x 0x3; int yFrac = mv-y 0x3; @@ -414,11 +421,11 @@ } //Motion compensated block for biprediction -void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, 
MV *mv, int width, int height, ShortYuv *dstPic) +void TComPrediction::xPredInterLumaBlk(TComPicYuv *refPic, uint32_t cuAddr, uint32_t zOrderIdxinCU, uint32_t partAddr, MV *mv, int width, int height, ShortYuv *dstPic) { int refStride = refPic-getStride(); int refOffset = (mv-x 2) + (mv-y 2) * refStride; -pixel *ref= refPic-getLumaAddr(cu-getAddr(), cu-getZorderIdxInCU() + partAddr) + refOffset; +pixel *ref= refPic-getLumaAddr(cuAddr, zOrderIdxinCU + partAddr) + refOffset; int dstStride =
[x265] [PATCH 2 of 3] TComPrediction: save CU data (partAddr, width, height) as member fields
# HG changeset patch # User Deepthi Nandakumar deep...@multicorewareinc.com # Date 1406028818 -19800 # Tue Jul 22 17:03:38 2014 +0530 # Node ID a5422a41c85ac06fc773f1179e6fbea1a80a5e98 # Parent d8d26a695cf6734ad2180c4694360ef6e71ead81 TComPrediction: save CU data (partAddr, width, height) as member fields Before motion compensation, save CU related data inside the TComPrediction structure diff -r d8d26a695cf6 -r a5422a41c85a source/Lib/TLibCommon/TComPrediction.cpp --- a/source/Lib/TLibCommon/TComPrediction.cpp Mon Jul 21 16:06:31 2014 +0530 +++ b/source/Lib/TLibCommon/TComPrediction.cpp Tue Jul 22 17:03:38 2014 +0530 @@ -223,18 +223,18 @@ * \param TComDataCU* cu * \param uint32_t PartAddr */ -bool TComPrediction::xCheckIdenticalMotion(TComDataCU* cu, uint32_t partAddr) +bool TComPrediction::xCheckIdenticalMotion(TComDataCU* cu) { X265_CHECK(cu-m_slice-isInterB(), identical motion check in P frame\n); if (!cu-m_slice-m_pps-bUseWeightedBiPred) { -int refIdxL0 = cu-getCUMvField(0)-getRefIdx(partAddr); -int refIdxL1 = cu-getCUMvField(1)-getRefIdx(partAddr); +int refIdxL0 = cu-getCUMvField(0)-getRefIdx(m_partAddr); +int refIdxL1 = cu-getCUMvField(1)-getRefIdx(m_partAddr); if (refIdxL0 = 0 refIdxL1 = 0) { int refPOCL0 = cu-m_slice-m_refPOCList[0][refIdxL0]; int refPOCL1 = cu-m_slice-m_refPOCList[1][refIdxL1]; -if (refPOCL0 == refPOCL1 cu-getCUMvField(0)-getMv(partAddr) == cu-getCUMvField(1)-getMv(partAddr)) +if (refPOCL0 == refPOCL1 cu-getCUMvField(0)-getMv(m_partAddr) == cu-getCUMvField(1)-getMv(m_partAddr)) return true; } } @@ -243,89 +243,83 @@ void TComPrediction::motionCompensation(TComDataCU* cu, TComYuv* predYuv, int list, int partIdx, bool bLuma, bool bChroma) { -int width; -int height; -uint32_t partAddr; +X265_CHECK(partIdx = 0, partidx is not positive\n); + +if (cu-m_slice-isInterP()) +list = REF_PIC_LIST_0; +if (list != REF_PIC_LIST_X) +{ +if (cu-m_slice-m_pps-bUseWeightPred) +{ +ShortYuv* shortYuv = m_predShortYuv[0]; +int refId = 
cu-getCUMvField(list)-getRefIdx(m_partAddr); +X265_CHECK(refId = 0, refidx is not positive\n); -X265_CHECK(partIdx = 0, partidx is not positive\n); -{ -cu-getPartIndexAndSize(partIdx, partAddr, width, height); -if (cu-m_slice-isInterP()) -list = REF_PIC_LIST_0; -if (list != REF_PIC_LIST_X) -{ -if (cu-m_slice-m_pps-bUseWeightPred) -{ -ShortYuv* shortYuv = m_predShortYuv[0]; -int refId = cu-getCUMvField(list)-getRefIdx(partAddr); -X265_CHECK(refId = 0, refidx is not positive\n); +MV mv = cu-getCUMvField(list)-getMv(m_partAddr); +cu-clipMv(mv); +if (bLuma) + xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +mv, shortYuv); +if (bChroma) + xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +mv, shortYuv); -MV mv = cu-getCUMvField(list)-getMv(partAddr); -cu-clipMv(mv); -if (bLuma) - xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), -partAddr, mv, width, height, shortYuv); -if (bChroma) - xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), -partAddr, mv, width, height, shortYuv); - -xWeightedPredictionUni(cu, shortYuv, partAddr, width, height, list, predYuv, -1, bLuma, bChroma); -} -else -xPredInterUni(cu, partAddr, width, height, list, predYuv, bLuma, bChroma); +xWeightedPredictionUni(cu, shortYuv, m_partAddr, m_width, m_height, list, predYuv, -1, bLuma, bChroma); } else -{ -if (xCheckIdenticalMotion(cu, partAddr)) -xPredInterUni(cu, partAddr, width, height, REF_PIC_LIST_0, predYuv, bLuma, bChroma); -else -xPredInterBi(cu, partAddr, width, height, predYuv, bLuma, bChroma); -} +xPredInterUni(cu, list, predYuv, bLuma, bChroma); +} +else +{ +if (xCheckIdenticalMotion(cu)) +xPredInterUni(cu, REF_PIC_LIST_0, predYuv, bLuma, bChroma); +else +xPredInterBi(cu, predYuv, bLuma, bChroma); } } -void TComPrediction::xPredInterUni(TComDataCU* cu, uint32_t 
partAddr, int width, int height, int list, TComYuv* outPredYuv, bool bLuma, bool bChroma) +void
[x265] [PATCH 3 of 3] TComPrediction: remove redundant colorspace information
# HG changeset patch # User Deepthi Nandakumar deep...@multicorewareinc.com # Date 1406039346 -19800 # Tue Jul 22 19:59:06 2014 +0530 # Node ID 30f41c3ef7d39a6e341bd149adf6d57267984ec7 # Parent a5422a41c85ac06fc773f1179e6fbea1a80a5e98 TComPrediction: remove redundant colorspace information m_csp is sufficient inside the TEncSearch, TComPrediction structures diff -r a5422a41c85a -r 30f41c3ef7d3 source/Lib/TLibCommon/TComPrediction.cpp --- a/source/Lib/TLibCommon/TComPrediction.cpp Tue Jul 22 17:03:38 2014 +0530 +++ b/source/Lib/TLibCommon/TComPrediction.cpp Tue Jul 22 19:59:06 2014 +0530 @@ -86,8 +86,6 @@ void TComPrediction::initTempBuff(int csp) { m_csp = csp; -m_hChromaShift = CHROMA_H_SHIFT(csp); -m_vChromaShift = CHROMA_V_SHIFT(csp); if (m_predBuf == NULL) { @@ -470,8 +468,11 @@ int refStride = refPic-getCStride(); int dstStride = dstPic-getCStride(); -int shiftHor = (2 + m_hChromaShift); -int shiftVer = (2 + m_vChromaShift); +int hChromaShift = CHROMA_H_SHIFT(m_csp); +int vChromaShift = CHROMA_V_SHIFT(m_csp); + +int shiftHor = (2 + hChromaShift); +int shiftVer = (2 + vChromaShift); int refOffset = (mv-x shiftHor) + (mv-y shiftVer) * refStride; @@ -493,25 +494,25 @@ } else if (yFrac == 0) { -primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, dstStride, xFrac (1 - m_hChromaShift)); -primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, dstStride, xFrac (1 - m_hChromaShift)); +primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, dstStride, xFrac (1 - hChromaShift)); +primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, dstStride, xFrac (1 - hChromaShift)); } else if (xFrac == 0) { -primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, dstStride, yFrac (1 - m_vChromaShift)); -primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, dstStride, yFrac (1 - m_vChromaShift)); +primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, dstStride, yFrac (1 
- vChromaShift)); +primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, dstStride, yFrac (1 - vChromaShift)); } else { -int extStride = m_width m_hChromaShift; +int extStride = m_width hChromaShift; int filterSize = NTAPS_CHROMA; int halfFilterSize = (filterSize 1); -primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac (1 - m_hChromaShift), 1); -primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac (1 - m_vChromaShift)); +primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac (1 - hChromaShift), 1); +primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac (1 - vChromaShift)); -primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac (1 - m_hChromaShift), 1); -primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac (1 - m_vChromaShift)); +primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac (1 - hChromaShift), 1); +primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac (1 - vChromaShift)); } } @@ -520,9 +521,11 @@ { int refStride = refPic-getCStride(); int dstStride = dstPic-m_cwidth; +int hChromaShift = CHROMA_H_SHIFT(m_csp); +int vChromaShift = CHROMA_V_SHIFT(m_csp); -int shiftHor = (2 + m_hChromaShift); -int shiftVer = (2 + m_vChromaShift); +int shiftHor = (2 + hChromaShift); +int shiftVer = (2 + vChromaShift); int refOffset = (mv-x shiftHor) + (mv-y shiftVer) * refStride; @@ -537,8 +540,8 @@ int partEnum = partitionFromSizes(m_width, m_height); -uint32_t cxWidth = m_widthm_hChromaShift; -uint32_t cxHeight = m_height m_vChromaShift; +uint32_t cxWidth = m_widthhChromaShift; +uint32_t cxHeight = 
m_height vChromaShift; X265_CHECK(((cxWidth | cxHeight) % 2) == 0, chroma block size expected to be multiple of 2\n); @@ -549,23 +552,23 @@ } else if (yFrac == 0) { -primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, dstCb, dstStride, xFrac (1 - m_hChromaShift), 0); -primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, dstCr, dstStride, xFrac (1 - m_hChromaShift), 0); +primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, dstCb, dstStride, xFrac (1 - hChromaShift), 0); +
Re: [x265] [PATCH] psyrdoq: implementation of psyrdoq
Does it make sense to try this for DC coefficients? On Tue, Jul 22, 2014 at 9:24 PM, Steve Borho st...@borho.org wrote: On 07/22, sumala...@multicorewareinc.com wrote: # HG changeset patch # User Sumalatha Polureddysumala...@multicorewareinc.com # Date 1406032149 -19800 # Node ID 37e03dcd2e4f0b5894880ff8c097bd6e11590459 # Parent d303b4d860e9f06396a156726dd518d0f41fe796 psyrdoq: implementation of psyrdoq diff -r d303b4d860e9 -r 37e03dcd2e4f source/Lib/TLibCommon/TComTrQuant.cpp --- a/source/Lib/TLibCommon/TComTrQuant.cpp Mon Jul 21 22:43:38 2014 -0500 +++ b/source/Lib/TLibCommon/TComTrQuant.cpp Tue Jul 22 17:59:09 2014 +0530 @@ -64,6 +64,8 @@ return y + ((x - y) ((x - y) (sizeof(int) * CHAR_BIT - 1))); // min(x, y) } +#define SIGN(x,y) ((x^(y 31))-(y 31)) + // // TComTrQuant class member functions // @@ -307,6 +309,8 @@ } uint32_t TComTrQuant::transformNxN(TComDataCU* cu, + pixel* fenc, + uint32_tfencStride, int16_t*residual, uint32_tstride, coeff_t*coeff, @@ -316,10 +320,10 @@ booluseTransformSkip, boolcurUseRDOQ) { +int trSize = 1 log2TrSize; if (cu-getCUTransquantBypass(absPartIdx)) { uint32_t numSig = 0; -int trSize = 1 log2TrSize; for (int k = 0; k trSize; k++) { for (int j = 0; j trSize; j++) @@ -339,6 +343,12 @@ const uint32_t sizeIdx = log2TrSize - 2; int useDST = (sizeIdx == 0 ttype == TEXT_LUMA cu-getPredictionMode(absPartIdx) == MODE_INTRA); int index = DCT_4x4 + sizeIdx - useDST; +if (psyRdoqEnabled()) +{ +// converting pixel to int and putting in separate buffer to take dct +primitives.square_copy_ps[sizeIdx](m_tmpfencBuf, MAX_CU_SIZE, fenc, fencStride); +primitives.dct[index](m_tmpfencBuf, m_tmpfencCoeff, stride); +} primitives.dct[index](residual, m_tmpCoeff, stride); if (m_nr-bNoiseReduction) { @@ -356,7 +366,7 @@ if (m_useRDOQ curUseRDOQ) { -return xRateDistOptQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx); +return xRateDistOptQuant(cu, m_tmpfencCoeff, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx); } return xQuant(cu, m_tmpCoeff, 
coeff, log2TrSize, ttype, absPartIdx); } @@ -505,7 +515,7 @@ * Rate distortion optimized quantization for entropy * coding engines using probability models like CABAC */ -uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize, +uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* fencCoeff, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize, TextType ttype, uint32_t absPartIdx) { uint32_t trSize = 1 log2TrSize; @@ -614,7 +624,7 @@ { level = xGetCodedLevel(costCoeff[scanPos], curCostSig, costSig[scanPos], levelDouble, maxAbsLevel, baseLevel, greaterOneBits, levelAbsBits, goRiceParam, - c1c2Idx, qbits, scaleFactor, 1); + c1c2Idx, qbits, scaleFactor, 1, srcCoeff[blkPos], fencCoeff[blkPos]); sigRateDelta[blkPos] = 0; } else @@ -631,7 +641,7 @@ curCostSig = xGetRateSigCoef(1, ctxSig); level = xGetCodedLevel(costCoeff[scanPos], curCostSig, costSig[scanPos], levelDouble, maxAbsLevel, baseLevel, greaterOneBits, levelAbsBits, goRiceParam, - c1c2Idx, qbits, scaleFactor, 0); + c1c2Idx, qbits, scaleFactor, 0, srcCoeff[blkPos], fencCoeff[blkPos]); } else { @@ -1126,7 +1136,9 @@ uint32_t c1c2Idx, int qbits, double scaleFactor, -bool last) const +bool last,
Re: [x265] [PATCH 1 of 3] TComPrediction: remove TComDataCU as pointer to private functions
On 07/23, deep...@multicorewareinc.com wrote: # HG changeset patch # User Deepthi Nandakumar deep...@multicorewareinc.com # Date 1405938991 -19800 # Mon Jul 21 16:06:31 2014 +0530 # Node ID d8d26a695cf6734ad2180c4694360ef6e71ead81 # Parent e3ad03b7c4854be40730645d4fe25e56a93f3f94 TComPrediction: remove TComDataCU as pointer to private functions diff -r e3ad03b7c485 -r d8d26a695cf6 source/Lib/TLibCommon/TComPrediction.cpp --- a/source/Lib/TLibCommon/TComPrediction.cppTue Jul 22 13:28:54 2014 -0500 +++ b/source/Lib/TLibCommon/TComPrediction.cppMon Jul 21 16:06:31 2014 +0530 @@ -85,6 +85,7 @@ void TComPrediction::initTempBuff(int csp) { +m_csp = csp; m_hChromaShift = CHROMA_H_SHIFT(csp); m_vChromaShift = CHROMA_V_SHIFT(csp); @@ -262,9 +263,11 @@ MV mv = cu-getCUMvField(list)-getMv(partAddr); cu-clipMv(mv); if (bLuma) -xPredInterLumaBlk(cu, cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), partAddr, mv, width, height, shortYuv); + xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, shortYuv); if (bChroma) -xPredInterChromaBlk(cu, cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), partAddr, mv, width, height, shortYuv); + xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, shortYuv); We try to align the second row of arguments with the open paren( xWeightedPredictionUni(cu, shortYuv, partAddr, width, height, list, predYuv, -1, bLuma, bChroma); } @@ -291,10 +294,12 @@ cu-clipMv(mv); if (bLuma) -xPredInterLumaBlk(cu, cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, height, outPredYuv); + xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, outPredYuv); if (bChroma) -xPredInterChromaBlk(cu, cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, height, outPredYuv); + 
xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, outPredYuv); } void TComPrediction::xPredInterUni(TComDataCU* cu, uint32_t partAddr, int width, int height, int list, ShortYuv* outPredYuv, bool bLuma, bool bChroma) @@ -307,9 +312,11 @@ cu-clipMv(mv); if (bLuma) -xPredInterLumaBlk(cu, cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, height, outPredYuv); + xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, outPredYuv); if (bChroma) -xPredInterChromaBlk(cu, cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, height, outPredYuv); + xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), cu-getAddr(), cu-getZorderIdxInCU(), +partAddr, mv, width, height, outPredYuv); } void TComPrediction::xPredInterBi(TComDataCU* cu, uint32_t partAddr, int width, int height, TComYuv* outPredYuv, bool bLuma, bool bChroma) @@ -378,7 +385,7 @@ * \param height Height of block * \param dstPic Pointer to destination picture */ -void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic) +void TComPrediction::xPredInterLumaBlk(TComPicYuv *refPic, uint32_t cuAddr, uint32_t zOrderIdxinCU, uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic) these `x' hungarian prefixes on the function names in this class really hurt my eyes (the HM used them for internal functions). I hope those can be removed in a future patch. 
{ int dstStride = dstPic-getStride(); pixel *dst= dstPic-getLumaAddr(partAddr); @@ -386,7 +393,7 @@ int srcStride = refPic-getStride(); int srcOffset = (mv-x 2) + (mv-y 2) * srcStride; int partEnum = partitionFromSizes(width, height); -pixel* src = refPic-getLumaAddr(cu-getAddr(), cu-getZorderIdxInCU() + partAddr) + srcOffset; +pixel* src = refPic-getLumaAddr(cuAddr, zOrderIdxinCU + partAddr) + srcOffset; int xFrac = mv-x 0x3; int yFrac = mv-y 0x3; @@ -414,11 +421,11 @@ } //Motion compensated block for biprediction -void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, uint32_t partAddr, MV *mv, int width, int height, ShortYuv *dstPic) +void TComPrediction::xPredInterLumaBlk(TComPicYuv *refPic,
Re: [x265] [PATCH 3 of 3] TComPrediction: remove redundant colorspace information
On 07/23, deep...@multicorewareinc.com wrote: # HG changeset patch # User Deepthi Nandakumar deep...@multicorewareinc.com # Date 1406039346 -19800 # Tue Jul 22 19:59:06 2014 +0530 # Node ID 30f41c3ef7d39a6e341bd149adf6d57267984ec7 # Parent a5422a41c85ac06fc773f1179e6fbea1a80a5e98 TComPrediction: remove redundant colorspace information m_csp is sufficient inside the TEncSearch, TComPrediction structures diff -r a5422a41c85a -r 30f41c3ef7d3 source/Lib/TLibCommon/TComPrediction.cpp --- a/source/Lib/TLibCommon/TComPrediction.cppTue Jul 22 17:03:38 2014 +0530 +++ b/source/Lib/TLibCommon/TComPrediction.cppTue Jul 22 19:59:06 2014 +0530 @@ -86,8 +86,6 @@ void TComPrediction::initTempBuff(int csp) { m_csp = csp; -m_hChromaShift = CHROMA_H_SHIFT(csp); -m_vChromaShift = CHROMA_V_SHIFT(csp); if (m_predBuf == NULL) { @@ -470,8 +468,11 @@ int refStride = refPic-getCStride(); int dstStride = dstPic-getCStride(); -int shiftHor = (2 + m_hChromaShift); -int shiftVer = (2 + m_vChromaShift); +int hChromaShift = CHROMA_H_SHIFT(m_csp); +int vChromaShift = CHROMA_V_SHIFT(m_csp); ok Somewhere down the line we should make a build option that makes CHROMA_V_SHIFT() and CHROMA_H_SHIFT() and similar macros return hard-coded 4:2:0 values, to squeeze out a few more ounces of perf when the user is uninterested in 4:2:2 or 4:4:4. 
+ +int shiftHor = (2 + hChromaShift); +int shiftVer = (2 + vChromaShift); int refOffset = (mv-x shiftHor) + (mv-y shiftVer) * refStride; @@ -493,25 +494,25 @@ } else if (yFrac == 0) { -primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, dstStride, xFrac (1 - m_hChromaShift)); -primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, dstStride, xFrac (1 - m_hChromaShift)); +primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, dstStride, xFrac (1 - hChromaShift)); +primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, dstStride, xFrac (1 - hChromaShift)); } else if (xFrac == 0) { -primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, dstStride, yFrac (1 - m_vChromaShift)); -primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, dstStride, yFrac (1 - m_vChromaShift)); +primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, dstStride, yFrac (1 - vChromaShift)); +primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, dstStride, yFrac (1 - vChromaShift)); } else { -int extStride = m_width m_hChromaShift; +int extStride = m_width hChromaShift; int filterSize = NTAPS_CHROMA; int halfFilterSize = (filterSize 1); -primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac (1 - m_hChromaShift), 1); -primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac (1 - m_vChromaShift)); +primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, m_immedVals, extStride, xFrac (1 - hChromaShift), 1); +primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac (1 - vChromaShift)); -primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac (1 - m_hChromaShift), 1); -primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 
1) * extStride, extStride, dstCr, dstStride, yFrac (1 - m_vChromaShift)); +primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, m_immedVals, extStride, xFrac (1 - hChromaShift), 1); +primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac (1 - vChromaShift)); } } @@ -520,9 +521,11 @@ { int refStride = refPic-getCStride(); int dstStride = dstPic-m_cwidth; +int hChromaShift = CHROMA_H_SHIFT(m_csp); +int vChromaShift = CHROMA_V_SHIFT(m_csp); -int shiftHor = (2 + m_hChromaShift); -int shiftVer = (2 + m_vChromaShift); +int shiftHor = (2 + hChromaShift); +int shiftVer = (2 + vChromaShift); int refOffset = (mv-x shiftHor) + (mv-y shiftVer) * refStride; @@ -537,8 +540,8 @@ int partEnum = partitionFromSizes(m_width, m_height); -uint32_t cxWidth = m_widthm_hChromaShift; -uint32_t cxHeight = m_height m_vChromaShift; +uint32_t cxWidth = m_widthhChromaShift; +uint32_t cxHeight = m_height vChromaShift; X265_CHECK(((cxWidth | cxHeight) % 2) == 0, chroma block size expected to be multiple of 2\n); @@ -549,23 +552,23
Re: [x265] [PATCH] psyrdoq: implementation of psyrdoq
On 07/23, Deepthi Nandakumar wrote: Does it make sense to try this for DC coefficients? My understanding is that it is not helpful, and possibly harmful. We don't want to bias the DC coefficient in any way. snipped -- Steve Borho ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH] psyrdoq: implementation of psyrdoq
Ok, That's what I thought too. On Jul 23, 2014 8:55 AM, Steve Borho st...@borho.org wrote: On 07/23, Deepthi Nandakumar wrote: Does it make sense to try this for DC coefficients? my understanding is that it is not helpful, and possibly harmful. we don't want to bias the DC coefficient in any way. snipped -- Steve Borho ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] trquant: store QpParam for each component
On 07/21, Satoshi Nakagawa wrote: # HG changeset patch # User Satoshi Nakagawa nakagawa...@oki.com # Date 1405905842 -32400 # Mon Jul 21 10:24:02 2014 +0900 # Node ID b40af94fd00f5f23a22854aaf498ffef32910110 # Parent eb983d29c11acc03b91e07fe93c31503fa3a4732 trquant: store QpParam for each component Hello Satoshi, This patch looked harmless, so I had queued it without sending it through regression testing, but it turned out to cause hash mismatches. The quickest repro case I know of is this: x265 BasketBallDrive_1920x1080_50.y4m --bitrate 4000 --rd 5 --hash 1 -f 50 it causes a hash mistake right away and eventually a decoder crash. If you can fix it today, I won't back this out. diff -r eb983d29c11a -r b40af94fd00f source/Lib/TLibCommon/TComLoopFilter.cpp --- a/source/Lib/TLibCommon/TComLoopFilter.cppThu Jul 17 09:29:39 2014 +0200 +++ b/source/Lib/TLibCommon/TComLoopFilter.cppMon Jul 21 10:24:02 2014 +0900 @@ -48,7 +48,6 @@ // // Constants // -#define QpUV(iQpY, chFmt) (((iQpY) 0) ? (iQpY) : (((iQpY) 57) ? 
((iQpY) - 6) : g_chromaScale[chFmt][(iQpY)])) #define DEFAULT_INTRA_TC_OFFSET 2 /// Default intra TC offset // @@ -441,9 +440,6 @@ pixel* tmpsrc = src; int stride = reconYuv-getStride(); -int qp = 0; -int qpP = 0; -int qpQ = 0; uint32_t numParts = cu-m_pic-getNumPartInCUSize() depth; uint32_t log2UnitSize = g_log2UnitSize; @@ -457,8 +453,8 @@ uint32_t partQ = 0; TComDataCU* cuP = cu; TComDataCU* cuQ = cu; -int betaOffsetDiv2 = cuQ-m_slice-m_pps-deblockingFilterBetaOffsetDiv2; -int tcOffsetDiv2 = cuQ-m_slice-m_pps-deblockingFilterTcOffsetDiv2; +int betaOffset = cuQ-m_slice-m_pps-deblockingFilterBetaOffsetDiv2 1; +int tcOffset = cuQ-m_slice-m_pps-deblockingFilterTcOffsetDiv2 1; if (dir == EDGE_VER) { @@ -480,7 +476,7 @@ bs = blockingStrength[bsAbsIdx]; if (bs) { -qpQ = cu-getQP(bsAbsIdx); +int qpQ = cu-getQP(bsAbsIdx); partQ = bsAbsIdx; // Derive neighboring PU index if (dir == EDGE_VER) @@ -492,12 +488,12 @@ cuP = cuQ-getPUAbove(partP, partQ); } -qpP = cuP-getQP(partP); -qp = (qpP + qpQ + 1) 1; +int qpP = cuP-getQP(partP); +int qp = (qpP + qpQ + 1) 1; int bitdepthScale = 1 (X265_DEPTH - 8); -int indexTC = Clip3(0, MAX_QP + DEFAULT_INTRA_TC_OFFSET, int(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + (tcOffsetDiv2 1))); -int indexB = Clip3(0, MAX_QP, qp + (betaOffsetDiv2 1)); +int indexTC = Clip3(0, MAX_QP + DEFAULT_INTRA_TC_OFFSET, int(qp + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset)); +int indexB = Clip3(0, MAX_QP, qp + betaOffset); int tc = sm_tcTable[indexTC] * bitdepthScale; int beta = sm_betaTable[indexB] * bitdepthScale; @@ -544,13 +540,11 @@ void TComLoopFilter::xEdgeFilterChroma(TComDataCU* cu, uint32_t absZOrderIdx, uint32_t depth, int dir, int edge, uint8_t blockingStrength[]) { +int chFmt = cu-getChromaFormat(); TComPicYuv* reconYuv = cu-m_pic-getPicYuvRec(); int stride = reconYuv-getCStride(); pixel* srcCb = reconYuv-getCbAddr(cu-getAddr(), absZOrderIdx); pixel* srcCr = reconYuv-getCrAddr(cu-getAddr(), absZOrderIdx); -int qp = 0; -int qpP = 0; -int qpQ = 0; 
uint32_t log2UnitSizeH = g_log2UnitSize - cu-getHorzChromaShift(); uint32_t log2UnitSizeV = g_log2UnitSize - cu-getVertChromaShift(); uint32_t unitSizeChromaH = 1 log2UnitSizeH; @@ -565,7 +559,7 @@ uint32_t partQ; TComDataCU* cuP; TComDataCU* cuQ = cu; -int tcOffsetDiv2 = cu-m_slice-m_pps-deblockingFilterTcOffsetDiv2; +int tcOffset = cu-m_slice-m_pps-deblockingFilterTcOffsetDiv2 1; // Vertical Position uint32_t edgeNumInLCUVert = g_zscanToRaster[absZOrderIdx] % lcuWidthInBaseUnits + edge; @@ -611,7 +605,7 @@ if (bs 1) { -qpQ = cu-getQP(bsAbsIdx); +int qpQ = cu-getQP(bsAbsIdx); partQ = bsAbsIdx; // Derive neighboring PU index if (dir == EDGE_VER) @@ -623,7 +617,7 @@ cuP = cuQ-getPUAbove(partP, partQ); } -qpP = cuP-getQP(partP); +int qpP = cuP-getQP(partP); if (cu-m_slice-m_pps-bTransquantBypassEnabled) { @@ -636,10 +630,17 @@
[x265] [PATCH] analysis: setQPforQuant in checkIntraInter to fix the hash mismatch at rd=5&6
# HG changeset patch # User Gopu Govindaswamy g...@multicorewareinc.com # Date 1406094393 -19800 # Wed Jul 23 11:16:33 2014 +0530 # Node ID 1beaaabef3eb6d3e832102ed7dafcd855c1d7298 # Parent e3ad03b7c4854be40730645d4fe25e56a93f3f94 analysis: setQPforQuant in checkIntraInter to fix the hash mismatch at rd=5&6 diff -r e3ad03b7c485 -r 1beaaabef3eb source/encoder/analysis.cpp --- a/source/encoder/analysis.cpp Tue Jul 22 13:28:54 2014 -0500 +++ b/source/encoder/analysis.cpp Wed Jul 23 11:16:33 2014 +0530 @@ -1722,6 +1722,7 @@ PPAScopeEvent(CheckRDCostIntra + depth); +m_trQuant.setQPforQuant(outTempCU); outTempCU-setSkipFlagSubParts(false, 0, depth); outTempCU-setPartSizeSubParts(partSize, 0, depth); outTempCU-setPredModeSubParts(MODE_INTRA, 0, depth); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel