[x265] refine partition size related

2014-07-22 Thread Satoshi Nakagawa
# HG changeset patch
# User Satoshi Nakagawa nakagawa...@oki.com
# Date 1406011990 -32400
#  Tue Jul 22 15:53:10 2014 +0900
# Node ID b2ad081e4bfc20bbc84e8bfbab59ed52aeac2a73
# Parent  d303b4d860e9f06396a156726dd518d0f41fe796
refine partition size related

- reorder LumaPartitions to simplify partitionFromLog2Size()
- remove unused


diff -r d303b4d860e9 -r b2ad081e4bfc source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp Mon Jul 21 22:43:38 2014 -0500
+++ b/source/Lib/TLibCommon/TComYuv.cpp Tue Jul 22 15:53:10 2014 +0900
@@ -127,6 +127,15 @@
 primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], getCStride(), srcV, 
srcPicYuv-getCStride());
 }
 
+void TComYuv::copyFromYuv(TComYuv* srcYuv)
+{
+X265_CHECK(m_width = srcYuv-m_width  m_height = srcYuv-m_height, 
invalid size\n);
+
+primitives.luma_copy_pp[m_part](m_buf[0], m_width, srcYuv-m_buf[0], 
srcYuv-m_width);
+primitives.chroma[m_csp].copy_pp[m_part](m_buf[1], m_cwidth, 
srcYuv-m_buf[1], srcYuv-m_cwidth);
+primitives.chroma[m_csp].copy_pp[m_part](m_buf[2], m_cwidth, 
srcYuv-m_buf[2], srcYuv-m_cwidth);
+}
+
 void TComYuv::copyToPartYuv(TComYuv* dstPicYuv, uint32_t partIdx)
 {
 pixel* dstY = dstPicYuv-getLumaAddr(partIdx);
@@ -156,50 +165,9 @@
 primitives.chroma[m_csp].copy_pp[part](dstV, dstPicYuv-getCStride(), 
srcV, getCStride());
 }
 
-void TComYuv::copyPartToPartYuv(TComYuv* dstPicYuv, uint32_t partIdx, uint32_t 
width, uint32_t height, bool bLuma, bool bChroma)
+void TComYuv::addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t log2Size)
 {
-int part = partitionFromSizes(width, height);
-
-X265_CHECK(width != 4 || height != 4, 4x4 partition detected\n);
-
-if (bLuma)
-{
-pixel* src = getLumaAddr(partIdx);
-pixel* dst = dstPicYuv-getLumaAddr(partIdx);
-
-uint32_t srcstride = getStride();
-uint32_t dststride = dstPicYuv-getStride();
-
-primitives.luma_copy_pp[part](dst, dststride, src, srcstride);
-}
-if (bChroma)
-{
-pixel* srcU = getCbAddr(partIdx);
-pixel* srcV = getCrAddr(partIdx);
-pixel* dstU = dstPicYuv-getCbAddr(partIdx);
-pixel* dstV = dstPicYuv-getCrAddr(partIdx);
-
-uint32_t srcstride = getCStride();
-uint32_t dststride = dstPicYuv-getCStride();
-
-primitives.chroma[m_csp].copy_pp[part](dstU, dststride, srcU, 
srcstride);
-primitives.chroma[m_csp].copy_pp[part](dstV, dststride, srcV, 
srcstride);
-}
-}
-
-void TComYuv::copyPartToPartLuma(ShortYuv* dstPicYuv, uint32_t partIdx, 
uint32_t lumaSize)
-{
-int part = partitionFromSize(lumaSize);
-
-int16_t* dst = dstPicYuv-getLumaAddr(partIdx);
-uint32_t dststride = dstPicYuv-m_width;
-
-primitives.luma_copy_ps[part](dst, dststride, getLumaAddr(partIdx), 
getStride());
-}
-
-void TComYuv::addClip(TComYuv* srcYuv0, ShortYuv* srcYuv1, uint32_t partSize)
-{
-int part = partitionFromSize(partSize);
+int part = partitionFromLog2Size(log2Size);
 
 addClipLuma(srcYuv0, srcYuv1, part);
 addClipChroma(srcYuv0, srcYuv1, part);
@@ -235,113 +203,32 @@
 primitives.chroma[m_csp].add_ps[part](dstV, dststride, srcV0, srcV1, 
src0Stride, src1Stride);
 }
 
-void TComYuv::addAvg(TComYuv* srcYuv0, TComYuv* srcYuv1, uint32_t partUnitIdx, 
uint32_t width, uint32_t height, bool bLuma, bool bChroma)
-{
-int x, y;
-uint32_t src0Stride, src1Stride, dststride;
-int shiftNum, offset;
-
-pixel* srcY0 = srcYuv0-getLumaAddr(partUnitIdx);
-pixel* srcU0 = srcYuv0-getCbAddr(partUnitIdx);
-pixel* srcV0 = srcYuv0-getCrAddr(partUnitIdx);
-
-pixel* srcY1 = srcYuv1-getLumaAddr(partUnitIdx);
-pixel* srcU1 = srcYuv1-getCbAddr(partUnitIdx);
-pixel* srcV1 = srcYuv1-getCrAddr(partUnitIdx);
-
-pixel* dstY  = getLumaAddr(partUnitIdx);
-pixel* dstU  = getCbAddr(partUnitIdx);
-pixel* dstV  = getCrAddr(partUnitIdx);
-
-if (bLuma)
-{
-src0Stride = srcYuv0-getStride();
-src1Stride = srcYuv1-getStride();
-dststride  = getStride();
-shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
-offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
-
-for (y = 0; y  height; y++)
-{
-for (x = 0; x  width; x += 4)
-{
-dstY[x + 0] = Clip((srcY0[x + 0] + srcY1[x + 0] + offset)  
shiftNum);
-dstY[x + 1] = Clip((srcY0[x + 1] + srcY1[x + 1] + offset)  
shiftNum);
-dstY[x + 2] = Clip((srcY0[x + 2] + srcY1[x + 2] + offset)  
shiftNum);
-dstY[x + 3] = Clip((srcY0[x + 3] + srcY1[x + 3] + offset)  
shiftNum);
-}
-
-srcY0 += src0Stride;
-srcY1 += src1Stride;
-dstY  += dststride;
-}
-}
-if (bChroma)
-{
-shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
-offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
-
-src0Stride = srcYuv0-getCStride();
-src1Stride = 

Re: [x265] Custom LowRes scale

2014-07-22 Thread Nicolas Morey-Chaisemartin

I can do that :)
Do you have a standard way to generate these figures? (Video, options?)
Or shall I just generate a couple of figures to put in the commit?

On 07/21/2014 06:16 PM, Deepthi Nandakumar wrote:

Thanks, this is certainly an enhancement to x265 lookahead. We would be
interested in this - especially if you can also include some efficiency
(bitrate vs SSIM) metrics that describe the penalty moving from
X265_LOWRES_SCALE of 4 to higher scales.


On Mon, Jul 21, 2014 at 8:49 PM, Nicolas Morey-Chaisemartin 
nmo...@kalray.eu wrote:


Hi,

We recently profiled x265 pre-analysis to estimate what performance we
could reach using our accelerator and I was quite disappointed by the
performance.
When running on a Core-i7 with AVX at roughly 2.7GHz, we barely reached
the 30fps mark using ultrafast preset on a 4K video.




After a little bit of browsing I realized that work in LowRes is always
done at 1/4th of the final resolution, which seems fair but requires a huge
amount of work for 4K.
It seemed straightforward enough to change the divider at LowRes
initialization but it seems there are a lot of hard-coded values that
depend both on the LowRes divider and the LowRes CU size.

Here's a patch (definitely not applicable like this but just to give an
idea of where I'm going) that seems to fix most of the hard-coded values.
It still works with a X265_LOWRES_SCALE of 4 and the perf is definitely
improving (29fps => 40fps on a 2048x1024 medium preset on a E5504).

Would you be interested in a clean version of this? At least the
hard-coded CU_SIZE part?
IMHO it would be better to have dynamic value for LowRes depending on
preset (or equivalent) and the input resolution...
1/4th is fast enough in HD not to be an issue but for RT stream in 4K or
more, 1/16 will be compulsory.

Nicolas

---
  x265/source/common/common.h  |  1 +
  x265/source/common/lowres.cpp|  4 ++--
  x265/source/encoder/frameencoder.cpp |  7 ---
  x265/source/encoder/ratecontrol.cpp  | 16 
  x265/source/encoder/slicetype.cpp|  8 
  5 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/x265/source/common/common.h b/x265/source/common/common.h
index 06f60e7..00e73fc 100644
--- a/x265/source/common/common.h
+++ b/x265/source/common/common.h
@@ -156,6 +156,7 @@ typedef int32_t  coeff_t;  // transform coefficient
  // high cost estimates (intra and inter both suffer)
  #define X265_LOWRES_CU_SIZE   8
  #define X265_LOWRES_CU_BITS   3
+#define X265_LOWRES_SCALE 2
   #define X265_MALLOC(type, count)(type*)x265_malloc(sizeof(type) *
(count))
  #define X265_FREE(ptr)  x265_free(ptr)
diff --git a/x265/source/common/lowres.cpp b/x265/source/common/lowres.cpp
index 5fc2f6b..6138023 100644
--- a/x265/source/common/lowres.cpp
+++ b/x265/source/common/lowres.cpp
@@ -31,8 +31,8 @@ bool Lowres::create(TComPicYuv *orig, int _bframes, bool
bAQEnabled)
  {
  isLowres = true;
  bframes = _bframes;
-width = orig-getWidth() / 2;
-lines = orig-getHeight() / 2;
+width = orig-getWidth() / X265_LOWRES_SCALE;
+lines = orig-getHeight() / X265_LOWRES_SCALE;
  lumaStride = width + 2 * orig-getLumaMarginX();
  if (lumaStride  31)
  lumaStride += 32 - (lumaStride  31);
diff --git a/x265/source/encoder/frameencoder.cpp b/x265/source/encoder/
frameencoder.cpp
index 8c3ee26..7213f60 100644
--- a/x265/source/encoder/frameencoder.cpp
+++ b/x265/source/encoder/frameencoder.cpp
@@ -1300,9 +1300,10 @@ int FrameEncoder::calcQpForCu(uint32_t cuAddr,
double baseQp)
   /* Derive qpOffet for each CU by averaging offsets for all 16x16
blocks in the cu. */
  double qp_offset = 0;
-int maxBlockCols = (m_frame-getPicYuvOrg()-getWidth() + (16 - 1))
/ 16;
-int maxBlockRows = (m_frame-getPicYuvOrg()-getHeight() + (16 - 1))
/ 16;
-int noOfBlocks = g_maxCUSize / 16;
+int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE);
+int maxBlockCols = (m_frame-getPicYuvOrg()-getWidth() + (lowResCu
- 1)) / lowResCu;
+int maxBlockRows = (m_frame-getPicYuvOrg()-getHeight() + (lowResCu
- 1)) / lowResCu;
+int noOfBlocks = g_maxCUSize / lowResCu;
  int block_y = (cuAddr / m_frame-getPicSym()-getFrameWidthInCU()) *
noOfBlocks;
  int block_x = (cuAddr * noOfBlocks) - block_y * m_frame-getPicSym()-
getFrameWidthInCU();
  diff --git a/x265/source/encoder/ratecontrol.cpp b/x265/source/encoder/
ratecontrol.cpp
index 4358994..5fcc27a 100644
--- a/x265/source/encoder/ratecontrol.cpp
+++ b/x265/source/encoder/ratecontrol.cpp
@@ -161,8 +161,8 @@ void RateControl::calcAdaptiveQuantFrame(Frame *pic)
  if (m_param-rc.aqMode == X265_AQ_NONE || m_param-rc.aqStrength == 0)
  {
  /* Need to init it anyways for CU tree */
-int cuWidth = ((maxCol / 2) + X265_LOWRES_CU_SIZE - 1) 
X265_LOWRES_CU_BITS;
-int cuHeight = ((maxRow / 2) + X265_LOWRES_CU_SIZE - 1) 
X265_LOWRES_CU_BITS;
+int cuWidth = ((maxCol / X265_LOWRES_SCALE) + 

Re: [x265] refine partition size related

2014-07-22 Thread Steve Borho
On 07/22, Satoshi Nakagawa wrote:
 # HG changeset patch
 # User Satoshi Nakagawa nakagawa...@oki.com
 # Date 1406011990 -32400
 #  Tue Jul 22 15:53:10 2014 +0900
 # Node ID b2ad081e4bfc20bbc84e8bfbab59ed52aeac2a73
 # Parent  d303b4d860e9f06396a156726dd518d0f41fe796
 refine partition size related
 
 - reorder LumaPartitions to simplify partitionFromLog2Size()
 - remove unused

Queued for testing, thanks.

One question below:

 
 diff -r d303b4d860e9 -r b2ad081e4bfc source/Lib/TLibCommon/TComYuv.cpp
 --- a/source/Lib/TLibCommon/TComYuv.cpp   Mon Jul 21 22:43:38 2014 -0500
 +++ b/source/Lib/TLibCommon/TComYuv.cpp   Tue Jul 22 15:53:10 2014 +0900
 @@ -127,6 +127,15 @@

snip

 diff -r d303b4d860e9 -r b2ad081e4bfc source/test/testbench.cpp
 --- a/source/test/testbench.cpp   Mon Jul 21 22:43:38 2014 -0500
 +++ b/source/test/testbench.cpp   Tue Jul 22 15:53:10 2014 +0900
 @@ -127,6 +127,7 @@
  EncoderPrimitives cprim;
  memset(cprim, 0, sizeof(EncoderPrimitives));
  Setup_C_Primitives(cprim);
 +Setup_Alias_Primitives(cprim);
  
  struct test_arch_t
  {
 @@ -186,6 +187,7 @@
  memset(optprim, 0, sizeof(optprim));
  Setup_Instrinsic_Primitives(optprim, cpuid);
  Setup_Assembly_Primitives(optprim, cpuid);
 +Setup_Alias_Primitives(optprim);

is there a reason to test the aliased functions, since by their nature
they should already be being tested via another function pointer?

-- 
Steve Borho
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] Custom LowRes scale

2014-07-22 Thread Nicolas Morey-Chaisemartin


On 07/21/2014 07:11 PM, Steve Borho wrote:

Interesting. I imagine much 4k content would work decently well even
with further downscaling of the lookahead pictures.

The lowres motion vectors are used in weight analysis as well, so that
file would need to be updated.

I'll have a look at it. It doesn't seem as straightforward as the other files
though.
While we're talking about lowres MV: from what I could gather they are not used
during the motionSearch on the full res picture.
As a lot of time is spent finding those, wouldn't it be useful to add them as
candidates in the fullres search?

Nicolas

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] refine partition size related

2014-07-22 Thread Satoshi Nakagawa
To find non-optimized functions, and which functions can be aliased.
I think many 4:2:2 functions can be aliased.


 -Original Message-
 From: x265-devel [mailto:x265-devel-boun...@videolan.org] On Behalf Of
 Steve Borho
 Sent: Tuesday, July 22, 2014 4:16 PM
 To: Development for x265
 Subject: Re: [x265] refine partition size related
 
 On 07/22, Satoshi Nakagawa wrote:
  # HG changeset patch
  # User Satoshi Nakagawa nakagawa...@oki.com # Date 1406011990 -32400
  #  Tue Jul 22 15:53:10 2014 +0900
  # Node ID b2ad081e4bfc20bbc84e8bfbab59ed52aeac2a73
  # Parent  d303b4d860e9f06396a156726dd518d0f41fe796
  refine partition size related
 
  - reorder LumaPartitions to simplify partitionFromLog2Size()
  - remove unused
 
 Queued for testing, thanks.
 
 One question below:
 
 
  diff -r d303b4d860e9 -r b2ad081e4bfc
 source/Lib/TLibCommon/TComYuv.cpp
  --- a/source/Lib/TLibCommon/TComYuv.cpp Mon Jul 21 22:43:38 2014
 -0500
  +++ b/source/Lib/TLibCommon/TComYuv.cpp Tue Jul 22 15:53:10 2014
 +0900
  @@ -127,6 +127,15 @@
 
 snip
 
  diff -r d303b4d860e9 -r b2ad081e4bfc source/test/testbench.cpp
  --- a/source/test/testbench.cpp Mon Jul 21 22:43:38 2014 -0500
  +++ b/source/test/testbench.cpp Tue Jul 22 15:53:10 2014 +0900
  @@ -127,6 +127,7 @@
   EncoderPrimitives cprim;
   memset(cprim, 0, sizeof(EncoderPrimitives));
   Setup_C_Primitives(cprim);
  +Setup_Alias_Primitives(cprim);
 
   struct test_arch_t
   {
  @@ -186,6 +187,7 @@
   memset(optprim, 0, sizeof(optprim));
   Setup_Instrinsic_Primitives(optprim, cpuid);
   Setup_Assembly_Primitives(optprim, cpuid);
  +Setup_Alias_Primitives(optprim);
 
 is there a reason to test the aliased functions, since by their nature
 they should already be being tested via another function pointer?
 
 --
 Steve Borho
 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] refine partition size related

2014-07-22 Thread Steve Borho
On 07/22, Satoshi Nakagawa wrote:
 To find non-optimized functions, and which functions can be aliased.
 I think many 4:2:2 functions can be aliased.

ok, fair enough.

-- 
Steve Borho
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] Custom LowRes scale

2014-07-22 Thread Steve Borho
On 07/22, Nicolas Morey-Chaisemartin wrote:
 
 On 07/21/2014 07:11 PM, Steve Borho wrote:
 Interesting. I imagine much 4k content would work decently well even
 with further downscaling of the lookahead pictures.
 
 The lowres motion vectors are used in weight analysis as well, so that
 file would need to be updated.

 I'll have a look at it. It doesn't seem as straightforward as the
 other files though.

it is slightly more complicated; you'll want to scale up the block sizes
used for motion-compensated weight analysis - up to 32x32 or 64x64 based
on how much further you downscale the lowres in lookahead.

 While we're talking about lowres MV: from what I could gather they are
 not used during the motionSearch on the full res picture.  As a lot of
 time is spent finding those, wouldn't it be useful to add them as
 candidates in the fullres search?

This has been on my TODO list for ages; a couple of people have claimed
they've tried it and it hasn't helped as much as you might think.  But I
haven't had a working patch in hand to verify it.

The AMVP fixup after motion search, where we get to go shopping for a
better MVP after the search, often makes extra motion candidates
superfluous.

-- 
Steve Borho
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] Custom LowRes scale

2014-07-22 Thread Nicolas Morey-Chaisemartin


On 07/22/2014 10:08 AM, Steve Borho wrote:

On 07/22, Nicolas Morey-Chaisemartin wrote:

I'll have a look at it. It doesn't seem as straightforward as the
other files though.

it is slightly more complicated; you'll want to scale up the block sizes
used for motion-compensated weight analysis - up to 32x32 or 64x64 based
on how much further you downscale the lowres in lookahead.


Is there a clean way to get a LUMA_NNxNN value from a block size?
Should I handle blocks larger than 64x64 by looping on the 64x64 blocks, or simply
add a check at lowres init that the fullres CU size is <= 64?




While we're talking about lowres MV: from what I could gather they are
not used during the motionSearch on the full res picture.  As a lot of
time is spent finding those, wouldn't it be useful to add them as
candidates in the fullres search?

This has been on my TODO list for ages; a couple of people have claimed
they've tried it and it hasn't helped as much as you might think.  But I
haven't had a working patch in hand to verify it.

The AMVP fixup after motion search, where we get to go shopping for a
better MVP after the search, often makes extra motion candidates
superfluous.


I started working on this yesterday for our accelerator but I got carried away
with lowres scaling.
I don't have any results yet but I'll post them as soon as I have some.
By the way, lowres MV are in lowres luma pixels right? So I'll need to scale 
the vector by 2 to get the full MV?

Nicolas

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] lowres: Enhanced scaling

2014-07-22 Thread Nicolas Morey-Chaisemartin

# HG changeset patch
# User Nicolas Morey-Chaisemartin nmo...@kalray.eu
# Date 1406020650 -7200
#  Tue Jul 22 11:17:30 2014 +0200
# Node ID fc75f5f4f85e0d9441dc73b09ec60a36c20f
# Parent  4c9ce4db74d1c9768abc61290bd1bda002b79f4e
lowres: Enhanced scaling

 * Replace hard coded values with X265_LOWRES_CU_SIZE
 * Add X265_LOWRES_SCALE define to tweak divider for LowRes

Note: X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE must be less than or equal to 64 to
be able to use standard filters for weight prediction

Performance Impact:
Command Line:
./x265/build/x265 --preset medium  --accel=none  red_kayak_1080p-420.y4m 
kayak.hevc --bitrate=$BITRATE --ssim

- BITRATE=4000 X265_LOWRES_SCALE=2
encoded 570 frames in 105.56s (5.40 fps), 3334.27 kb/s, SSIM Mean Y: 0.8900527 
( 9.588 dB)
- BITRATE=4000 X265_LOWRES_SCALE=4
encoded 570 frames in 87.11s (6.54 fps), 3398.38 kb/s, SSIM Mean Y: 0.8836753 ( 
9.343 dB)
- BITRATE=4000 X265_LOWRES_SCALE=8
encoded 570 frames in 79.71s (7.15 fps), 3437.19 kb/s, SSIM Mean Y: 0.8765783 ( 
9.086 dB)

- BITRATE=9000 X265_LOWRES_SCALE=2
encoded 570 frames in 115.32s (4.94 fps), 7263.50 kb/s, SSIM Mean Y: 0.9272905 
(11.384 dB)
- BITRATE=9000 X265_LOWRES_SCALE=4
encoded 570 frames in 101.53s (5.61 fps), 7439.24 kb/s, SSIM Mean Y: 0.9209998 
(11.024 dB)
- BITRATE=9000 X265_LOWRES_SCALE=8
encoded 570 frames in 92.98s (6.13 fps), 7549.41 kb/s, SSIM Mean Y: 0.9160721 
(10.761 dB)

diff --git a/source/common/common.h b/source/common/common.h
--- a/source/common/common.h
+++ b/source/common/common.h
@@ -153,16 +153,17 @@ typedef int32_t  coeff_t;  // transf
 // arbitrary, but low because SATD scores are 1/4 normal
 #define X265_LOOKAHEAD_QP (12 + QP_BD_OFFSET)
 #define X265_LOOKAHEAD_MAX 250
 
 // Use the same size blocks as x264.  Using larger blocks seems to give artificially

 // high cost estimates (intra and inter both suffer)
 #define X265_LOWRES_CU_SIZE   8
 #define X265_LOWRES_CU_BITS   3
+#define X265_LOWRES_SCALE 2
 
 #define X265_MALLOC(type, count)(type*)x265_malloc(sizeof(type) * (count))

 #define X265_FREE(ptr)  x265_free(ptr)
 #define CHECKED_MALLOC(var, type, count) \
 { \
 var = (type*)x265_malloc(sizeof(type) * (count)); \
 if (!var) \
 { \
diff --git a/source/common/lowres.cpp b/source/common/lowres.cpp
--- a/source/common/lowres.cpp
+++ b/source/common/lowres.cpp
@@ -24,20 +24,21 @@
 #include TLibCommon/TComPicYuv.h
 #include lowres.h
 #include mv.h
 
 using namespace x265;
 
 bool Lowres::create(TComPicYuv *orig, int _bframes, bool bAQEnabled)

 {
+   X265_CHECK(X265_LOWRES_SCALE * X265_LOWRES_CU_SIZE = 64, Invalid LowRes 
scaling\n);
 isLowres = true;
 bframes = _bframes;
-width = orig-getWidth() / 2;
-lines = orig-getHeight() / 2;
+width = orig-getWidth() / X265_LOWRES_SCALE;
+lines = orig-getHeight() / X265_LOWRES_SCALE;
 lumaStride = width + 2 * orig-getLumaMarginX();
 if (lumaStride  31)
 lumaStride += 32 - (lumaStride  31);
 int cuWidth = (width + X265_LOWRES_CU_SIZE - 1)  X265_LOWRES_CU_BITS;
 int cuHeight = (lines + X265_LOWRES_CU_SIZE - 1)  X265_LOWRES_CU_BITS;
 int cuCount = cuWidth * cuHeight;
 
 /* rounding the width to multiple of lowres CU size */

diff --git a/source/encoder/frameencoder.cpp b/source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp
+++ b/source/encoder/frameencoder.cpp
@@ -984,19 +984,20 @@ int FrameEncoder::calcQpForCu(uint32_t c
 if (bIsVbv)
 {
 m_frame-m_cuCostsForVbv[cuAddr] = 0;
 m_frame-m_intraCuCostsForVbv[cuAddr] = 0;
 }
 
 /* Derive qpOffet for each CU by averaging offsets for all 16x16 blocks in the cu. */

 double qp_offset = 0;
-int maxBlockCols = (m_frame-getPicYuvOrg()-getWidth() + (16 - 1)) / 16;
-int maxBlockRows = (m_frame-getPicYuvOrg()-getHeight() + (16 - 1)) / 16;
-int noOfBlocks = g_maxCUSize / 16;
+int lowResCu = (X265_LOWRES_CU_SIZE * X265_LOWRES_SCALE);
+int maxBlockCols = (m_frame-getPicYuvOrg()-getWidth() + (lowResCu - 1)) 
/ lowResCu;
+int maxBlockRows = (m_frame-getPicYuvOrg()-getHeight() + (lowResCu - 1)) 
/ lowResCu;
+int noOfBlocks = g_maxCUSize / lowResCu;
 int block_y = (cuAddr / m_frame-getPicSym()-getFrameWidthInCU()) * 
noOfBlocks;
 int block_x = (cuAddr * noOfBlocks) - block_y * 
m_frame-getPicSym()-getFrameWidthInCU();
 
 /* Use cuTree offsets if cuTree enabled and frame is referenced, else use AQ offsets */

 double *qpoffs = (m_isReferenced  m_param-rc.cuTree) ? 
m_frame-m_lowres.qpCuTreeOffset : m_frame-m_lowres.qpAqOffset;
 
 int cnt = 0, idx = 0;

 for (int h = 0; h  noOfBlocks  block_y  maxBlockRows; h++, block_y++)
diff --git a/source/encoder/ratecontrol.cpp b/source/encoder/ratecontrol.cpp
--- a/source/encoder/ratecontrol.cpp
+++ b/source/encoder/ratecontrol.cpp
@@ -198,18 +198,18 @@ void RateControl::calcAdaptiveQuantFrame
 
 /* Calculate Qp offset for each 16x16 block in 

[x265] Brief speed test with x265 in all presets and different versions

2014-07-22 Thread Mario *LigH* Rohkrämer

Dear x265 developers.

I'd like to share with you a result of a brief speed test, comparing x265  
in different versions (early 1.1, late 1.1, and current 1.2) in all  
presets with the small foreman clip. Its meaning is certainly limited,  
so I don't feel like sharing it in a public forum (may cause too much  
FUD), and I don't want to interpret it too much, except for one general  
result: There are changes which increased the speed during the development  
of v1.1, but with only an AMD Phenom-II X4, it is not yet very obvious.  
And the promised increase for AMD with the HADDD macro didn't happen to  
me, rather the opposite. Results will probably be different for FX+ CPUs.


CLI: -o foreman_cif_placebo.hevc --preset %preset% --aq-mode 2  
--aq-strength 1.5 --psy-rd 0.5 foreman_cif.y4m


And no, I will not insist on slowing down preset medium, just for the sake
of the curve's beauty. ;-)


--

Fun and success!
Mario *LigH* Rohkrämer
mailto:cont...@ligh.de
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] psyrdoq: implementation of psyrdoq

2014-07-22 Thread sumalatha
# HG changeset patch
# User Sumalatha Polureddysumala...@multicorewareinc.com
# Date 1406032149 -19800
# Node ID 37e03dcd2e4f0b5894880ff8c097bd6e11590459
# Parent  d303b4d860e9f06396a156726dd518d0f41fe796
psyrdoq: implementation of psyrdoq

diff -r d303b4d860e9 -r 37e03dcd2e4f source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Mon Jul 21 22:43:38 2014 -0500
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Tue Jul 22 17:59:09 2014 +0530
@@ -64,6 +64,8 @@
 return y + ((x - y)  ((x - y)  (sizeof(int) * CHAR_BIT - 1))); // 
min(x, y)
 }
 
+#define SIGN(x,y) ((x^(y  31))-(y  31))
+
 // 

 // TComTrQuant class member functions
 // 

@@ -307,6 +309,8 @@
 }
 
 uint32_t TComTrQuant::transformNxN(TComDataCU* cu,
+   pixel*  fenc,
+   uint32_tfencStride,
int16_t*residual,
uint32_tstride,
coeff_t*coeff,
@@ -316,10 +320,10 @@
booluseTransformSkip,
boolcurUseRDOQ)
 {
+int trSize = 1  log2TrSize;
 if (cu-getCUTransquantBypass(absPartIdx))
 {
 uint32_t numSig = 0;
-int trSize = 1  log2TrSize;
 for (int k = 0; k  trSize; k++)
 {
 for (int j = 0; j  trSize; j++)
@@ -339,6 +343,12 @@
 const uint32_t sizeIdx = log2TrSize - 2;
 int useDST = (sizeIdx == 0  ttype == TEXT_LUMA  
cu-getPredictionMode(absPartIdx) == MODE_INTRA);
 int index = DCT_4x4 + sizeIdx - useDST;
+if (psyRdoqEnabled())
+{
+// converting pixel to int and putting in separate buffer to take 
dct
+primitives.square_copy_ps[sizeIdx](m_tmpfencBuf, MAX_CU_SIZE, 
fenc, fencStride);
+primitives.dct[index](m_tmpfencBuf, m_tmpfencCoeff, stride);
+}
 primitives.dct[index](residual, m_tmpCoeff, stride);
 if (m_nr-bNoiseReduction)
 {
@@ -356,7 +366,7 @@
 
 if (m_useRDOQ  curUseRDOQ)
 {
-return xRateDistOptQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, 
absPartIdx);
+return xRateDistOptQuant(cu, m_tmpfencCoeff, m_tmpCoeff, coeff, 
log2TrSize, ttype, absPartIdx);
 }
 return xQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx);
 }
@@ -505,7 +515,7 @@
  * Rate distortion optimized quantization for entropy
  * coding engines using probability models like CABAC
  */
-uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* srcCoeff, 
coeff_t* dstCoeff, uint32_t log2TrSize,
+uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t* fencCoeff, 
int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize,
 TextType ttype, uint32_t absPartIdx)
 {
 uint32_t trSize = 1  log2TrSize;
@@ -614,7 +624,7 @@
 {
 level = xGetCodedLevel(costCoeff[scanPos], curCostSig, 
costSig[scanPos],
levelDouble, maxAbsLevel, 
baseLevel, greaterOneBits, levelAbsBits, goRiceParam,
-   c1c2Idx, qbits, scaleFactor, 1);
+   c1c2Idx, qbits, scaleFactor, 1, 
srcCoeff[blkPos], fencCoeff[blkPos]);
 sigRateDelta[blkPos] = 0;
 }
 else
@@ -631,7 +641,7 @@
 curCostSig = xGetRateSigCoef(1, ctxSig);
 level = xGetCodedLevel(costCoeff[scanPos], curCostSig, 
costSig[scanPos],
levelDouble, maxAbsLevel, 
baseLevel, greaterOneBits, levelAbsBits, goRiceParam,
-   c1c2Idx, qbits, scaleFactor, 0);
+   c1c2Idx, qbits, scaleFactor, 0, 
srcCoeff[blkPos], fencCoeff[blkPos]);
 }
 else
 {
@@ -1126,7 +1136,9 @@
 uint32_t c1c2Idx,
 int  qbits,
 double   scaleFactor,
-bool last) const
+bool last,
+int  signCoef,
+int  origCoef) const
 {
 uint32_t   bestAbsLevel = 0;
 
@@ -1155,7 +1167,18 @@
 for (int absLevel = maxAbsLevel; absLevel = minAbsLevel; absLevel--)
 {
 X265_CHECK(fabs((double)err2 - double(levelDouble  - (absLevel  

[x265] [PATCH 0 of 3 ] Remove TComDataCU dependencies from prediction/MC

2014-07-22 Thread deepthi
This will help pave the way for a much better-designed Analysis and Search
structure.
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH 1 of 3] TComPrediction: remove TComDataCU as pointer to private functions

2014-07-22 Thread deepthi
# HG changeset patch
# User Deepthi Nandakumar deep...@multicorewareinc.com
# Date 1405938991 -19800
#  Mon Jul 21 16:06:31 2014 +0530
# Node ID d8d26a695cf6734ad2180c4694360ef6e71ead81
# Parent  e3ad03b7c4854be40730645d4fe25e56a93f3f94
TComPrediction: remove TComDataCU as pointer to private functions

diff -r e3ad03b7c485 -r d8d26a695cf6 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp  Tue Jul 22 13:28:54 2014 -0500
+++ b/source/Lib/TLibCommon/TComPrediction.cpp  Mon Jul 21 16:06:31 2014 +0530
@@ -85,6 +85,7 @@
 
 void TComPrediction::initTempBuff(int csp)
 {
+m_csp = csp;
 m_hChromaShift = CHROMA_H_SHIFT(csp);
 m_vChromaShift = CHROMA_V_SHIFT(csp);
 
@@ -262,9 +263,11 @@
 MV mv = cu-getCUMvField(list)-getMv(partAddr);
 cu-clipMv(mv);
 if (bLuma)
-xPredInterLumaBlk(cu, 
cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), partAddr, mv, width, 
height, shortYuv);
+
xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(),
+partAddr, mv, width, height, shortYuv);
 if (bChroma)
-xPredInterChromaBlk(cu, 
cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), partAddr, mv, width, 
height, shortYuv);
+
xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(), 
+partAddr, mv, width, height, shortYuv);
 
 xWeightedPredictionUni(cu, shortYuv, partAddr, width, height, 
list, predYuv, -1, bLuma, bChroma);
 }
@@ -291,10 +294,12 @@
 cu-clipMv(mv);
 
 if (bLuma)
-xPredInterLumaBlk(cu, 
cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, 
height, outPredYuv);
+
xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(), 
+partAddr, mv, width, height, outPredYuv);
 
 if (bChroma)
-xPredInterChromaBlk(cu, 
cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, 
height, outPredYuv);
+
xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(), 
+partAddr, mv, width, height, outPredYuv);
 }
 
 void TComPrediction::xPredInterUni(TComDataCU* cu, uint32_t partAddr, int 
width, int height, int list, ShortYuv* outPredYuv, bool bLuma, bool bChroma)
@@ -307,9 +312,11 @@
 cu-clipMv(mv);
 
 if (bLuma)
-xPredInterLumaBlk(cu, 
cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, 
height, outPredYuv);
+
xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(), 
+partAddr, mv, width, height, outPredYuv);
 if (bChroma)
-xPredInterChromaBlk(cu, 
cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, width, 
height, outPredYuv);
+
xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(), 
+partAddr, mv, width, height, outPredYuv);
 }
 
 void TComPrediction::xPredInterBi(TComDataCU* cu, uint32_t partAddr, int 
width, int height, TComYuv* outPredYuv, bool bLuma, bool bChroma)
@@ -378,7 +385,7 @@
  * \param height   Height of block
  * \param dstPic   Pointer to destination picture
  */
-void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, 
uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic)
+void TComPrediction::xPredInterLumaBlk(TComPicYuv *refPic, uint32_t cuAddr, 
uint32_t zOrderIdxinCU, uint32_t partAddr, MV *mv, int width, int height, 
TComYuv *dstPic)
 {
 int dstStride = dstPic-getStride();
 pixel *dst= dstPic-getLumaAddr(partAddr);
@@ -386,7 +393,7 @@
 int srcStride = refPic-getStride();
 int srcOffset = (mv-x  2) + (mv-y  2) * srcStride;
 int partEnum = partitionFromSizes(width, height);
-pixel* src = refPic-getLumaAddr(cu-getAddr(), cu-getZorderIdxInCU() + 
partAddr) + srcOffset;
+pixel* src = refPic-getLumaAddr(cuAddr, zOrderIdxinCU + partAddr) + 
srcOffset;
 
 int xFrac = mv-x  0x3;
 int yFrac = mv-y  0x3;
@@ -414,11 +421,11 @@
 }
 
 //Motion compensated block for biprediction
-void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, 
uint32_t partAddr, MV *mv, int width, int height, ShortYuv *dstPic)
+void TComPrediction::xPredInterLumaBlk(TComPicYuv *refPic, uint32_t cuAddr, 
uint32_t zOrderIdxinCU, uint32_t partAddr, MV *mv, int width, int height, 
ShortYuv *dstPic)
 {
 int refStride = refPic-getStride();
 int refOffset = (mv-x  2) + (mv-y  2) * refStride;
-pixel *ref= refPic-getLumaAddr(cu-getAddr(), cu-getZorderIdxInCU() 
+ partAddr) + refOffset;
+pixel *ref= refPic-getLumaAddr(cuAddr, zOrderIdxinCU + partAddr) + 
refOffset;
 
 int dstStride = 

[x265] [PATCH 2 of 3] TComPrediction: save CU data (partAddr, width, height) as member fields

2014-07-22 Thread deepthi
# HG changeset patch
# User Deepthi Nandakumar deep...@multicorewareinc.com
# Date 1406028818 -19800
#  Tue Jul 22 17:03:38 2014 +0530
# Node ID a5422a41c85ac06fc773f1179e6fbea1a80a5e98
# Parent  d8d26a695cf6734ad2180c4694360ef6e71ead81
TComPrediction: save CU data (partAddr, width, height) as member fields

Before motion compensation, save CU related data inside the TComPrediction 
structure

diff -r d8d26a695cf6 -r a5422a41c85a source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp  Mon Jul 21 16:06:31 2014 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp  Tue Jul 22 17:03:38 2014 +0530
@@ -223,18 +223,18 @@
  * \param TComDataCU* cu
  * \param uint32_t PartAddr
  */
-bool TComPrediction::xCheckIdenticalMotion(TComDataCU* cu, uint32_t partAddr)
+bool TComPrediction::xCheckIdenticalMotion(TComDataCU* cu)
 {
 X265_CHECK(cu-m_slice-isInterB(), identical motion check in P frame\n);
 if (!cu-m_slice-m_pps-bUseWeightedBiPred)
 {
-int refIdxL0 = cu-getCUMvField(0)-getRefIdx(partAddr);
-int refIdxL1 = cu-getCUMvField(1)-getRefIdx(partAddr);
+int refIdxL0 = cu-getCUMvField(0)-getRefIdx(m_partAddr);
+int refIdxL1 = cu-getCUMvField(1)-getRefIdx(m_partAddr);
 if (refIdxL0 = 0  refIdxL1 = 0)
 {
 int refPOCL0 = cu-m_slice-m_refPOCList[0][refIdxL0];
 int refPOCL1 = cu-m_slice-m_refPOCList[1][refIdxL1];
-if (refPOCL0 == refPOCL1  cu-getCUMvField(0)-getMv(partAddr) 
== cu-getCUMvField(1)-getMv(partAddr))
+if (refPOCL0 == refPOCL1  cu-getCUMvField(0)-getMv(m_partAddr) 
== cu-getCUMvField(1)-getMv(m_partAddr))
 return true;
 }
 }
@@ -243,89 +243,83 @@
 
 void TComPrediction::motionCompensation(TComDataCU* cu, TComYuv* predYuv, int 
list, int partIdx, bool bLuma, bool bChroma)
 {
-int  width;
-int  height;
-uint32_t partAddr;
+X265_CHECK(partIdx = 0, partidx is not positive\n);
+
+if (cu-m_slice-isInterP())
+list = REF_PIC_LIST_0;
+if (list != REF_PIC_LIST_X)
+{
+if (cu-m_slice-m_pps-bUseWeightPred)
+{
+ShortYuv* shortYuv = m_predShortYuv[0];
+int refId = cu-getCUMvField(list)-getRefIdx(m_partAddr);
+X265_CHECK(refId = 0, refidx is not positive\n);
 
-X265_CHECK(partIdx = 0, partidx is not positive\n);
-{
-cu-getPartIndexAndSize(partIdx, partAddr, width, height);
-if (cu-m_slice-isInterP())
-list = REF_PIC_LIST_0;
-if (list != REF_PIC_LIST_X)
-{
-if (cu-m_slice-m_pps-bUseWeightPred)
-{
-ShortYuv* shortYuv = m_predShortYuv[0];
-int refId = cu-getCUMvField(list)-getRefIdx(partAddr);
-X265_CHECK(refId = 0, refidx is not positive\n);
+MV mv = cu-getCUMvField(list)-getMv(m_partAddr);
+cu-clipMv(mv);
+if (bLuma)
+
xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(),
+mv, shortYuv);
+if (bChroma)
+
xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(), 
+mv, shortYuv);
 
-MV mv = cu-getCUMvField(list)-getMv(partAddr);
-cu-clipMv(mv);
-if (bLuma)
-
xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(),
-partAddr, mv, width, height, shortYuv);
-if (bChroma)
-
xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), 
cu-getAddr(), cu-getZorderIdxInCU(), 
-partAddr, mv, width, height, shortYuv);
-
-xWeightedPredictionUni(cu, shortYuv, partAddr, width, height, 
list, predYuv, -1, bLuma, bChroma);
-}
-else
-xPredInterUni(cu, partAddr, width, height, list, predYuv, 
bLuma, bChroma);
+xWeightedPredictionUni(cu, shortYuv, m_partAddr, m_width, 
m_height, list, predYuv, -1, bLuma, bChroma);
 }
 else
-{
-if (xCheckIdenticalMotion(cu, partAddr))
-xPredInterUni(cu, partAddr, width, height, REF_PIC_LIST_0, 
predYuv, bLuma, bChroma);
-else
-xPredInterBi(cu, partAddr, width, height, predYuv, bLuma, 
bChroma);
-}
+xPredInterUni(cu, list, predYuv, bLuma, bChroma);
+}
+else
+{
+if (xCheckIdenticalMotion(cu))
+xPredInterUni(cu, REF_PIC_LIST_0, predYuv, bLuma, bChroma);
+else
+xPredInterBi(cu, predYuv, bLuma, bChroma);
 }
 }
 
-void TComPrediction::xPredInterUni(TComDataCU* cu, uint32_t partAddr, int 
width, int height, int list, TComYuv* outPredYuv, bool bLuma, bool bChroma)
+void 

[x265] [PATCH 3 of 3] TComPrediction: remove redundant colorspace information

2014-07-22 Thread deepthi
# HG changeset patch
# User Deepthi Nandakumar deep...@multicorewareinc.com
# Date 1406039346 -19800
#  Tue Jul 22 19:59:06 2014 +0530
# Node ID 30f41c3ef7d39a6e341bd149adf6d57267984ec7
# Parent  a5422a41c85ac06fc773f1179e6fbea1a80a5e98
TComPrediction: remove redundant colorspace information

m_csp is sufficient inside the TEncSearch, TComPrediction structures

diff -r a5422a41c85a -r 30f41c3ef7d3 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp  Tue Jul 22 17:03:38 2014 +0530
+++ b/source/Lib/TLibCommon/TComPrediction.cpp  Tue Jul 22 19:59:06 2014 +0530
@@ -86,8 +86,6 @@
 void TComPrediction::initTempBuff(int csp)
 {
 m_csp = csp;
-m_hChromaShift = CHROMA_H_SHIFT(csp);
-m_vChromaShift = CHROMA_V_SHIFT(csp);
 
 if (m_predBuf == NULL)
 {
@@ -470,8 +468,11 @@
 int refStride = refPic-getCStride();
 int dstStride = dstPic-getCStride();
 
-int shiftHor = (2 + m_hChromaShift);
-int shiftVer = (2 + m_vChromaShift);
+int hChromaShift = CHROMA_H_SHIFT(m_csp);
+int vChromaShift = CHROMA_V_SHIFT(m_csp);
+
+int shiftHor = (2 + hChromaShift);
+int shiftVer = (2 + vChromaShift);
 
 int refOffset = (mv-x  shiftHor) + (mv-y  shiftVer) * refStride;
 
@@ -493,25 +494,25 @@
 }
 else if (yFrac == 0)
 {
-primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, 
dstStride, xFrac  (1 - m_hChromaShift));
-primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, 
dstStride, xFrac  (1 - m_hChromaShift));
+primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, dstCb, 
dstStride, xFrac  (1 - hChromaShift));
+primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, dstCr, 
dstStride, xFrac  (1 - hChromaShift));
 }
 else if (xFrac == 0)
 {
-primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, 
dstStride, yFrac  (1 - m_vChromaShift));
-primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, 
dstStride, yFrac  (1 - m_vChromaShift));
+primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, dstCb, 
dstStride, yFrac  (1 - vChromaShift));
+primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, dstCr, 
dstStride, yFrac  (1 - vChromaShift));
 }
 else
 {
-int extStride = m_width  m_hChromaShift;
+int extStride = m_width  hChromaShift;
 int filterSize = NTAPS_CHROMA;
 int halfFilterSize = (filterSize  1);
 
-primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, 
m_immedVals, extStride, xFrac  (1 - m_hChromaShift), 1);
-primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + 
(halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac  (1 - 
m_vChromaShift));
+primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, 
m_immedVals, extStride, xFrac  (1 - hChromaShift), 1);
+primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + 
(halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac  (1 - 
vChromaShift));
 
-primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, 
m_immedVals, extStride, xFrac  (1 - m_hChromaShift), 1);
-primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + 
(halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac  (1 - 
m_vChromaShift));
+primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, 
m_immedVals, extStride, xFrac  (1 - hChromaShift), 1);
+primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + 
(halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac  (1 - 
vChromaShift));
 }
 }
 
@@ -520,9 +521,11 @@
 {
 int refStride = refPic-getCStride();
 int dstStride = dstPic-m_cwidth;
+int hChromaShift = CHROMA_H_SHIFT(m_csp);
+int vChromaShift = CHROMA_V_SHIFT(m_csp);
 
-int shiftHor = (2 + m_hChromaShift);
-int shiftVer = (2 + m_vChromaShift);
+int shiftHor = (2 + hChromaShift);
+int shiftVer = (2 + vChromaShift);
 
 int refOffset = (mv-x  shiftHor) + (mv-y  shiftVer) * refStride;
 
@@ -537,8 +540,8 @@
 
 int partEnum = partitionFromSizes(m_width, m_height);
 
-uint32_t cxWidth  = m_widthm_hChromaShift;
-uint32_t cxHeight = m_height  m_vChromaShift;
+uint32_t cxWidth  = m_widthhChromaShift;
+uint32_t cxHeight = m_height  vChromaShift;
 
 X265_CHECK(((cxWidth | cxHeight) % 2) == 0, chroma block size expected to 
be multiple of 2\n);
 
@@ -549,23 +552,23 @@
 }
 else if (yFrac == 0)
 {
-primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, dstCb, 
dstStride, xFrac  (1 - m_hChromaShift), 0);
-primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, dstCr, 
dstStride, xFrac  (1 - m_hChromaShift), 0);
+primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, dstCb, 
dstStride, xFrac  (1 - hChromaShift), 0);
+

Re: [x265] [PATCH] psyrdoq: implementation of psyrdoq

2014-07-22 Thread Deepthi Nandakumar
Does it make sense to try this for DC coefficients?


On Tue, Jul 22, 2014 at 9:24 PM, Steve Borho st...@borho.org wrote:

 On 07/22, sumala...@multicorewareinc.com wrote:
  # HG changeset patch
  # User Sumalatha Polureddysumala...@multicorewareinc.com
  # Date 1406032149 -19800
  # Node ID 37e03dcd2e4f0b5894880ff8c097bd6e11590459
  # Parent  d303b4d860e9f06396a156726dd518d0f41fe796
  psyrdoq: implementation of psyrdoq
 
  diff -r d303b4d860e9 -r 37e03dcd2e4f
 source/Lib/TLibCommon/TComTrQuant.cpp
  --- a/source/Lib/TLibCommon/TComTrQuant.cpp   Mon Jul 21 22:43:38 2014
 -0500
  +++ b/source/Lib/TLibCommon/TComTrQuant.cpp   Tue Jul 22 17:59:09 2014
 +0530
  @@ -64,6 +64,8 @@
   return y + ((x - y)  ((x - y)  (sizeof(int) * CHAR_BIT - 1)));
 // min(x, y)
   }
 
  +#define SIGN(x,y) ((x^(y  31))-(y  31))
  +
   //
 
   // TComTrQuant class member functions
   //
 
  @@ -307,6 +309,8 @@
   }
 
   uint32_t TComTrQuant::transformNxN(TComDataCU* cu,
  +   pixel*  fenc,
  +   uint32_tfencStride,
  int16_t*residual,
  uint32_tstride,
  coeff_t*coeff,
  @@ -316,10 +320,10 @@
  booluseTransformSkip,
  boolcurUseRDOQ)
   {
  +int trSize = 1  log2TrSize;
   if (cu-getCUTransquantBypass(absPartIdx))
   {
   uint32_t numSig = 0;
  -int trSize = 1  log2TrSize;
   for (int k = 0; k  trSize; k++)
   {
   for (int j = 0; j  trSize; j++)
  @@ -339,6 +343,12 @@
   const uint32_t sizeIdx = log2TrSize - 2;
   int useDST = (sizeIdx == 0  ttype == TEXT_LUMA 
 cu-getPredictionMode(absPartIdx) == MODE_INTRA);
   int index = DCT_4x4 + sizeIdx - useDST;
  +if (psyRdoqEnabled())
  +{
  +// converting pixel to int and putting in separate buffer
 to take dct
  +primitives.square_copy_ps[sizeIdx](m_tmpfencBuf,
 MAX_CU_SIZE, fenc, fencStride);
  +primitives.dct[index](m_tmpfencBuf, m_tmpfencCoeff, stride);
  +}
   primitives.dct[index](residual, m_tmpCoeff, stride);
   if (m_nr-bNoiseReduction)
   {
  @@ -356,7 +366,7 @@
 
   if (m_useRDOQ  curUseRDOQ)
   {
  -return xRateDistOptQuant(cu, m_tmpCoeff, coeff, log2TrSize,
 ttype, absPartIdx);
  +return xRateDistOptQuant(cu, m_tmpfencCoeff, m_tmpCoeff, coeff,
 log2TrSize, ttype, absPartIdx);
   }
   return xQuant(cu, m_tmpCoeff, coeff, log2TrSize, ttype, absPartIdx);
   }
  @@ -505,7 +515,7 @@
* Rate distortion optimized quantization for entropy
* coding engines using probability models like CABAC
*/
  -uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t*
 srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize,
  +uint32_t TComTrQuant::xRateDistOptQuant(TComDataCU* cu, int32_t*
 fencCoeff, int32_t* srcCoeff, coeff_t* dstCoeff, uint32_t log2TrSize,
   TextType ttype, uint32_t
 absPartIdx)
   {
   uint32_t trSize = 1  log2TrSize;
  @@ -614,7 +624,7 @@
   {
   level = xGetCodedLevel(costCoeff[scanPos],
 curCostSig, costSig[scanPos],
  levelDouble, maxAbsLevel,
 baseLevel, greaterOneBits, levelAbsBits, goRiceParam,
  -   c1c2Idx, qbits, scaleFactor,
 1);
  +   c1c2Idx, qbits, scaleFactor,
 1, srcCoeff[blkPos], fencCoeff[blkPos]);
   sigRateDelta[blkPos] = 0;
   }
   else
  @@ -631,7 +641,7 @@
   curCostSig = xGetRateSigCoef(1, ctxSig);
   level = xGetCodedLevel(costCoeff[scanPos],
 curCostSig, costSig[scanPos],
  levelDouble,
 maxAbsLevel, baseLevel, greaterOneBits, levelAbsBits, goRiceParam,
  -   c1c2Idx, qbits,
 scaleFactor, 0);
  +   c1c2Idx, qbits,
 scaleFactor, 0, srcCoeff[blkPos], fencCoeff[blkPos]);
   }
   else
   {
  @@ -1126,7 +1136,9 @@
   uint32_t c1c2Idx,
   int  qbits,
   double   scaleFactor,
  -bool last) const
  +bool last,
 

Re: [x265] [PATCH 1 of 3] TComPrediction: remove TComDataCU as pointer to private functions

2014-07-22 Thread Steve Borho
On 07/23, deep...@multicorewareinc.com wrote:
 # HG changeset patch
 # User Deepthi Nandakumar deep...@multicorewareinc.com
 # Date 1405938991 -19800
 #  Mon Jul 21 16:06:31 2014 +0530
 # Node ID d8d26a695cf6734ad2180c4694360ef6e71ead81
 # Parent  e3ad03b7c4854be40730645d4fe25e56a93f3f94
 TComPrediction: remove TComDataCU as pointer to private functions
 
 diff -r e3ad03b7c485 -r d8d26a695cf6 source/Lib/TLibCommon/TComPrediction.cpp
 --- a/source/Lib/TLibCommon/TComPrediction.cppTue Jul 22 13:28:54 
 2014 -0500
 +++ b/source/Lib/TLibCommon/TComPrediction.cppMon Jul 21 16:06:31 
 2014 +0530
 @@ -85,6 +85,7 @@
  
  void TComPrediction::initTempBuff(int csp)
  {
 +m_csp = csp;
  m_hChromaShift = CHROMA_H_SHIFT(csp);
  m_vChromaShift = CHROMA_V_SHIFT(csp);
  
 @@ -262,9 +263,11 @@
  MV mv = cu-getCUMvField(list)-getMv(partAddr);
  cu-clipMv(mv);
  if (bLuma)
 -xPredInterLumaBlk(cu, 
 cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), partAddr, mv, width, 
 height, shortYuv);
 +
 xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), 
 cu-getAddr(), cu-getZorderIdxInCU(),
 +partAddr, mv, width, height, shortYuv);
  if (bChroma)
 -xPredInterChromaBlk(cu, 
 cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), partAddr, mv, width, 
 height, shortYuv);
 +
 xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refId]-getPicYuvRec(), 
 cu-getAddr(), cu-getZorderIdxInCU(), 
 +partAddr, mv, width, height, shortYuv);

We try to align the second row of arguments with the open paren `(`.
  
  xWeightedPredictionUni(cu, shortYuv, partAddr, width, 
 height, list, predYuv, -1, bLuma, bChroma);
  }
 @@ -291,10 +294,12 @@
  cu-clipMv(mv);
  
  if (bLuma)
 -xPredInterLumaBlk(cu, 
 cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, 
 width, height, outPredYuv);
 +
 xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), 
 cu-getAddr(), cu-getZorderIdxInCU(), 
 +partAddr, mv, width, height, outPredYuv);
  
  if (bChroma)
 -xPredInterChromaBlk(cu, 
 cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, 
 width, height, outPredYuv);
 +
 xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), 
 cu-getAddr(), cu-getZorderIdxInCU(), 
 +partAddr, mv, width, height, outPredYuv);
  }
  
  void TComPrediction::xPredInterUni(TComDataCU* cu, uint32_t partAddr, int 
 width, int height, int list, ShortYuv* outPredYuv, bool bLuma, bool bChroma)
 @@ -307,9 +312,11 @@
  cu-clipMv(mv);
  
  if (bLuma)
 -xPredInterLumaBlk(cu, 
 cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, 
 width, height, outPredYuv);
 +
 xPredInterLumaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), 
 cu-getAddr(), cu-getZorderIdxInCU(), 
 +partAddr, mv, width, height, outPredYuv);
  if (bChroma)
 -xPredInterChromaBlk(cu, 
 cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), partAddr, mv, 
 width, height, outPredYuv);
 +
 xPredInterChromaBlk(cu-m_slice-m_refPicList[list][refIdx]-getPicYuvRec(), 
 cu-getAddr(), cu-getZorderIdxInCU(), 
 +partAddr, mv, width, height, outPredYuv);
  }
  
  void TComPrediction::xPredInterBi(TComDataCU* cu, uint32_t partAddr, int 
 width, int height, TComYuv* outPredYuv, bool bLuma, bool bChroma)
 @@ -378,7 +385,7 @@
   * \param height   Height of block
   * \param dstPic   Pointer to destination picture
   */
 -void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, 
 uint32_t partAddr, MV *mv, int width, int height, TComYuv *dstPic)
 +void TComPrediction::xPredInterLumaBlk(TComPicYuv *refPic, uint32_t cuAddr, 
 uint32_t zOrderIdxinCU, uint32_t partAddr, MV *mv, int width, int height, 
 TComYuv *dstPic)

These `x` Hungarian prefixes on the function names in this class really
hurt my eyes (the HM used them for internal functions). I hope those
can be removed in a future patch.

  {
  int dstStride = dstPic-getStride();
  pixel *dst= dstPic-getLumaAddr(partAddr);
 @@ -386,7 +393,7 @@
  int srcStride = refPic-getStride();
  int srcOffset = (mv-x  2) + (mv-y  2) * srcStride;
  int partEnum = partitionFromSizes(width, height);
 -pixel* src = refPic-getLumaAddr(cu-getAddr(), cu-getZorderIdxInCU() + 
 partAddr) + srcOffset;
 +pixel* src = refPic-getLumaAddr(cuAddr, zOrderIdxinCU + partAddr) + 
 srcOffset;
  
  int xFrac = mv-x  0x3;
  int yFrac = mv-y  0x3;
 @@ -414,11 +421,11 @@
  }
  
  //Motion compensated block for biprediction
 -void TComPrediction::xPredInterLumaBlk(TComDataCU *cu, TComPicYuv *refPic, 
 uint32_t partAddr, MV *mv, int width, int height, ShortYuv *dstPic)
 +void TComPrediction::xPredInterLumaBlk(TComPicYuv *refPic, 

Re: [x265] [PATCH 3 of 3] TComPrediction: remove redundant colorspace information

2014-07-22 Thread Steve Borho
On 07/23, deep...@multicorewareinc.com wrote:
 # HG changeset patch
 # User Deepthi Nandakumar deep...@multicorewareinc.com
 # Date 1406039346 -19800
 #  Tue Jul 22 19:59:06 2014 +0530
 # Node ID 30f41c3ef7d39a6e341bd149adf6d57267984ec7
 # Parent  a5422a41c85ac06fc773f1179e6fbea1a80a5e98
 TComPrediction: remove redundant colorspace information
 
 m_csp is sufficient inside the TEncSearch, TComPrediction structures
 
 diff -r a5422a41c85a -r 30f41c3ef7d3 source/Lib/TLibCommon/TComPrediction.cpp
 --- a/source/Lib/TLibCommon/TComPrediction.cppTue Jul 22 17:03:38 
 2014 +0530
 +++ b/source/Lib/TLibCommon/TComPrediction.cppTue Jul 22 19:59:06 
 2014 +0530
 @@ -86,8 +86,6 @@
  void TComPrediction::initTempBuff(int csp)
  {
  m_csp = csp;
 -m_hChromaShift = CHROMA_H_SHIFT(csp);
 -m_vChromaShift = CHROMA_V_SHIFT(csp);
  
  if (m_predBuf == NULL)
  {
 @@ -470,8 +468,11 @@
  int refStride = refPic-getCStride();
  int dstStride = dstPic-getCStride();
  
 -int shiftHor = (2 + m_hChromaShift);
 -int shiftVer = (2 + m_vChromaShift);
 +int hChromaShift = CHROMA_H_SHIFT(m_csp);
 +int vChromaShift = CHROMA_V_SHIFT(m_csp);

ok

Somewhere down the line we should make a build option that makes
CHROMA_V_SHIFT() and CHROMA_H_SHIFT() and similar macros return
hard-coded 4:2:0 values, to squeeze out a few more ounces of perf when
the user is uninterested in 4:2:2 or 4:4:4.

 +
 +int shiftHor = (2 + hChromaShift);
 +int shiftVer = (2 + vChromaShift);
  
  int refOffset = (mv-x  shiftHor) + (mv-y  shiftVer) * refStride;
  
 @@ -493,25 +494,25 @@
  }
  else if (yFrac == 0)
  {
 -primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, 
 dstCb, dstStride, xFrac  (1 - m_hChromaShift));
 -primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, 
 dstCr, dstStride, xFrac  (1 - m_hChromaShift));
 +primitives.chroma[m_csp].filter_hpp[partEnum](refCb, refStride, 
 dstCb, dstStride, xFrac  (1 - hChromaShift));
 +primitives.chroma[m_csp].filter_hpp[partEnum](refCr, refStride, 
 dstCr, dstStride, xFrac  (1 - hChromaShift));
  }
  else if (xFrac == 0)
  {
 -primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, 
 dstCb, dstStride, yFrac  (1 - m_vChromaShift));
 -primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, 
 dstCr, dstStride, yFrac  (1 - m_vChromaShift));
 +primitives.chroma[m_csp].filter_vpp[partEnum](refCb, refStride, 
 dstCb, dstStride, yFrac  (1 - vChromaShift));
 +primitives.chroma[m_csp].filter_vpp[partEnum](refCr, refStride, 
 dstCr, dstStride, yFrac  (1 - vChromaShift));
  }
  else
  {
 -int extStride = m_width  m_hChromaShift;
 +int extStride = m_width  hChromaShift;
  int filterSize = NTAPS_CHROMA;
  int halfFilterSize = (filterSize  1);
  
 -primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, 
 m_immedVals, extStride, xFrac  (1 - m_hChromaShift), 1);
 -primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + 
 (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac  (1 - 
 m_vChromaShift));
 +primitives.chroma[m_csp].filter_hps[partEnum](refCb, refStride, 
 m_immedVals, extStride, xFrac  (1 - hChromaShift), 1);
 +primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + 
 (halfFilterSize - 1) * extStride, extStride, dstCb, dstStride, yFrac  (1 - 
 vChromaShift));
  
 -primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, 
 m_immedVals, extStride, xFrac  (1 - m_hChromaShift), 1);
 -primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + 
 (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac  (1 - 
 m_vChromaShift));
 +primitives.chroma[m_csp].filter_hps[partEnum](refCr, refStride, 
 m_immedVals, extStride, xFrac  (1 - hChromaShift), 1);
 +primitives.chroma[m_csp].filter_vsp[partEnum](m_immedVals + 
 (halfFilterSize - 1) * extStride, extStride, dstCr, dstStride, yFrac  (1 - 
 vChromaShift));
  }
  }
  
 @@ -520,9 +521,11 @@
  {
  int refStride = refPic-getCStride();
  int dstStride = dstPic-m_cwidth;
 +int hChromaShift = CHROMA_H_SHIFT(m_csp);
 +int vChromaShift = CHROMA_V_SHIFT(m_csp);
  
 -int shiftHor = (2 + m_hChromaShift);
 -int shiftVer = (2 + m_vChromaShift);
 +int shiftHor = (2 + hChromaShift);
 +int shiftVer = (2 + vChromaShift);
  
  int refOffset = (mv-x  shiftHor) + (mv-y  shiftVer) * refStride;
  
 @@ -537,8 +540,8 @@
  
  int partEnum = partitionFromSizes(m_width, m_height);
  
 -uint32_t cxWidth  = m_widthm_hChromaShift;
 -uint32_t cxHeight = m_height  m_vChromaShift;
 +uint32_t cxWidth  = m_widthhChromaShift;
 +uint32_t cxHeight = m_height  vChromaShift;
  
  X265_CHECK(((cxWidth | cxHeight) % 2) == 0, chroma block size expected 
 to be multiple of 2\n);
  
 @@ -549,23 +552,23 

Re: [x265] [PATCH] psyrdoq: implementation of psyrdoq

2014-07-22 Thread Steve Borho
On 07/23, Deepthi Nandakumar wrote:
 Does it make sense to try this for DC coefficients?

My understanding is that it is not helpful, and possibly harmful.

We don't want to bias the DC coefficient in any way.

snipped

-- 
Steve Borho
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH] psyrdoq: implementation of psyrdoq

2014-07-22 Thread Deepthi Nandakumar
OK, that's what I thought too.
On Jul 23, 2014 8:55 AM, Steve Borho st...@borho.org wrote:

 On 07/23, Deepthi Nandakumar wrote:
  Does it make sense to try this for DC coefficients?

 my understanding is that it is not helpful, and possibly harmful.

 we don't want to bias the DC coefficient in any way.

 snipped

 --
 Steve Borho
 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] trquant: store QpParam for each component

2014-07-22 Thread Steve Borho
On 07/21, Satoshi Nakagawa wrote:
 # HG changeset patch
 # User Satoshi Nakagawa nakagawa...@oki.com
 # Date 1405905842 -32400
 #  Mon Jul 21 10:24:02 2014 +0900
 # Node ID b40af94fd00f5f23a22854aaf498ffef32910110
 # Parent  eb983d29c11acc03b91e07fe93c31503fa3a4732
 trquant: store QpParam for each component

Hello Satoshi,

This patch looked harmless, so I had queued it without sending it
through regression testing, but it turned out to cause hash mismatches.

The quickest repro case I know of is this:

x265 BasketBallDrive_1920x1080_50.y4m --bitrate 4000 --rd 5 --hash 1 -f 50

It causes a hash mismatch right away and eventually a decoder crash. If
you can fix it today, I won't back this out.

 diff -r eb983d29c11a -r b40af94fd00f source/Lib/TLibCommon/TComLoopFilter.cpp
 --- a/source/Lib/TLibCommon/TComLoopFilter.cppThu Jul 17 09:29:39 
 2014 +0200
 +++ b/source/Lib/TLibCommon/TComLoopFilter.cppMon Jul 21 10:24:02 
 2014 +0900
 @@ -48,7 +48,6 @@
  // 
 
  // Constants
  // 
 
 -#define QpUV(iQpY, chFmt)  (((iQpY)  0) ? (iQpY) : (((iQpY)  57) ? ((iQpY) 
 - 6) : g_chromaScale[chFmt][(iQpY)]))
  #define DEFAULT_INTRA_TC_OFFSET 2 /// Default intra TC offset
  
  // 
 
 @@ -441,9 +440,6 @@
  pixel* tmpsrc = src;
  
  int stride = reconYuv-getStride();
 -int qp = 0;
 -int qpP = 0;
 -int qpQ = 0;
  uint32_t numParts = cu-m_pic-getNumPartInCUSize()  depth;
  
  uint32_t log2UnitSize = g_log2UnitSize;
 @@ -457,8 +453,8 @@
  uint32_t  partQ = 0;
  TComDataCU* cuP = cu;
  TComDataCU* cuQ = cu;
 -int  betaOffsetDiv2 = 
 cuQ-m_slice-m_pps-deblockingFilterBetaOffsetDiv2;
 -int  tcOffsetDiv2 = cuQ-m_slice-m_pps-deblockingFilterTcOffsetDiv2;
 +int  betaOffset = cuQ-m_slice-m_pps-deblockingFilterBetaOffsetDiv2  
 1;
 +int  tcOffset = cuQ-m_slice-m_pps-deblockingFilterTcOffsetDiv2  1;
  
  if (dir == EDGE_VER)
  {
 @@ -480,7 +476,7 @@
  bs = blockingStrength[bsAbsIdx];
  if (bs)
  {
 -qpQ = cu-getQP(bsAbsIdx);
 +int qpQ = cu-getQP(bsAbsIdx);
  partQ = bsAbsIdx;
  // Derive neighboring PU index
  if (dir == EDGE_VER)
 @@ -492,12 +488,12 @@
  cuP = cuQ-getPUAbove(partP, partQ);
  }
  
 -qpP = cuP-getQP(partP);
 -qp = (qpP + qpQ + 1)  1;
 +int qpP = cuP-getQP(partP);
 +int qp = (qpP + qpQ + 1)  1;
  int bitdepthScale = 1  (X265_DEPTH - 8);
  
 -int indexTC = Clip3(0, MAX_QP + DEFAULT_INTRA_TC_OFFSET, int(qp 
 + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + (tcOffsetDiv2  1)));
 -int indexB = Clip3(0, MAX_QP, qp + (betaOffsetDiv2  1));
 +int indexTC = Clip3(0, MAX_QP + DEFAULT_INTRA_TC_OFFSET, int(qp 
 + DEFAULT_INTRA_TC_OFFSET * (bs - 1) + tcOffset));
 +int indexB = Clip3(0, MAX_QP, qp + betaOffset);
  
  int tc =  sm_tcTable[indexTC] * bitdepthScale;
  int beta = sm_betaTable[indexB] * bitdepthScale;
 @@ -544,13 +540,11 @@
  
  void TComLoopFilter::xEdgeFilterChroma(TComDataCU* cu, uint32_t 
 absZOrderIdx, uint32_t depth, int dir, int edge, uint8_t blockingStrength[])
  {
 +int chFmt = cu-getChromaFormat();
  TComPicYuv* reconYuv = cu-m_pic-getPicYuvRec();
  int stride = reconYuv-getCStride();
  pixel* srcCb = reconYuv-getCbAddr(cu-getAddr(), absZOrderIdx);
  pixel* srcCr = reconYuv-getCrAddr(cu-getAddr(), absZOrderIdx);
 -int qp = 0;
 -int qpP = 0;
 -int qpQ = 0;
  uint32_t log2UnitSizeH = g_log2UnitSize - cu-getHorzChromaShift();
  uint32_t log2UnitSizeV = g_log2UnitSize - cu-getVertChromaShift();
  uint32_t unitSizeChromaH = 1  log2UnitSizeH;
 @@ -565,7 +559,7 @@
  uint32_t  partQ;
  TComDataCU* cuP;
  TComDataCU* cuQ = cu;
 -int tcOffsetDiv2 = cu-m_slice-m_pps-deblockingFilterTcOffsetDiv2;
 +int tcOffset = cu-m_slice-m_pps-deblockingFilterTcOffsetDiv2  1;
  
  // Vertical Position
  uint32_t edgeNumInLCUVert = g_zscanToRaster[absZOrderIdx] % 
 lcuWidthInBaseUnits + edge;
 @@ -611,7 +605,7 @@
  
  if (bs  1)
  {
 -qpQ = cu-getQP(bsAbsIdx);
 +int qpQ = cu-getQP(bsAbsIdx);
  partQ = bsAbsIdx;
  // Derive neighboring PU index
  if (dir == EDGE_VER)
 @@ -623,7 +617,7 @@
  cuP = cuQ-getPUAbove(partP, partQ);
  }
  
 -qpP = cuP-getQP(partP);
 +int qpP = cuP-getQP(partP);
  
  if (cu-m_slice-m_pps-bTransquantBypassEnabled)
  {
 @@ -636,10 +630,17 @@
  

[x265] [PATCH] analysis: setQPforQuant in checkIntraInter to fix the hash mismatch at rd=5&6

2014-07-22 Thread gopu
# HG changeset patch
# User Gopu Govindaswamy g...@multicorewareinc.com
# Date 1406094393 -19800
#  Wed Jul 23 11:16:33 2014 +0530
# Node ID 1beaaabef3eb6d3e832102ed7dafcd855c1d7298
# Parent  e3ad03b7c4854be40730645d4fe25e56a93f3f94
analysis: setQPforQuant in checkIntraInter to fix the hash mismatch at rd=5&6

diff -r e3ad03b7c485 -r 1beaaabef3eb source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Tue Jul 22 13:28:54 2014 -0500
+++ b/source/encoder/analysis.cpp   Wed Jul 23 11:16:33 2014 +0530
@@ -1722,6 +1722,7 @@
 
 PPAScopeEvent(CheckRDCostIntra + depth);
 
+m_trQuant.setQPforQuant(outTempCU);
 outTempCU-setSkipFlagSubParts(false, 0, depth);
 outTempCU-setPartSizeSubParts(partSize, 0, depth);
 outTempCU-setPredModeSubParts(MODE_INTRA, 0, depth);
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel