[x265] [PATCH] analysis: Intra picture estimation information sharing

2014-09-16 Thread gopu
# HG changeset patch
# User Gopu Govindaswamy g...@multicorewareinc.com
# Date 1410857300 -19800
#  Tue Sep 16 14:18:20 2014 +0530
# Node ID 61dc8322e6c0af444ba591755c299b945e1e423a
# Parent  1de67321275e70d510f0df3d5b7d4b9d391a1e66
analysis: Intra picture estimation information sharing

When --analysis-mode=save, the encoder runs a full encode and dumps the
best split and mode decisions into x265_analysis.dat (the default file name
if no file name is provided).
When --analysis-mode=load, the encoder reads the best split and mode decisions
from x265_analysis.dat and bypasses the actual split and mode decisions, and
therefore performs a much faster encode.

diff -r 1de67321275e -r 61dc8322e6c0 source/Lib/TLibCommon/CommonDef.h
--- a/source/Lib/TLibCommon/CommonDef.h Mon Sep 15 15:00:13 2014 +0200
+++ b/source/Lib/TLibCommon/CommonDef.h Tue Sep 16 14:18:20 2014 +0530
@@ -100,4 +100,6 @@
 #define CHROMA_H_SHIFT(x) (x == X265_CSP_I420 || x == X265_CSP_I422)
 #define CHROMA_V_SHIFT(x) (x == X265_CSP_I420)
 
+#define CTU_TO_DEPTH_INDEX  22 // index to array containing increment 
offsets to add into zOrder to get next depth
+
 #endif // ifndef X265_COMMONDEF_H
diff -r 1de67321275e -r 61dc8322e6c0 source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp Mon Sep 15 15:00:13 2014 +0200
+++ b/source/Lib/TLibCommon/TComRom.cpp Tue Sep 16 14:18:20 2014 +0530
@@ -505,5 +505,18 @@
 0x38, 
 };
 
+/* Contains how much to increment shared depth buffer for different ctu 
sizes to get next best depth
+ * here, depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 
8x8
+ * if ctu = 64, depth buffer size is 256 combination of depth values 0, 1, 
2, 3
+ * if ctu = 32, depth buffer size is 64 combination of depth values 1, 2, 3
+ * if ctu = 16, depth buffer size is 16 combination of depth values 2, 3 */
+
+const uint32_t g_depthInc[3][4] =
+{
+{ 16,  4,  0, 0},
+{ 64, 16,  4, 1},
+{256, 64, 16, 4}
+};
+
 }
 //! \}
diff -r 1de67321275e -r 61dc8322e6c0 source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h   Mon Sep 15 15:00:13 2014 +0200
+++ b/source/Lib/TLibCommon/TComRom.h   Tue Sep 16 14:18:20 2014 +0530
@@ -155,6 +155,8 @@
 // Intra tables
 extern const uint8_t g_intraFilterFlags[35];
 
+extern const uint32_t g_depthInc[3][4];
+
 }
 
 #endif  //ifndef X265_TCOMROM_H
diff -r 1de67321275e -r 61dc8322e6c0 source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Mon Sep 15 15:00:13 2014 +0200
+++ b/source/encoder/analysis.cpp   Tue Sep 16 14:18:20 2014 +0530
@@ -311,14 +311,25 @@
 uint32_t numPartition = cu-getTotalNumPart();
 if (m_bestCU[0]-m_slice-m_sliceType == I_SLICE)
 {
-compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, 
cu-m_CULocalData);
-if (m_param-analysisMode == 1)
+if (m_param-analysisMode == X265_ANALYSIS_LOAD  
m_bestCU[0]-m_pic-m_intraData)
 {
-memcpy(m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getDepth(), sizeof(uint8_t) * 
cu-getTotalNumPart());
-memcpy(m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getLumaIntraDir(), sizeof(uint8_t) * 
cu-getTotalNumPart());
-memcpy(m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getPartitionSize(), sizeof(char) * 
cu-getTotalNumPart());
-m_bestCU[0]-m_pic-m_intraData-cuAddr[cu-getAddr()] = 
cu-getAddr();
-m_bestCU[0]-m_pic-m_intraData-poc[cu-getAddr()]= 
cu-m_pic-m_POC;
+uint32_t zOrder = 0;
+compressSharedIntraCTU(m_bestCU[0], m_tempCU[0], false, cu, 
cu-m_CULocalData, 
+m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() * 
cu-m_numPartitions],
+m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() * 
cu-m_numPartitions],
+m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() * 
cu-m_numPartitions], zOrder);
+}
+else
+{
+compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, 
cu-m_CULocalData);
+if (m_param-analysisMode == X265_ANALYSIS_SAVE  
m_bestCU[0]-m_pic-m_intraData)
+{
+memcpy(m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getDepth(), sizeof(uint8_t) * 
cu-getTotalNumPart());
+memcpy(m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getLumaIntraDir(), sizeof(uint8_t) * 
cu-getTotalNumPart());
+
memcpy(m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getPartitionSize(), sizeof(char) * 
cu-getTotalNumPart());
+m_bestCU[0]-m_pic-m_intraData-cuAddr[cu-getAddr()] = 
cu-getAddr();
+m_bestCU[0]-m_pic-m_intraData-poc[cu-getAddr()]= 
cu-m_pic-m_POC;
+}
 }
 if (m_param-bLogCuStats || m_param-rc.bStatWrite)

Re: [x265] [PATCH] analysis: Intra picture estimation information sharing

2014-09-16 Thread Steve Borho
On 09/16, Gopu Govindaswamy wrote:
 On Mon, Sep 15, 2014 at 4:10 PM, Steve Borho st...@borho.org wrote:
  We should probably also be setting the analysis pointers to NULL in the
  input picture structure prior to returning from x265_encoder_encode() so
  they do not accidentally re-use the same buffers for more than one
  picture.  In short, we need to be a lot more defensive about API abuses.
 
 
 I will make a separate patch for this, but I still need to verify it:
 the analysis buffer is used to dump the analysis data into the
 file after x265_encoder_encode().

You can think of this in terms of ownership.

1. user calls x265_alloc_analysis_data(x265_picture*), the user now owns
   these buffers in the x265_picture.

2. user calls x265_encoder_encode() and the encoder copies the analysis
   pointers into the internal Frame structure. Now the encoder owns
   those buffers. The pointers in the input x265_picture are now
   redundant, the user should not read/write/or modify those buffers
   while the encoder owns them

3. Once the frame is encoded, the buffer pointers are copied into the
   output picture structure. Now the user owns them again. They can do
   what they wish, possibly release them.

this is true of both load and save

   +}
   +else
   +{
   +compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu,
  cu-m_CULocalData);
   +if (m_param-analysisMode == 1)
   +{
   +
  memcpy(m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() *
  cu-m_numPartitions], m_bestCU[0]-getDepth(), sizeof(uint8_t) *
  cu-getTotalNumPart());
   +
  memcpy(m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() *
  cu-m_numPartitions], m_bestCU[0]-getLumaIntraDir(), sizeof(uint8_t) *
  cu-getTotalNumPart());
   +
  memcpy(m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() *
  cu-m_numPartitions], m_bestCU[0]-getPartitionSize(), sizeof(char) *
  cu-getTotalNumPart());
   +m_bestCU[0]-m_pic-m_intraData-cuAddr[cu-getAddr()]
  = cu-getAddr();
   +m_bestCU[0]-m_pic-m_intraData-poc[cu-getAddr()]
  = cu-m_pic-m_POC;
   +}
}
if (m_param-bLogCuStats || m_param-rc.bStatWrite)
{
   @@ -533,7 +543,142 @@
#endif
}
  
   -void Analysis::checkIntra(TComDataCU* outBestCU, TComDataCU*
  outTempCU, PartSize partSize, CU *cu)
   +void Analysis::sharedCompressIntraCU(TComDataCU* outBestCU,
  TComDataCU* outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu,
  uint8_t* sharedDepth, char* sharedPartSizes, uint8_t* sharedModes)
   +{
   +Frame* pic = outBestCU-m_pic;
   +
   +// if current depth == shared depth then skip further splitting.
   +bool bSubBranch = true;
   +
   +if (depth == 0)
 
  !depth
 
   +{
   +// offset to next best depth in sharedDepth buffer
   +m_zorder = 0;
   +
   +// index to g_depthInc array to increment m_zorder offset to
  next depth
   +m_ctuToDepthIndex = m_param-maxCUSize / 22;
 
  this math is pretty magical. my guess is there's already a table
  somewhere that does this more cleanly? Does this code work with
  --ctu 16?
 
 
 I have checked and I did not find any such table, but this logic works
 well for CTU sizes 64, 32 and 16; I have verified this.

I'm not doubting that it works, it's just not clear what it's doing. It
probably wants to be 1 << (g_maxCUDepth - 2) or something similar.

-- 
Steve Borho
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] analysis: Intra picture estimation information sharing

2014-09-15 Thread gopu
# HG changeset patch
# User Gopu Govindaswamy g...@multicorewareinc.com
# Date 1410770251 -19800
#  Mon Sep 15 14:07:31 2014 +0530
# Node ID 9db768fa41ad927c66c1dc4ae446953862052ce4
# Parent  184e56afa951815f4e295b4fcce094ee03361a2e
analysis: Intra picture estimation information sharing

When --analysis-mode=save, the encoder runs a full encode and dumps the
best split and mode decisions into x265_analysis.dat (the default file name
if no file name is provided).
When --analysis-mode=load, the encoder reads the best split and mode decisions
from x265_analysis.dat and bypasses the actual split and mode decisions, and
therefore performs a much faster encode.

diff -r 184e56afa951 -r 9db768fa41ad source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp Fri Sep 12 12:02:46 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.cpp Mon Sep 15 14:07:31 2014 +0530
@@ -505,5 +505,19 @@
 0x38, 
 };
 
+/* Contains how much to increment shared depth buffer for different ctu 
sizes to get next best depth.
+ * here,
+ * depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 8x8
+ * if ctu = 64, depth buffer size is 256 combination of depth values 0, 1, 
2, 3.
+ * if ctu = 32, depth buffer size is 64 combination of depth values 1, 2, 
3.
+ * if ctu = 16, depth buffer size is 16 combination of depth values 2, 3 */
+
+const uint32_t g_depthInc[3][4] =
+{
+{ 16,  4,  0, 0},
+{ 64, 16,  4, 1},
+{256, 64, 16, 4}
+};
+
 }
 //! \}
diff -r 184e56afa951 -r 9db768fa41ad source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h   Fri Sep 12 12:02:46 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.h   Mon Sep 15 14:07:31 2014 +0530
@@ -155,6 +155,8 @@
 // Intra tables
 extern const uint8_t g_intraFilterFlags[35];
 
+extern const uint32_t g_depthInc[3][4];
+
 }
 
 #endif  //ifndef X265_TCOMROM_H
diff -r 184e56afa951 -r 9db768fa41ad source/encoder/analysis.cpp
--- a/source/encoder/analysis.cpp   Fri Sep 12 12:02:46 2014 +0530
+++ b/source/encoder/analysis.cpp   Mon Sep 15 14:07:31 2014 +0530
@@ -311,14 +311,24 @@
 uint32_t numPartition = cu-getTotalNumPart();
 if (m_bestCU[0]-m_slice-m_sliceType == I_SLICE)
 {
-compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, 
cu-m_CULocalData);
-if (m_param-analysisMode == 1)
+if (m_param-analysisMode == 2)
 {
-memcpy(m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getDepth(), sizeof(uint8_t) * 
cu-getTotalNumPart());
-memcpy(m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getLumaIntraDir(), sizeof(uint8_t) * 
cu-getTotalNumPart());
-memcpy(m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getPartitionSize(), sizeof(char) * 
cu-getTotalNumPart());
-m_bestCU[0]-m_pic-m_intraData-cuAddr[cu-getAddr()] = 
cu-getAddr();
-m_bestCU[0]-m_pic-m_intraData-poc[cu-getAddr()]= 
cu-m_pic-m_POC;
+sharedCompressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, 
cu-m_CULocalData, 
+m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() * 
cu-m_numPartitions],
+m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() * 
cu-m_numPartitions],
+m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() * 
cu-m_numPartitions]);
+}
+else
+{
+compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, 
cu-m_CULocalData);
+if (m_param-analysisMode == 1)
+{
+memcpy(m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getDepth(), sizeof(uint8_t) * 
cu-getTotalNumPart());
+memcpy(m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getLumaIntraDir(), sizeof(uint8_t) * 
cu-getTotalNumPart());
+
memcpy(m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() * 
cu-m_numPartitions], m_bestCU[0]-getPartitionSize(), sizeof(char) * 
cu-getTotalNumPart());
+m_bestCU[0]-m_pic-m_intraData-cuAddr[cu-getAddr()] = 
cu-getAddr();
+m_bestCU[0]-m_pic-m_intraData-poc[cu-getAddr()]= 
cu-m_pic-m_POC;
+}
 }
 if (m_param-bLogCuStats || m_param-rc.bStatWrite)
 {
@@ -533,7 +543,142 @@
 #endif
 }
 
-void Analysis::checkIntra(TComDataCU* outBestCU, TComDataCU* outTempCU, 
PartSize partSize, CU *cu)
+void Analysis::sharedCompressIntraCU(TComDataCU* outBestCU, TComDataCU* 
outTempCU, uint32_t depth, TComDataCU* cuPicsym, CU *cu, uint8_t* sharedDepth, 
char* sharedPartSizes, uint8_t* sharedModes)
+{
+Frame* pic = outBestCU-m_pic;
+
+// if current depth == shared depth then skip further splitting.
+bool bSubBranch = true;
+
+if (depth == 0)
+{
+// offset to next best depth in sharedDepth buffer
+m_zorder = 0;
+
+// index to g_depthInc array to 

Re: [x265] [PATCH] analysis: Intra picture estimation information sharing

2014-09-15 Thread Steve Borho
On 09/15, g...@multicorewareinc.com wrote:
 # HG changeset patch
 # User Gopu Govindaswamy g...@multicorewareinc.com
 # Date 1410770251 -19800
 #  Mon Sep 15 14:07:31 2014 +0530
 # Node ID 9db768fa41ad927c66c1dc4ae446953862052ce4
 # Parent  184e56afa951815f4e295b4fcce094ee03361a2e
 analysis: Intra picture estimation information sharing
 
 When --analysis-mode=save, the encoder runs a full encode and dumps the
 best split and mode decisions into x265_analysis.dat (the default file name
 if no file name is provided).
 When --analysis-mode=load, the encoder reads the best split and mode
 decisions
 from x265_analysis.dat and bypasses the actual split and mode decisions, and
 therefore performs a much faster encode.
 
 diff -r 184e56afa951 -r 9db768fa41ad source/Lib/TLibCommon/TComRom.cpp
 --- a/source/Lib/TLibCommon/TComRom.cpp   Fri Sep 12 12:02:46 2014 +0530
 +++ b/source/Lib/TLibCommon/TComRom.cpp   Mon Sep 15 14:07:31 2014 +0530
 @@ -505,5 +505,19 @@
  0x38, 
  };
  
 +/* Contains how much to increment shared depth buffer for different ctu 
 sizes to get next best depth.
 + * here,
 + * depth 0 = 64x64, depth 1 = 32x32, depth 2 = 16x16 and depth 3 = 8x8
 + * if ctu = 64, depth buffer size is 256 combination of depth values 0, 
 1, 2, 3.
 + * if ctu = 32, depth buffer size is 64 combination of depth values 1, 
 2, 3.
 + * if ctu = 16, depth buffer size is 16 combination of depth values 2, 3 
 */

the comment should be whitespace-aligned with the array, and lines 2 & 3
should be combined

 +const uint32_t g_depthInc[3][4] =
 +{
 +{ 16,  4,  0, 0},
 +{ 64, 16,  4, 1},
 +{256, 64, 16, 4}
 +};
 +
  }
  //! \}
 diff -r 184e56afa951 -r 9db768fa41ad source/Lib/TLibCommon/TComRom.h
 --- a/source/Lib/TLibCommon/TComRom.h Fri Sep 12 12:02:46 2014 +0530
 +++ b/source/Lib/TLibCommon/TComRom.h Mon Sep 15 14:07:31 2014 +0530
 @@ -155,6 +155,8 @@
  // Intra tables
  extern const uint8_t g_intraFilterFlags[35];
  
 +extern const uint32_t g_depthInc[3][4];
 +
  }
  
  #endif  //ifndef X265_TCOMROM_H
 diff -r 184e56afa951 -r 9db768fa41ad source/encoder/analysis.cpp
 --- a/source/encoder/analysis.cpp Fri Sep 12 12:02:46 2014 +0530
 +++ b/source/encoder/analysis.cpp Mon Sep 15 14:07:31 2014 +0530
 @@ -311,14 +311,24 @@
  uint32_t numPartition = cu-getTotalNumPart();
  if (m_bestCU[0]-m_slice-m_sliceType == I_SLICE)
  {
 -compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, 
 cu-m_CULocalData);
 -if (m_param-analysisMode == 1)
 +if (m_param-analysisMode == 2)

our code should always use the X265_ANALYSIS_LOAD|SAVE macros,
except when checking != 0.

  {
 -memcpy(m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() * 
 cu-m_numPartitions], m_bestCU[0]-getDepth(), sizeof(uint8_t) * 
 cu-getTotalNumPart());
 -memcpy(m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() * 
 cu-m_numPartitions], m_bestCU[0]-getLumaIntraDir(), sizeof(uint8_t) * 
 cu-getTotalNumPart());
 -memcpy(m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() 
 * cu-m_numPartitions], m_bestCU[0]-getPartitionSize(), sizeof(char) * 
 cu-getTotalNumPart());
 -m_bestCU[0]-m_pic-m_intraData-cuAddr[cu-getAddr()] = 
 cu-getAddr();
 -m_bestCU[0]-m_pic-m_intraData-poc[cu-getAddr()]= 
 cu-m_pic-m_POC;
 +sharedCompressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, 
 cu-m_CULocalData, 
 +m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() * 
 cu-m_numPartitions],
 +m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() * 
 cu-m_numPartitions],
 +m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() * 
 cu-m_numPartitions]);

Pointer checking needs to be done at some point, probably at the frame
level. If the user doesn't allocate a buffer, we shouldn't crash.

We should probably also be setting the analysis pointers to NULL in the
input picture structure prior to returning from x265_encoder_encode() so
they do not accidentally re-use the same buffers for more than one
picture.  In short, we need to be a lot more defensive about API abuses.

 +}
 +else
 +{
 +compressIntraCU(m_bestCU[0], m_tempCU[0], false, cu, 
 cu-m_CULocalData);
 +if (m_param-analysisMode == 1)
 +{
 +memcpy(m_bestCU[0]-m_pic-m_intraData-depth[cu-getAddr() 
 * cu-m_numPartitions], m_bestCU[0]-getDepth(), sizeof(uint8_t) * 
 cu-getTotalNumPart());
 +memcpy(m_bestCU[0]-m_pic-m_intraData-modes[cu-getAddr() 
 * cu-m_numPartitions], m_bestCU[0]-getLumaIntraDir(), sizeof(uint8_t) * 
 cu-getTotalNumPart());
 +
 memcpy(m_bestCU[0]-m_pic-m_intraData-partSizes[cu-getAddr() * 
 cu-m_numPartitions], m_bestCU[0]-getPartitionSize(), sizeof(char) * 
 cu-getTotalNumPart());
 +m_bestCU[0]-m_pic-m_intraData-cuAddr[cu-getAddr()] = 
 cu-getAddr();
 +m_bestCU[0]-m_pic-m_intraData-poc[cu-getAddr()]=