[x265] [PATCH] asm-16bpp: code for addAvg luma and chroma all sizes

2014-02-19 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1392807092 -19800
#  Wed Feb 19 16:21:32 2014 +0530
# Node ID cede20cde62ba0a96ac181bcf78a508097de0e7c
# Parent  6150985c3d535f0ea7a1dc0b8f3c69e65e30d25b
asm-16bpp: code for addAvg luma and chroma all sizes

diff -r 6150985c3d53 -r cede20cde62b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp  Wed Feb 19 16:21:32 2014 +0530
@@ -679,10 +679,13 @@
 p.chroma[X265_CSP_I420].addAvg[CHROMA_ ## W ## x ## H] = x265_addAvg_ ## W 
## x ## H ## cpu;
 
 #define CHROMA_ADDAVG(cpu) \
+SETUP_CHROMA_ADDAVG_FUNC_DEF(2,  4,  cpu); \
+SETUP_CHROMA_ADDAVG_FUNC_DEF(2,  8,  cpu); \
 SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  2,  cpu); \
 SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  4,  cpu); \
 SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  8,  cpu); \
 SETUP_CHROMA_ADDAVG_FUNC_DEF(4,  16, cpu); \
+SETUP_CHROMA_ADDAVG_FUNC_DEF(6,  8,  cpu); \
 SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  2,  cpu); \
 SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  4,  cpu); \
 SETUP_CHROMA_ADDAVG_FUNC_DEF(8,  6,  cpu); \
@@ -831,6 +834,9 @@
 }
 if (cpuMask  X265_CPU_SSE4)
 {
+LUMA_ADDAVG(_sse4);
+CHROMA_ADDAVG(_sse4);
+
 p.dct[DCT_8x8] = x265_dct8_sse4;
 p.quant = x265_quant_sse4;
 p.dequant_normal = x265_dequant_normal_sse4;
@@ -1330,10 +1336,6 @@
 SETUP_INTRA_ANG32(33, 33, sse4);
 
 p.dct[DCT_8x8] = x265_dct8_sse4;
-
-p.chroma[X265_CSP_I420].addAvg[CHROMA_2x4]  = x265_addAvg_2x4_sse4;
-p.chroma[X265_CSP_I420].addAvg[CHROMA_2x8]  = x265_addAvg_2x8_sse4;
-p.chroma[X265_CSP_I420].addAvg[CHROMA_6x8]  = x265_addAvg_6x8_sse4;
 }
 if (cpuMask  X265_CPU_AVX)
 {
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/const-a.asm Wed Feb 19 16:21:32 2014 +0530
@@ -36,8 +36,10 @@
 const pw_128,  times 16 dw 128
 const pw_256,  times 16 dw 256
 const pw_512,  times 16 dw 512
+const pw_1023, times 8  dw 1023
 const pw_1024, times 16 dw 1024
 const pw_4096, times 16 dw 4096
+const pw_16400,times 8  dw 16400
 const pw_00ff, times 16 dw 0x00ff
 const pw_pixel_max,times 16 dw ((1  BIT_DEPTH)-1)
 const deinterleave_shufd, dd 0,4,1,5,2,6,3,7
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Wed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/intrapred16.asm Wed Feb 19 16:21:32 2014 +0530
@@ -45,7 +45,6 @@
 const c_mode32_10_0,db  0,  1,  0,  1,  0,  1,  0,  1,  0,  1,  0,  1, 
 0,  1,  0,  1
 
 const pw_unpackwdq, times 8 db 0,1
-const pw_1023,  times 8 dw 1023
 const pw_ang8_12,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 13, 0, 1
 const pw_ang8_13,   db 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 8, 9, 0, 1
 const pw_ang8_14,   db 0, 0, 0, 0, 0, 0, 0, 0, 14, 15, 10, 11, 4, 5, 0, 1
@@ -58,6 +57,7 @@
 
 cextern pw_1
 cextern pw_8
+cextern pw_1023
 cextern pd_16
 cextern pd_32
 cextern pw_4096
diff -r 6150985c3d53 -r cede20cde62b source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asmWed Feb 19 12:21:13 2014 +0530
+++ b/source/common/x86/mc-a.asmWed Feb 19 16:21:32 2014 +0530
@@ -52,6 +52,9 @@
 cextern pw_128
 cextern pw_256
 cextern pw_512
+cextern pw_1023
+cextern pw_1024
+cextern pw_16400
 cextern pw_00ff
 cextern pw_pixel_max
 cextern sw_64
@@ -65,6 +68,873 @@
 ; r2 = pDst, r3 = iStride0
 ; r4 = iStride1, r5 = iDstStride
 
+%if HIGH_BIT_DEPTH
+INIT_XMM sse4
+cglobal addAvg_2x4, 6,7,8, pSrc0, pSrc1, pDst, iStride0, iStride1, iDstStride
+mova  m7,  [pw_16400]
+mova  m0,  [pw_1023]
+add   r3,  r3
+add   r4,  r4
+add   r5,  r5
+
+movd  m1,  [r0]
+movd  m2,  [r0 + r3]
+movd  m3,  [r1]
+movd  m4,  [r1 + r4]
+
+punpckldq m1,  m2
+punpckldq m3,  m4
+
+lea   r0,  [r0 + 2 * r3]
+lea   r1,  [r1 + 2 * r4]
+
+movd  m2,  [r0]
+movd  m4,  [r0 + r3]
+movd  m5,  [r1]
+movd  m6,  [r1 + r4]
+
+punpckldq m2,  m4
+punpckldq m5,  m6
+punpcklqdqm1,  m2
+punpcklqdqm3,  m5
+
+paddw m1,  m3
+paddw m1,  m7
+psraw m1,  5
+pxor  m6,  m6
+pmaxswm1,  m6
+pminswm1,  m0
+
+movd  [r2],m1
+pextrd[r2 + r5],   m1, 1
+lea   r2,  [r2 + 2 * r5]
+pextrd[r2],m1, 2
+pextrd[r2 + r5],   m1, 3
+
+

[x265] [PATCH] fix for 420 binary mismatch for --preset=slower option

2014-02-19 Thread ashok
# HG changeset patch
# User as...@multicorewareinc.com
# Date 1392807860 -19800
#  Wed Feb 19 16:34:20 2014 +0530
# Node ID f0e4f6aa075587f715a7cd48ef63f97d56caa21a
# Parent  8571d160aedb00e07a3f47016f04d8d9aeaa5856
fix for 420 binary mismatch for --preset=slower option

diff -r 8571d160aedb -r f0e4f6aa0755 source/Lib/TLibCommon/TComDataCU.cpp
--- a/source/Lib/TLibCommon/TComDataCU.cpp  Tue Feb 18 01:43:42 2014 -0600
+++ b/source/Lib/TLibCommon/TComDataCU.cpp  Wed Feb 19 16:34:20 2014 +0530
@@ -2852,7 +2852,7 @@
+ (partWidth / m_pic-getMinCUWidth()) 
/ 2];
 }
 
-uint32_t TComDataCU::getCoefScanIdx(uint32_t absPartIdx, uint32_t width, bool 
bIsLuma, bool bIsIntra)
+uint32_t TComDataCU::getCoefScanIdx(uint32_t absPartIdx, uint32_t width, 
uint32_t height, bool bIsLuma, bool bIsIntra)
 {
 uint32_t scanIdx;
 uint32_t dirMode;
@@ -2863,7 +2863,6 @@
 }
 //check that MDCS can be used for this TU
 
-uint32_t height = width;
 if (bIsLuma)
 {
 const uint32_t maximumWidth  = MDCS_MAXIMUM_WIDTH;
diff -r 8571d160aedb -r f0e4f6aa0755 source/Lib/TLibCommon/TComDataCU.h
--- a/source/Lib/TLibCommon/TComDataCU.hTue Feb 18 01:43:42 2014 -0600
+++ b/source/Lib/TLibCommon/TComDataCU.hWed Feb 19 16:34:20 2014 +0530
@@ -473,7 +473,7 @@
 
 uint32_t getTotalNumPart()   { return m_numPartitions; }
 
-uint32_t  getCoefScanIdx(uint32_t absPartIdx, uint32_t width, bool 
bIsLuma, bool bIsIntra);
+uint32_t  getCoefScanIdx(uint32_t absPartIdx, uint32_t width, uint32_t 
height, bool bIsLuma, bool bIsIntra);
 
 // 
---
 // member functions to support multiple color space formats
diff -r 8571d160aedb -r f0e4f6aa0755 source/Lib/TLibCommon/TComTrQuant.cpp
--- a/source/Lib/TLibCommon/TComTrQuant.cpp Tue Feb 18 01:43:42 2014 -0600
+++ b/source/Lib/TLibCommon/TComTrQuant.cpp Wed Feb 19 16:34:20 2014 +0530
@@ -502,7 +502,7 @@
 const uint32_t log2BlockWidth  = g_convertToBit[width]  + 
2;
 const uint32_t log2BlockHeight = g_convertToBit[height] + 
2;
 
-result.scanType = COEFF_SCAN_TYPE(cu-getCoefScanIdx(absPartIdx, width, 
ttype == TEXT_LUMA, cu-isIntra(absPartIdx)));
+result.scanType = COEFF_SCAN_TYPE(cu-getCoefScanIdx(absPartIdx, width, 
height, ttype == TEXT_LUMA, cu-isIntra(absPartIdx)));
 
 //set the group layout
 result.widthInGroups  = width   MLS_CG_LOG2_WIDTH;
@@ -516,19 +516,20 @@
 result.scanCG = 
g_scanOrder[SCAN_UNGROUPED][result.scanType][log2WidthInGroups][log2HeightInGroups];
 
 //set the significance map context selection parameters
+TextType ctype = ttype == TEXT_LUMA ? TEXT_LUMA : TEXT_CHROMA;
 if ((width == 4)  (height == 4))
 {
-result.firstSignificanceMapContext = 
significanceMapContextSetStart[ttype][CONTEXT_TYPE_4x4];
+result.firstSignificanceMapContext = 
significanceMapContextSetStart[ctype][CONTEXT_TYPE_4x4];
 }
 else if ((width == 8)  (height == 8))
 {
-result.firstSignificanceMapContext = 
significanceMapContextSetStart[ttype][CONTEXT_TYPE_8x8];
+result.firstSignificanceMapContext = 
significanceMapContextSetStart[ctype][CONTEXT_TYPE_8x8];
 if (result.scanType != SCAN_DIAG)
-result.firstSignificanceMapContext += 
nonDiagonalScan8x8ContextOffset[ttype];
+result.firstSignificanceMapContext += 
nonDiagonalScan8x8ContextOffset[ctype];
 }
 else
 {
-result.firstSignificanceMapContext = 
significanceMapContextSetStart[ttype][CONTEXT_TYPE_NxN];
+result.firstSignificanceMapContext = 
significanceMapContextSetStart[ctype][CONTEXT_TYPE_NxN];
 }
 }
 
@@ -1124,8 +1125,8 @@
 }
 
 const bool notFirstGroup = ((posX  MLS_CG_LOG2_WIDTH) + (posY  
MLS_CG_LOG2_HEIGHT))  0;
-
-offset = (notFirstGroup ? 
notFirstGroupNeighbourhoodContextOffset[ttype] : 0) + cnt;
+TextType ctype = ttype == TEXT_LUMA ? TEXT_LUMA : TEXT_CHROMA;
+offset = (notFirstGroup ? 
notFirstGroupNeighbourhoodContextOffset[ctype] : 0) + cnt;
 }
 return codingParameters.firstSignificanceMapContext + offset;
 }
diff -r 8571d160aedb -r f0e4f6aa0755 source/Lib/TLibEncoder/TEncSbac.cpp
--- a/source/Lib/TLibEncoder/TEncSbac.cpp   Tue Feb 18 01:43:42 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncSbac.cpp   Wed Feb 19 16:34:20 2014 +0530
@@ -942,7 +942,7 @@
 void TEncSbac::xCodeScalingList(TComScalingList* scalingList, uint32_t sizeId, 
uint32_t listId)
 {
 int coefNum = X265_MIN(MAX_MATRIX_COEF_NUM, 
(int)g_scalingListSize[sizeId]);
-const uint32_t* scan  = (sizeId == 0) ? g_sigLastScan[SCAN_DIAG][1] :  
g_sigLastScanCG32x32;
+const uint32_t* scan  = g_scanOrder[SCAN_UNGROUPED][SCAN_DIAG][sizeId==0 ? 
2 : 3][sizeId==0 ? 2 : 3];
 int nextCoef = 

Re: [x265] APPCRASH in x265 0.7+207 while encoding in preset 'slow' or slower...

2014-02-19 Thread Mario *LigH* Rohkrämer

Am 18.02.2014, 14:03 Uhr, schrieb Mario *LigH* Rohkrämer cont...@ligh.de:

I ran a loop of encodes through all presets (all default options) with  
Sintel Trailer in 640x272 as Y4M source (YUV 4:2:0).


During all presets {slow..placebo}, x265 0.7+207-1be6b8c8b9ed [GCC  
4.8.2, Win64] crashed at different frames, usually around 120/1247,  
already at 29/1247 for preset placebo.


All faster presets passed without crash.


Probably fixed by patch 6190 (591ca91f0501)?

x265 0.7+216-591ca91f0501 [Windows][GCC 4.8.2][64 bit] 8bpp does not crash  
anymore in all presets, except placebo (crash during the final statistics  
summary).


But quality in default CRF 28 is now a lot worse, files now even about  
half the size as before, in presets {fast..placebo}.


--preset faster: 544.26 kbps, 20.311 dB SSIM
--preset fast: 56.79 kbps, 13.542 dB SSIM
--preset slow: 51.78 kbps, 13.493 dB SSIM

(Sintel trailer, 640x272, no additional options except logging)

--
__

Fun and success!
Mario *LigH* Rohkrämer
mailto:cont...@ligh.de

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] Fixed ENC_DEC_TRACE warnings and errors

2014-02-19 Thread dtyx265
# HG changeset patch
# User David T Yuen dtyx...@gmail.com
# Date 1392832006 28800
# Node ID 1c78bd13a14f3d5227c4b961664af97f86a8810d
# Parent  591ca91f0501b167627adc1c9542aebc60dc320a
Fixed ENC_DEC_TRACE warnings and errors

diff -r 591ca91f0501 -r 1c78bd13a14f source/Lib/TLibCommon/TComRom.h
--- a/source/Lib/TLibCommon/TComRom.h   Wed Feb 19 16:34:20 2014 +0530
+++ b/source/Lib/TLibCommon/TComRom.h   Wed Feb 19 09:46:46 2014 -0800
@@ -189,7 +189,7 @@
 
 #define DTRACE_CABAC_F(x) if ((g_nSymbolCounter = COUNTER_START  
g_nSymbolCounter = COUNTER_END) || g_bJustDoIt) fprintf(g_hTrace, %f, x);
 #define DTRACE_CABAC_V(x) if ((g_nSymbolCounter = COUNTER_START  
g_nSymbolCounter = COUNTER_END) || g_bJustDoIt) fprintf(g_hTrace, %d, x);
-#define DTRACE_CABAC_VL(x)if ((g_nSymbolCounter = COUNTER_START  
g_nSymbolCounter = COUNTER_END) || g_bJustDoIt) fprintf(g_hTrace, %lld, x);
+#define DTRACE_CABAC_VL(x)if ((g_nSymbolCounter = COUNTER_START  
g_nSymbolCounter = COUNTER_END) || g_bJustDoIt) fprintf(g_hTrace, %ld, x);
 #define DTRACE_CABAC_T(x) if ((g_nSymbolCounter = COUNTER_START  
g_nSymbolCounter = COUNTER_END) || g_bJustDoIt) fprintf(g_hTrace, %s, x);
 #define DTRACE_CABAC_X(x) if ((g_nSymbolCounter = COUNTER_START  
g_nSymbolCounter = COUNTER_END) || g_bJustDoIt) fprintf(g_hTrace, %x, x);
 #define DTRACE_CABAC_R(x, y)  if ((g_nSymbolCounter = COUNTER_START  
g_nSymbolCounter = COUNTER_END) || g_bJustDoIt) fprintf(g_hTrace, x,y);
diff -r 591ca91f0501 -r 1c78bd13a14f 
source/Lib/TLibEncoder/SyntaxElementWriter.cpp
--- a/source/Lib/TLibEncoder/SyntaxElementWriter.cppWed Feb 19 16:34:20 
2014 +0530
+++ b/source/Lib/TLibEncoder/SyntaxElementWriter.cppWed Feb 19 09:46:46 
2014 -0800
@@ -50,7 +50,7 @@
 xWriteCode(value, length);
 if (g_HLSTraceEnable)
 {
-fprintf(g_hTrace, %8lld  , g_nSymbolCounter++);
+fprintf(g_hTrace, %8ld  , g_nSymbolCounter++);
 if (length  10)
 {
 fprintf(g_hTrace, %-50s u(%d)  : %d\n, symbolName, length, 
value);
@@ -67,7 +67,7 @@
 xWriteUvlc(value);
 if (g_HLSTraceEnable)
 {
-fprintf(g_hTrace, %8lld  , g_nSymbolCounter++);
+fprintf(g_hTrace, %8ld  , g_nSymbolCounter++);
 fprintf(g_hTrace, %-50s ue(v) : %d\n, symbolName, value);
 }
 }
@@ -77,7 +77,7 @@
 xWriteSvlc(value);
 if (g_HLSTraceEnable)
 {
-fprintf(g_hTrace, %8lld  , g_nSymbolCounter++);
+fprintf(g_hTrace, %8ld  , g_nSymbolCounter++);
 fprintf(g_hTrace, %-50s se(v) : %d\n, symbolName, value);
 }
 }
@@ -87,7 +87,7 @@
 xWriteFlag(value);
 if (g_HLSTraceEnable)
 {
-fprintf(g_hTrace, %8lld  , g_nSymbolCounter++);
+fprintf(g_hTrace, %8ld  , g_nSymbolCounter++);
 fprintf(g_hTrace, %-50s u(1)  : %d\n, symbolName, value);
 }
 }
diff -r 591ca91f0501 -r 1c78bd13a14f 
source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp
--- a/source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp  Wed Feb 19 16:34:20 
2014 +0530
+++ b/source/Lib/TLibEncoder/TEncBinCoderCABAC.cpp  Wed Feb 19 09:46:46 
2014 -0800
@@ -172,7 +172,7 @@
 {
 DTRACE_CABAC_VL(g_nSymbolCounter++)
 DTRACE_CABAC_T(\tstate=)
-DTRACE_CABAC_V((ctxModel.getState()  1) + ctxModel.getMps())
+DTRACE_CABAC_V((ctxModel.m_state  1) + sbacGetMps(ctxModel.m_state))
 DTRACE_CABAC_T(\tsymbol=)
 DTRACE_CABAC_V(binValue)
 DTRACE_CABAC_T(\n)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] primitives: add count_nonzero

2014-02-19 Thread chen
+INIT_XMM sse2
+cglobal count_nonzero, 2,3,4
+pxorm0, m0
+pxorm1, m1
+mov r2d, r1d
+shr r1d, 3
+
+.loop

+movam2, [r0]
+movam3, [r0 + 16]

+add r0, 32

+packssdwm2, m3,
just count, no need it
 
+pcmpeqw m2, m0
+psrlw   m2, 15
pcmp generates a mask, which is 0xFFFF, so there is no need to shift right
 
+packsswbm2, m2
+psadbw  m2, m0
psadbw is slow; why do you need the exact number in the inner loop?
Of course, abs(-1) = abs(1).

+paddd   m1, m2
+dec r1d
+jnz.loop
+
+movdr1d, m1
+sub r2d, r1d
+mov eax, r2d
+
+RET
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH] asm : asm routine for chroma_p2s for 4:4:4 color space format

2014-02-19 Thread chen
At 2014-02-17 20:44:29,naba...@multicorewareinc.com wrote:
# HG changeset patch
# User Nabajit Deka
# Date 1392641037 -19800
#  Mon Feb 17 18:13:57 2014 +0530
# Node ID f5275ca8f2985bb0daf563738e6071b81967c2cd
# Parent  ce96cdb390fe26aee6effa731e51303c1d9056b0
asm : asm routine for chroma_p2s for 4:4:4 color space format

+INIT_XMM ssse3
+cglobal chroma_p2s_i444, 3, 7, 4
+
+; load width and height
+mov r3d, r3m
+mov r4d, r4m
+
+; load constant
+movam2, [tab_c_128]
+movam3, [tab_c_64_n64]
+
+.loopH:
+
+xor r5d, r5d
+.loopW:
+lea r6, [r0 + r5]
+
+movhm0, [r6]
+punpcklbw   m0, m2
+pmaddubsw   m0, m3
+
+movhm1, [r6 + r1]
+punpcklbw   m1, m2
+pmaddubsw   m1, m3
+
+add r5d, 8
+cmp r5d, r3d
+lea r6, [r2 + r5 * 2]
+jg  .width4
+movu[r6 + FENC_STRIDE * 0 - 16], m0
+movu[r6 + FENC_STRIDE * 2 - 16], m1
+je  .nextH
+jmp .loopW
+
+.width4:
+testr3d, 4
+jz  .width2
+testr3d, 2
+movh[r6 + FENC_STRIDE * 0 - 16], m0
+movh[r6 + FENC_STRIDE * 2 - 16], m1
+lea r6, [r6 + 8]
+pshufd  m0, m0, 2
+pshufd  m1, m1, 2
+jz  .nextH
+
+.width2:
+movd[r6 + FENC_STRIDE * 0 - 16], m0
+movd[r6 + FENC_STRIDE * 2 - 16], m1
I think YUV444 does not need the width2 path; please check and confirm.
 
 ___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] APPCRASH in x265 0.7+207 while encoding in preset 'slow' or slower...

2014-02-19 Thread Steve Borho
On Wed, Feb 19, 2014 at 8:03 AM, Mario *LigH* Rohkrämer cont...@ligh.dewrote:

 Am 18.02.2014, 14:03 Uhr, schrieb Mario *LigH* Rohkrämer cont...@ligh.de
 :


  I ran a loop of encodes through all presets (all default options) with
 Sintel Trailer in 640x272 as Y4M source (YUV 4:2:0).

 During all presets {slow..placebo}, x265 0.7+207-1be6b8c8b9ed [GCC 4.8.2,
 Win64] crashed at different frames, usually around 120/1247, already at
 29/1247 for preset placebo.

 All faster presets passed without crash.


 Probably fixed by patch 6190 (591ca91f0501)?

 x265 0.7+216-591ca91f0501 [Windows][GCC 4.8.2][64 bit] 8bpp does not crash
 anymore in all presets, except placebo (crash during the final statistics
 summary).


verified; if you encode about 100 frames at placebo it reports heap
corruption at exit.  Verified with a debug build in MSVC as well.  I'll see
if valgrind can catch the root cause.


 But quality in default CRF 28 is now a lot worse, files now even about
 half the size as before, in presets {fast..placebo}.

 --preset faster: 544.26 kbps, 20.311 dB SSIM
 --preset fast: 56.79 kbps, 13.542 dB SSIM
 --preset slow: 51.78 kbps, 13.493 dB SSIM

 (Sintel trailer, 640x272, no additional options except logging)


will look into this next, thanks for reporting.

We currently still have one known hash-mismatch bug, reproducible with the
sintel 480 clip and preset slower.  There are a number of pixels on frame 720
that are off-by-one.  Seems to be a rounding issue somewhere; we're
investigating.

--
Steve Borho
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH] fix for 420 binary mismatch for --preset=slower option

2014-02-19 Thread Steve Borho
On Wed, Feb 19, 2014 at 5:05 AM, as...@multicorewareinc.com wrote:

 # HG changeset patch
 # User as...@multicorewareinc.com


You need to configure a full name and email address as your Mercurial
commit username

# Date 1392807860 -19800
 #  Wed Feb 19 16:34:20 2014 +0530
 # Node ID f0e4f6aa075587f715a7cd48ef63f97d56caa21a
 # Parent  8571d160aedb00e07a3f47016f04d8d9aeaa5856
 fix for 420 binary mismatch for --preset=slower option

 diff -r 8571d160aedb -r f0e4f6aa0755 source/Lib/TLibCommon/TComDataCU.cpp
 --- a/source/Lib/TLibCommon/TComDataCU.cpp  Tue Feb 18 01:43:42 2014
 -0600
 +++ b/source/Lib/TLibCommon/TComDataCU.cpp  Wed Feb 19 16:34:20 2014
 +0530
 @@ -2852,7 +2852,7 @@
 + (partWidth /
 m_pic-getMinCUWidth()) / 2];
  }

 -uint32_t TComDataCU::getCoefScanIdx(uint32_t absPartIdx, uint32_t width,
 bool bIsLuma, bool bIsIntra)
 +uint32_t TComDataCU::getCoefScanIdx(uint32_t absPartIdx, uint32_t width,
 uint32_t height, bool bIsLuma, bool bIsIntra)
  {
  uint32_t scanIdx;
  uint32_t dirMode;
 @@ -2863,7 +2863,6 @@
  }
  //check that MDCS can be used for this TU

 -uint32_t height = width;
  if (bIsLuma)
  {
  const uint32_t maximumWidth  = MDCS_MAXIMUM_WIDTH;
 diff -r 8571d160aedb -r f0e4f6aa0755 source/Lib/TLibCommon/TComDataCU.h
 --- a/source/Lib/TLibCommon/TComDataCU.hTue Feb 18 01:43:42 2014
 -0600
 +++ b/source/Lib/TLibCommon/TComDataCU.hWed Feb 19 16:34:20 2014
 +0530
 @@ -473,7 +473,7 @@

  uint32_t getTotalNumPart()   { return
 m_numPartitions; }

 -uint32_t  getCoefScanIdx(uint32_t absPartIdx, uint32_t width,
 bool bIsLuma, bool bIsIntra);
 +uint32_t  getCoefScanIdx(uint32_t absPartIdx, uint32_t width,
 uint32_t height, bool bIsLuma, bool bIsIntra);

  //
 ---
  // member functions to support multiple color space formats
 diff -r 8571d160aedb -r f0e4f6aa0755 source/Lib/TLibCommon/TComTrQuant.cpp
 --- a/source/Lib/TLibCommon/TComTrQuant.cpp Tue Feb 18 01:43:42 2014
 -0600
 +++ b/source/Lib/TLibCommon/TComTrQuant.cpp Wed Feb 19 16:34:20 2014
 +0530
 @@ -502,7 +502,7 @@
  const uint32_t log2BlockWidth  =
 g_convertToBit[width]  + 2;
  const uint32_t log2BlockHeight =
 g_convertToBit[height] + 2;

 -result.scanType = COEFF_SCAN_TYPE(cu-getCoefScanIdx(absPartIdx,
 width, ttype == TEXT_LUMA, cu-isIntra(absPartIdx)));
 +result.scanType = COEFF_SCAN_TYPE(cu-getCoefScanIdx(absPartIdx,
 width, height, ttype == TEXT_LUMA, cu-isIntra(absPartIdx)));

  //set the group layout
  result.widthInGroups  = width   MLS_CG_LOG2_WIDTH;
 @@ -516,19 +516,20 @@
  result.scanCG =
 g_scanOrder[SCAN_UNGROUPED][result.scanType][log2WidthInGroups][log2HeightInGroups];

  //set the significance map context selection parameters
 +TextType ctype = ttype == TEXT_LUMA ? TEXT_LUMA : TEXT_CHROMA;
  if ((width == 4)  (height == 4))
  {
 -result.firstSignificanceMapContext =
 significanceMapContextSetStart[ttype][CONTEXT_TYPE_4x4];
 +result.firstSignificanceMapContext =
 significanceMapContextSetStart[ctype][CONTEXT_TYPE_4x4];
  }
  else if ((width == 8)  (height == 8))
  {
 -result.firstSignificanceMapContext =
 significanceMapContextSetStart[ttype][CONTEXT_TYPE_8x8];
 +result.firstSignificanceMapContext =
 significanceMapContextSetStart[ctype][CONTEXT_TYPE_8x8];
  if (result.scanType != SCAN_DIAG)
 -result.firstSignificanceMapContext +=
 nonDiagonalScan8x8ContextOffset[ttype];
 +result.firstSignificanceMapContext +=
 nonDiagonalScan8x8ContextOffset[ctype];
  }
  else
  {
 -result.firstSignificanceMapContext =
 significanceMapContextSetStart[ttype][CONTEXT_TYPE_NxN];
 +result.firstSignificanceMapContext =
 significanceMapContextSetStart[ctype][CONTEXT_TYPE_NxN];
  }
  }

 @@ -1124,8 +1125,8 @@
  }

  const bool notFirstGroup = ((posX  MLS_CG_LOG2_WIDTH) + (posY
  MLS_CG_LOG2_HEIGHT))  0;
 -
 -offset = (notFirstGroup ?
 notFirstGroupNeighbourhoodContextOffset[ttype] : 0) + cnt;
 +TextType ctype = ttype == TEXT_LUMA ? TEXT_LUMA : TEXT_CHROMA;
 +offset = (notFirstGroup ?
 notFirstGroupNeighbourhoodContextOffset[ctype] : 0) + cnt;
  }
  return codingParameters.firstSignificanceMapContext + offset;
  }
 diff -r 8571d160aedb -r f0e4f6aa0755 source/Lib/TLibEncoder/TEncSbac.cpp
 --- a/source/Lib/TLibEncoder/TEncSbac.cpp   Tue Feb 18 01:43:42 2014
 -0600
 +++ b/source/Lib/TLibEncoder/TEncSbac.cpp   Wed Feb 19 16:34:20 2014
 +0530
 @@ -942,7 +942,7 @@
  void TEncSbac::xCodeScalingList(TComScalingList* scalingList, uint32_t
 sizeId, uint32_t listId)
  {
  int coefNum = X265_MIN(MAX_MATRIX_COEF_NUM,
 (int)g_scalingListSize[sizeId]);
 -

Re: [x265] APPCRASH in x265 0.7+207 while encoding in preset 'slow' or slower...

2014-02-19 Thread Steve Borho
On Wed, Feb 19, 2014 at 1:28 PM, Steve Borho st...@borho.org wrote:




 On Wed, Feb 19, 2014 at 8:03 AM, Mario *LigH* Rohkrämer 
 cont...@ligh.dewrote:

 Am 18.02.2014, 14:03 Uhr, schrieb Mario *LigH* Rohkrämer cont...@ligh.de
 :


  I ran a loop of encodes through all presets (all default options) with
 Sintel Trailer in 640x272 as Y4M source (YUV 4:2:0).

 During all presets {slow..placebo}, x265 0.7+207-1be6b8c8b9ed [GCC
 4.8.2, Win64] crashed at different frames, usually around 120/1247, already
 at 29/1247 for preset placebo.

 All faster presets passed without crash.


 Probably fixed by patch 6190 (591ca91f0501)?

 x265 0.7+216-591ca91f0501 [Windows][GCC 4.8.2][64 bit] 8bpp does not
 crash anymore in all presets, except placebo (crash during the final
 statistics summary).


 verified; if you encode about 100 frames at placebo it reports heap
 corruption at exit.  Verified with a debug build in MSVC as well.  I'll see
 if valgrind can catch the root cause.


valgrind finds that transform-skipped chroma blocks are copying too much
data; we're still investigating but I expect this will be fixed by tomorrow.


 But quality in default CRF 28 is now a lot worse, files now even about
 half the size as before, in presets {fast..placebo}.

 --preset faster: 544.26 kbps, 20.311 dB SSIM
 --preset fast: 56.79 kbps, 13.542 dB SSIM
 --preset slow: 51.78 kbps, 13.493 dB SSIM

 (Sintel trailer, 640x272, no additional options except logging)


 will look into this next, thanks for reporting.


My fault, a fix for this was just pushed.

We currently still have one known hash-mismatch bug, reproducible with the
 sintel 480 clip and preset slower.  There are a number of pixels on frame 720
 that are off-by-one.  Seems to be a rounding issue somewhere; we're
 investigating.


Hot on the trail of this one.

-- 
Steve Borho
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [Bug]reconstruction yuv picture diff with HM decoder out

2014-02-19 Thread Deepthi Nandakumar
Hi,

We've checked for hash mismatches on all our hash clips with --rd 2. Can
you share the source that caused a mismatch? That will help us identify the
issue.

Thanks,
Deepthi


On Mon, Feb 17, 2014 at 12:38 PM, z...@rock-chips.com 
z...@rock-chips.comwrote:

  hi, x265 members
 1. [Bug report]
 We found that the x265 (version 0.7+2-4b8901ae94ece1ac) reconstructed YUV data
 differs from the HM decoder output when configured in CQP mode with QP=34 and
 rd set to 0, 1 or 2. I think it's a serious bug!

 with command like this


 --input F:\yuv\Samsung_1080p_25.yuv --input-res 1920x1080 --fps 24 -q 34 -o 
 E:\out1.bin -r E:\rec1.yuv --psnr --rd 0


 --input F:\yuv\Samsung_1080p_25.yuv --input-res 1920x1080 --fps 24 -q 34 -o 
 E:\out1.bin -r E:\rec1.yuv --psnr --rd 1

 --input F:\yuv\Samsung_1080p_25.yuv --input-res 1920x1080 --fps 24 -q 34 -o 
 E:\out1.bin -r E:\rec1.yuv --psnr --rd 2


 2. [Proposal]
  The x265 codec encodes video content using a fixed quantization
 step, thus leading to a variable-bitrate stream which may not be
 suitable for the many multimedia applications where a constant
 bandwidth is required. Therefore, an adaptive quantization step may
 be better.


 I'm looking forward to you
 thks
  z...@rock-chips.com

 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel


___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] reduce addClip

2014-02-19 Thread Satoshi Nakagawa
# HG changeset patch
# User Satoshi Nakagawa nakagawa...@oki.com
# Date 1392872381 -32400
#  Thu Feb 20 13:59:41 2014 +0900
# Node ID 588adfc60b27190e5d595611c3d34c49e381d9ae
# Parent  3389061b75a486e004409ab628c46fed39d03b72
reduce addClip

diff -r 3389061b75a4 -r 588adfc60b27 source/Lib/TLibCommon/TComDataCU.h
--- a/source/Lib/TLibCommon/TComDataCU.hWed Feb 19 17:03:21 2014 -0600
+++ b/source/Lib/TLibCommon/TComDataCU.hThu Feb 20 13:59:41 2014 +0900
@@ -312,7 +312,7 @@
 
 void  setCbf(uint32_t idx, TextType ttype, UChar uh) { 
m_cbf[ttype][idx] = uh; }
 
-UChar getQtRootCbf(uint32_t idx)   { return getCbf(idx, 
TEXT_LUMA, 0) || getCbf(idx, TEXT_CHROMA_U, 0) || getCbf(idx, TEXT_CHROMA_V, 
0); }
+UChar getQtRootCbf(uint32_t idx)   { return getCbf(idx, 
TEXT_LUMA) || getCbf(idx, TEXT_CHROMA_U) || getCbf(idx, TEXT_CHROMA_V); }
 
 void  setCbfSubParts(uint32_t cbfY, uint32_t cbfU, uint32_t cbfV, 
uint32_t absPartIdx, uint32_t depth);
 void  setCbfSubParts(uint32_t cbf, TextType ttype, uint32_t 
absPartIdx, uint32_t depth);
diff -r 3389061b75a4 -r 588adfc60b27 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Feb 19 17:03:21 2014 -0600
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Feb 20 13:59:41 2014 +0900
@@ -3210,7 +3210,14 @@
 
 assert(bcost != MAX_INT64);
 
-outReconYuv-addClip(predYuv, outBestResiYuv, 0, width);
+if (cu-getQtRootCbf(0))
+{
+outReconYuv-addClip(predYuv, outBestResiYuv, 0, width);
+}
+else
+{
+predYuv-copyToPartYuv(outReconYuv, 0);
+}
 
 // update with clipped distortion and cost (qp estimation loop uses 
unclipped values)
 int part = partitionFromSizes(width, height);
@@ -3246,12 +3253,19 @@
 {
 residualTransformQuantInter(cu, 0, 0, resiYuv, cu-getDepth(0), true);
 uint32_t width  = cu-getWidth(0);
-reconYuv-addClip(predYuv, resiYuv, 0, width);
-
-if (cu-getMergeFlag(0)  cu-getPartitionSize(0) == SIZE_2Nx2N  
cu-getQtRootCbf(0) == 0)
+if (cu-getQtRootCbf(0))
 {
-cu-setSkipFlagSubParts(true, 0, cu-getDepth(0));
+reconYuv-addClip(predYuv, resiYuv, 0, width);
 }
+else
+{
+predYuv-copyToPartYuv(reconYuv, 0);
+if (cu-getMergeFlag(0)  cu-getPartitionSize(0) == SIZE_2Nx2N)
+{
+cu-setSkipFlagSubParts(true, 0, cu-getDepth(0));
+}
+}
+
 }
 else if (cu-getPredictionMode(0) == MODE_INTRA)
 {
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] tcomrom: scaning order table g_sigLastScan replaced with g_scanOrder

2014-02-19 Thread gopu
# HG changeset patch
# User Gopu Govindaswamy
# Date 1392921339 28800
#  Thu Feb 20 10:35:39 2014 -0800
# Node ID 34886273d14b41d777a9129fc3657aef34d2c986
# Parent  3389061b75a486e004409ab628c46fed39d03b72
tcomrom: scaning order table g_sigLastScan replaced with g_scanOrder

Scaning order table initialization moved into initRom() and same scaning order 
table
can be used for both 444 and 420

diff -r 3389061b75a4 -r 34886273d14b source/Lib/TLibCommon/CommonDef.h
--- a/source/Lib/TLibCommon/CommonDef.h Wed Feb 19 17:03:21 2014 -0600
+++ b/source/Lib/TLibCommon/CommonDef.h Thu Feb 20 10:35:39 2014 -0800
@@ -108,7 +108,7 @@
 #define NUM_CHROMA_MODE 5 // total number of chroma modes
 #define DM_CHROMA_IDX   36 // chroma mode index for derived from 
luma intra mode
 
-#define FULL_NBIT 0 /// When enabled, compute costs using full sample 
bitdepth.  When disabled, compute costs as if it is 8-bit source video.
+#define FULL_NBIT 1 /// When enabled, compute costs using full sample 
bitdepth.  When disabled, compute costs as if it is 8-bit source video.
 #if FULL_NBIT || !HIGH_BIT_DEPTH
 # define DISTORTION_PRECISION_ADJUSTMENT(x) 0
 #else
diff -r 3389061b75a4 -r 34886273d14b source/Lib/TLibCommon/TComRom.cpp
--- a/source/Lib/TLibCommon/TComRom.cpp Wed Feb 19 17:03:21 2014 -0600
+++ b/source/Lib/TLibCommon/TComRom.cpp Thu Feb 20 10:35:39 2014 -0800
@@ -458,9 +458,6 @@
 // Scanning order  context model mapping
 // 

 
-// scanning order table
-uint32_t* g_sigLastScan[3][MAX_CU_DEPTH];
-
 const uint32_t g_sigLastScan8x8[3][4] =
 {
 { 0, 2, 1, 3 },
@@ -487,131 +484,6 @@
 
 const uint32_t g_goRicePrefixLen[5] = { 8, 7, 6, 5, 4 };
 
-void initSigLastScan(uint32_t* buffD, uint32_t* buffH, uint32_t* buffV, int 
width, int height)
-{
-const uint32_t  numScanPos  = uint32_t(width * width);
-uint32_tnextScanPos = 0;
-
-if (width = 4)
-{
-for (uint32_t scanLine = 0; nextScanPos  numScanPos; scanLine++)
-{
-int primDim = int(scanLine);
-int scndDim = 0;
-while (primDim = width)
-{
-scndDim++;
-primDim--;
-}
-
-while (primDim = 0  scndDim  width)
-{
-buffD[nextScanPos] = primDim * width + scndDim;
-nextScanPos++;
-scndDim++;
-primDim--;
-}
-}
-}
-if (width  4)
-{
-uint32_t numBlkSide = width  2;
-uint32_t numBlks= numBlkSide * numBlkSide;
-uint32_t log2Blk= g_convertToBit[numBlkSide] + 1;
-
-for (uint32_t blk = 0; blk  numBlks; blk++)
-{
-nextScanPos   = 0;
-uint32_t initBlkPos = g_sigLastScan[SCAN_DIAG][log2Blk][blk];
-if (width == 32)
-{
-initBlkPos = g_sigLastScanCG32x32[blk];
-}
-uint32_t offsetY= initBlkPos / numBlkSide;
-uint32_t offsetX= initBlkPos - offsetY * numBlkSide;
-uint32_t offsetD= 4 * (offsetX + offsetY * width);
-uint32_t offsetScan = 16 * blk;
-for (uint32_t scanLine = 0; nextScanPos  16; scanLine++)
-{
-int primDim = int(scanLine);
-int scndDim = 0;
-while (primDim = 4)
-{
-scndDim++;
-primDim--;
-}
-
-while (primDim = 0  scndDim  4)
-{
-buffD[nextScanPos + offsetScan] = primDim * width + 
scndDim + offsetD;
-nextScanPos++;
-scndDim++;
-primDim--;
-}
-}
-}
-}
-
-uint32_t cnt = 0;
-if (width  2)
-{
-uint32_t numBlkSide = width  2;
-for (int blkY = 0; blkY  numBlkSide; blkY++)
-{
-for (int blkX = 0; blkX  numBlkSide; blkX++)
-{
-uint32_t offset = blkY * 4 * width + blkX * 4;
-for (int y = 0; y  4; y++)
-{
-for (int x = 0; x  4; x++)
-{
-buffH[cnt] = y * width + x + offset;
-cnt++;
-}
-}
-}
-}
-
-cnt = 0;
-for (int blkX = 0; blkX  numBlkSide; blkX++)
-{
-for (int blkY = 0; blkY  numBlkSide; blkY++)
-{
-uint32_t offset= blkY * 4 * width + blkX * 4;
-for (int x = 0; x  4; x++)
-{
-for (int y = 0; y  4; y++)
-{
-buffV[cnt] = y * width + x + offset;
-cnt++;
-}
-}
-}
-