[x265] [PATCH] cu-lossless: fix inter hash mistake

2014-08-26 Thread deepthi
# HG changeset patch
# User Deepthi Nandakumar deep...@multicorewareinc.com
# Date 1409037983 -19800
#  Tue Aug 26 12:56:23 2014 +0530
# Node ID 6573fd72294e481133f2d1636100d6c8419fb597
# Parent  9387a276897fc3ab11bbbe20d4f0d7831caf3115
cu-lossless: fix inter hash mistake

The CU needs to be re-encoded if lossless is chosen as the best mode.

diff -r 9387a276897f -r 6573fd72294e source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Aug 25 17:07:45 2014 -0500
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Aug 26 12:56:23 2014 +0530
@@ -2325,6 +2325,7 @@
 }
 
 uint64_t bestCost = MAX_INT64;
+uint32_t bestMode = 0;
 
 for (uint32_t modeId = 0; modeId < numModes; modeId++)
 {
@@ -2392,6 +2393,7 @@
 if (cu->getQtRootCbf(0))
 xSetResidualQTData(cu, 0, outBestResiYuv, depth, true);
 
+bestMode = modeId; //0 for lossless
 bestBits = bits;
 bestCost = cost;
 bestCoeffBits = cu->m_coeffBits;
@@ -2401,6 +2403,19 @@
 
 X265_CHECK(bestCost != MAX_INT64, "no best cost\n");
 
+if(bIsTQBypassEnable && !bestMode)
+{
+cu->setCUTransquantBypassSubParts(true, 0, depth);
+m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
+uint64_t cost = 0;
+uint32_t zeroDistortion = 0;
+uint32_t bits = 0;
+uint32_t distortion = 0;
+xEstimateResidualQT(cu, 0, fencYuv, predYuv, outResiYuv, depth, cost, 
bits, distortion, zeroDistortion);
+xSetResidualQTData(cu, 0, NULL, depth, false);
+m_entropyCoder->store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
+}
+
 if (cu->getQtRootCbf(0))
 outReconYuv->addClip(predYuv, outBestResiYuv, log2CUSize);
 else
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH 1 of 3] analysis: fix inter hash mistake with --cu-lossless

2014-08-26 Thread Deepthi Nandakumar
Thanks, Min. This is a solution, but the extra TComDataCU* will affect
performance. I have sent another patch that simply re-encodes the CU if
lossless is chosen as the best mode, so normal analysis is not affected.
Can you review that?

Deepthi


On Tue, Aug 26, 2014 at 3:47 AM, Steve Borho st...@borho.org wrote:

 # HG changeset patch
 # User Min Chen chenm...@163.com
 # Date 1409002891 18000
 #  Mon Aug 25 16:41:31 2014 -0500
 # Node ID 0bf2756898bc78e5660a6b607b2f3cda97834264
 # Parent  5acfb12ec5d17cc700e313fc99248e2408e5967b
 analysis: fix inter hash mistake with --cu-lossless

 diff -r 5acfb12ec5d1 -r 0bf2756898bc source/Lib/TLibEncoder/TEncSearch.cpp
 --- a/source/Lib/TLibEncoder/TEncSearch.cpp Mon Aug 25 17:53:12 2014
 +0900
 +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Mon Aug 25 16:41:31 2014
 -0500
 @@ -2293,7 +2293,7 @@
   * \returns void
   */
  void TEncSearch::encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv*
 fencYuv, TComYuv* predYuv, ShortYuv* outResiYuv,
 -   ShortYuv* outBestResiYuv,
 TComYuv* outReconYuv)
 +   ShortYuv* outBestResiYuv,
 TComYuv* outReconYuv, TComDataCU* tmpCu)
  {
  X265_CHECK(!cu-isIntra(0), intra CU not expected\n);

 @@ -2321,6 +2321,7 @@
  }

  uint64_t bestCost = MAX_INT64;
 +bool bestTransquantBypassFlag = bIsTQBypassEnable;

  for (uint32_t modeId = 0; modeId  numModes; modeId++)
  {
 @@ -2388,15 +2389,29 @@
  if (cu-getQtRootCbf(0))
  xSetResidualQTData(cu, 0, outBestResiYuv, depth, true);

 +bestTransquantBypassFlag = bIsLosslessMode;
  bestBits = bits;
  bestCost = cost;
  bestCoeffBits = cu-m_coeffBits;
  m_entropyCoder-store(m_rdEntropyCoders[depth][CI_TEMP_BEST]);
  }
 +
 +// Save lossless mode coeff
 +if (bIsLosslessMode)
 +{
 +tmpCu-copyPartFrom(cu, 0, depth, false);
 +}
  }

  X265_CHECK(bestCost != MAX_INT64, no best cost\n);

 +if (bestTransquantBypassFlag  !m_param-bLossless)
 +{
 +assert(log2CUSize  2);
 +cu-setCUTransquantBypassSubParts(true, 0, depth);
 +cu-copyPartFrom(tmpCu, 0, depth, false);
 +}
 +
  if (cu-getQtRootCbf(0))
  outReconYuv-addClip(predYuv, outBestResiYuv, log2CUSize);
  else
 diff -r 5acfb12ec5d1 -r 0bf2756898bc source/Lib/TLibEncoder/TEncSearch.h
 --- a/source/Lib/TLibEncoder/TEncSearch.h   Mon Aug 25 17:53:12 2014
 +0900
 +++ b/source/Lib/TLibEncoder/TEncSearch.h   Mon Aug 25 16:41:31 2014
 -0500
 @@ -147,7 +147,7 @@

  /// encode residual and compute rd-cost for inter mode
  void encodeResAndCalcRdInterCU(TComDataCU* cu, TComYuv* fencYuv,
 TComYuv* predYuv, ShortYuv* resiYuv, ShortYuv* bestResiYuv,
 -   TComYuv* reconYuv);
 +   TComYuv* reconYuv, TComDataCU* tmpCu);
  void encodeResAndCalcRdSkipCU(TComDataCU* cu, TComYuv* fencYuv,
 TComYuv* predYuv, TComYuv* reconYuv);

  void xRecurIntraCodingQT(TComDataCU* cu, uint32_t trDepth, uint32_t
 absPartIdx, TComYuv* fencYuv,
 diff -r 5acfb12ec5d1 -r 0bf2756898bc source/encoder/analysis.cpp
 --- a/source/encoder/analysis.cpp   Mon Aug 25 17:53:12 2014 +0900
 +++ b/source/encoder/analysis.cpp   Mon Aug 25 16:41:31 2014 -0500
 @@ -82,7 +82,7 @@
  uint32_t sizeL = cuSize * cuSize;
  uint32_t sizeC = sizeL  (CHROMA_H_SHIFT(csp) +
 CHROMA_V_SHIFT(csp));

 -ok = m_memPool[i].initialize(numPartitions, sizeL, sizeC, 8,
 tqBypass);
 +ok = m_memPool[i].initialize(numPartitions, sizeL, sizeC, 9,
 tqBypass);

  m_interCU_2Nx2N[i]  = new TComDataCU;
  m_interCU_2Nx2N[i]-create(m_memPool[i], numPartitions, cuSize,
 csp, 0, tqBypass);
 @@ -108,6 +108,9 @@
  m_tempCU[i] = new TComDataCU;
  m_tempCU[i]-create(m_memPool[i], numPartitions, cuSize, csp, 7,
 tqBypass);

 +m_tempLosslessCU[i] = new TComDataCU;
 +m_tempLosslessCU[i]-create(m_memPool[i], numPartitions, cuSize,
 csp, 8, tqBypass);
 +
  m_bestPredYuv[i] = new TComYuv;
  ok = m_bestPredYuv[i]-create(cuSize, cuSize, csp);

 @@ -158,6 +161,7 @@
  delete m_bestMergeCU[i];
  delete m_bestCU[i];
  delete m_tempCU[i];
 +delete m_tempLosslessCU[i];

  if (m_bestPredYuv  m_bestPredYuv[i])
  {
 @@ -240,6 +244,7 @@
  // initialize CU data
  m_bestCU[0]-initCU(cu-m_pic, cu-getAddr());
  m_tempCU[0]-initCU(cu-m_pic, cu-getAddr());
 +m_tempLosslessCU[0]-initCU(cu-m_pic, cu-getAddr());

  // analysis of CU
  uint32_t numPartition = cu-getTotalNumPart();
 @@ -394,6 +399,7 @@
  uint32_tnextDepth = depth + 1;
  TComDataCU* subBestPartCU = m_bestCU[nextDepth];
  TComDataCU* subTempPartCU = m_tempCU[nextDepth];
 +TComDataCU* 

[x265] fix m_initSliceContext (uninitialised m_sliceQp)

2014-08-26 Thread Satoshi Nakagawa
# HG changeset patch
# User Satoshi Nakagawa nakagawa...@oki.com
# Date 1409041357 -32400
#  Tue Aug 26 17:22:37 2014 +0900
# Node ID c18255467f12da1a780340ade55292c32d95bfdd
# Parent  5acfb12ec5d17cc700e313fc99248e2408e5967b
fix m_initSliceContext (uninitialised m_sliceQp)

diff -r 5acfb12ec5d1 -r c18255467f12 source/encoder/frameencoder.cpp
--- a/source/encoder/frameencoder.cpp   Mon Aug 25 17:53:12 2014 +0900
+++ b/source/encoder/frameencoder.cpp   Tue Aug 26 17:22:37 2014 +0900
@@ -158,8 +158,6 @@
 int64_t startCompressTime = x265_mdate();
 Slice* slice = m_frame->m_picSym->m_slice;
 
-m_initSliceContext.resetEntropy(slice);
-
 /* Emit access unit delimiter unless this is the first frame and the user 
is
  * not repeating headers (since AUD is supposed to be the first NAL in the 
access
  * unit) */
@@ -225,12 +223,15 @@
 m_frameFilter.m_sao.m_refDepth = 2 + !IS_REFERENCED(slice);
 break;
 }
-m_frameFilter.start(m_frame);
 
 // Clip slice QP to 0-51 spec range before encoding
 qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
 slice->m_sliceQp = qp;
 
+m_initSliceContext.resetEntropy(slice);
+
+m_frameFilter.start(m_frame);
+
 if (m_frame->m_lowres.bKeyframe)
 {
 if (m_param->bEmitHRDSEI)
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] search: whitespace nits

2014-08-26 Thread deepthi
# HG changeset patch
# User Deepthi Nandakumar deep...@multicorewareinc.com
# Date 1409038258 -19800
#  Tue Aug 26 13:00:58 2014 +0530
# Node ID 00adc4fc9cdbd6c2f468a877c4323a8f0e8980f9
# Parent  6573fd72294e481133f2d1636100d6c8419fb597
search: whitespace nits

diff -r 6573fd72294e -r 00adc4fc9cdb source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Tue Aug 26 12:56:23 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Tue Aug 26 13:00:58 2014 +0530
@@ -2403,7 +2403,7 @@
 
 X265_CHECK(bestCost != MAX_INT64, "no best cost\n");
 
-if(bIsTQBypassEnable && !bestMode)
+if (bIsTQBypassEnable && !bestMode)
 {
 cu->setCUTransquantBypassSubParts(true, 0, depth);
 m_entropyCoder->load(m_rdEntropyCoders[depth][CI_CURR_BEST]);
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: optimize dct4

2014-08-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1409046621 -19800
#  Tue Aug 26 15:20:21 2014 +0530
# Node ID bbd5b3f269b095760d21877e94d67df8bd72f479
# Parent  5acfb12ec5d17cc700e313fc99248e2408e5967b
asm: optimize dct4

diff -r 5acfb12ec5d1 -r bbd5b3f269b0 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmMon Aug 25 17:53:12 2014 +0900
+++ b/source/common/x86/dct8.asmTue Aug 26 15:20:21 2014 +0530
@@ -30,6 +30,8 @@
 
 SECTION_RODATA 32
 
+dct4_shuf:  db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
+
 tab_dct4:   times 4 dw 64, 64
 times 4 dw 83, 36
 times 4 dw 64, -64
@@ -118,16 +120,14 @@
 movhm0, [r0 + 0 * r2]
 movhm1, [r0 + 1 * r2]
 punpcklqdq  m0, m1
-pshufd  m0, m0, 0xD8
-pshufhw m0, m0, 0xB1
+pshufb  m0, [dct4_shuf]
 
 lea r0, [r0 + 2 * r2]
 movhm1, [r0]
 movhm2, [r0 + r2]
 punpcklqdq  m1, m2
-pshufd  m1, m1, 0xD8
-pshufhw m1, m1, 0xB1
 
+pshufb  m1, [dct4_shuf]
 punpcklqdq  m2, m0, m1
 punpckhqdq  m0, m1
 
@@ -140,8 +140,7 @@
 paddd   m3, m7
 psrad   m3, DCT_SHIFT
 packssdwm0, m3
-pshufd  m0, m0, 0xD8
-pshufhw m0, m0, 0xB1
+pshufb  m0, [dct4_shuf]
 pmaddwd m1, m6
 paddd   m1, m7
 psrad   m1, DCT_SHIFT
@@ -149,9 +148,8 @@
 paddd   m2, m7
 psrad   m2, DCT_SHIFT
 packssdwm1, m2
-pshufd  m1, m1, 0xD8
-pshufhw m1, m1, 0xB1
 
+pshufb  m1, [dct4_shuf]
 punpcklqdq  m2, m0, m1
 punpckhqdq  m0, m1
 
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] fix m_initSliceContext (uninitialised m_sliceQp)

2014-08-26 Thread Deepthi Nandakumar
Thanks, queued for default (does not apply on stable due to the SAO
refactor).


On Tue, Aug 26, 2014 at 1:55 PM, Satoshi Nakagawa nakagawa...@oki.com
wrote:

 # HG changeset patch
 # User Satoshi Nakagawa nakagawa...@oki.com
 # Date 1409041357 -32400
 #  Tue Aug 26 17:22:37 2014 +0900
 # Node ID c18255467f12da1a780340ade55292c32d95bfdd
 # Parent  5acfb12ec5d17cc700e313fc99248e2408e5967b
 fix m_initSliceContext (uninitialised m_sliceQp)

 diff -r 5acfb12ec5d1 -r c18255467f12 source/encoder/frameencoder.cpp
 --- a/source/encoder/frameencoder.cpp   Mon Aug 25 17:53:12 2014 +0900
 +++ b/source/encoder/frameencoder.cpp   Tue Aug 26 17:22:37 2014 +0900
 @@ -158,8 +158,6 @@
  int64_t startCompressTime = x265_mdate();
  Slice* slice = m_frame-m_picSym-m_slice;

 -m_initSliceContext.resetEntropy(slice);
 -
  /* Emit access unit delimiter unless this is the first frame and the
 user is
   * not repeating headers (since AUD is supposed to be the first NAL
 in the access
   * unit) */
 @@ -225,12 +223,15 @@
  m_frameFilter.m_sao.m_refDepth = 2 + !IS_REFERENCED(slice);
  break;
  }
 -m_frameFilter.start(m_frame);

  // Clip slice QP to 0-51 spec range before encoding
  qp = Clip3(-QP_BD_OFFSET, MAX_QP, qp);
  slice-m_sliceQp = qp;

 +m_initSliceContext.resetEntropy(slice);
 +
 +m_frameFilter.start(m_frame);
 +
  if (m_frame-m_lowres.bKeyframe)
  {
  if (m_param-bEmitHRDSEI)
 ___
 x265-devel mailing list
 x265-devel@videolan.org
 https://mailman.videolan.org/listinfo/x265-devel

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


Re: [x265] [PATCH] asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) instructions with pshufb(latency 1)

2014-08-26 Thread Dnyaneshwar Gorade
Sorry, please ignore this patch; I forgot one more small modification.


On Wed, Aug 27, 2014 at 10:27 AM, dnyanesh...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Dnyaneshwar G dnyanesh...@multicorewareinc.com
 # Date 1409115349 -19800
 #  Wed Aug 27 10:25:49 2014 +0530
 # Node ID f49ed93e3daff100903e5fd7aa1bd874b9e79caf
 # Parent  32891b95f6693a39afbdf7929e12e3e0c6e990d1
 asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2)
 instructions with pshufb(latency 1)

 diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/asm-primitives.cpp
 --- a/source/common/x86/asm-primitives.cpp  Tue Aug 26 15:03:38 2014
 -0500
 +++ b/source/common/x86/asm-primitives.cpp  Wed Aug 27 10:25:49 2014
 +0530
 @@ -1375,7 +1375,7 @@
  p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
  p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;

 -p.dct[DCT_4x4] = x265_dct4_sse2;
 +p.dct[DCT_4x4] = x265_dct4_ssse3;
  p.idct[IDCT_4x4] = x265_idct4_sse2;
  p.idct[IDST_4x4] = x265_idst4_sse2;

 @@ -1545,7 +1545,7 @@
  p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
  p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
  p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
 -p.dct[DCT_4x4] = x265_dct4_sse2;
 +p.dct[DCT_4x4] = x265_dct4_ssse3;
  p.idct[IDCT_4x4] = x265_idct4_sse2;
  p.idct[IDST_4x4] = x265_idst4_sse2;
  p.planecopy_sp = x265_downShift_16_sse2;
 diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/dct8.asm
 --- a/source/common/x86/dct8.asmTue Aug 26 15:03:38 2014 -0500
 +++ b/source/common/x86/dct8.asmWed Aug 27 10:25:49 2014 +0530
 @@ -30,6 +30,8 @@

  SECTION_RODATA 32

 +dct4_shuf:  db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
 +
  tab_dct4:   times 4 dw 64, 64
  times 4 dw 83, 36
  times 4 dw 64, -64
 @@ -98,7 +100,7 @@
  ;--
  ;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
  ;--
 -INIT_XMM sse2
 +INIT_XMM ssse3
  cglobal dct4, 3, 4, 8
  %if BIT_DEPTH == 10
%define   DCT_SHIFT 3
 @@ -112,22 +114,21 @@
  add r2d, r2d
  lea r3, [tab_dct4]

 +movam3, [dct4_shuf]
  movam4, [r3 + 0 * 16]
  movam5, [r3 + 1 * 16]
  movam6, [r3 + 2 * 16]
  movhm0, [r0 + 0 * r2]
  movhm1, [r0 + 1 * r2]
  punpcklqdq  m0, m1
 -pshufd  m0, m0, 0xD8
 -pshufhw m0, m0, 0xB1
 +pshufb  m0, m3

  lea r0, [r0 + 2 * r2]
  movhm1, [r0]
  movhm2, [r0 + r2]
  punpcklqdq  m1, m2
 -pshufd  m1, m1, 0xD8
 -pshufhw m1, m1, 0xB1

 +pshufb  m1, m3
  punpcklqdq  m2, m0, m1
  punpckhqdq  m0, m1

 @@ -140,8 +141,8 @@
  paddd   m3, m7
  psrad   m3, DCT_SHIFT
  packssdwm0, m3
 -pshufd  m0, m0, 0xD8
 -pshufhw m0, m0, 0xB1
 +movam3, [dct4_shuf]
 +pshufb  m0, m3
  pmaddwd m1, m6
  paddd   m1, m7
  psrad   m1, DCT_SHIFT
 @@ -149,9 +150,8 @@
  paddd   m2, m7
  psrad   m2, DCT_SHIFT
  packssdwm1, m2
 -pshufd  m1, m1, 0xD8
 -pshufhw m1, m1, 0xB1

 +pshufb  m1, m3
  punpcklqdq  m2, m0, m1
  punpckhqdq  m0, m1

 diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/dct8.h
 --- a/source/common/x86/dct8.h  Tue Aug 26 15:03:38 2014 -0500
 +++ b/source/common/x86/dct8.h  Wed Aug 27 10:25:49 2014 +0530
 @@ -24,7 +24,7 @@
  #ifndef X265_DCT8_H
  #define X265_DCT8_H

 -void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
 +void x265_dct4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
  void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
  void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
  void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);

___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) instructions with pshufb(latency 1)

2014-08-26 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1409115913 -19800
#  Wed Aug 27 10:35:13 2014 +0530
# Node ID 9e19a59e1de22bc39924365626c48fdb2557592e
# Parent  32891b95f6693a39afbdf7929e12e3e0c6e990d1
asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) 
instructions with pshufb(latency 1)

diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp  Tue Aug 26 15:03:38 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp  Wed Aug 27 10:35:13 2014 +0530
@@ -1375,7 +1375,6 @@
 p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
 p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
 
-p.dct[DCT_4x4] = x265_dct4_sse2;
 p.idct[IDCT_4x4] = x265_idct4_sse2;
 p.idct[IDST_4x4] = x265_idst4_sse2;
 
@@ -1388,6 +1387,7 @@
 
 INTRA_ANG_SSSE3(ssse3);
 
+p.dct[DCT_4x4] = x265_dct4_ssse3;
 p.dct[DST_4x4] = x265_dst4_ssse3;
 p.idct[IDCT_8x8] = x265_idct8_ssse3;
 p.count_nonzero = x265_count_nonzero_ssse3;
@@ -1545,7 +1545,6 @@
 p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
 p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
 p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
-p.dct[DCT_4x4] = x265_dct4_sse2;
 p.idct[IDCT_4x4] = x265_idct4_sse2;
 p.idct[IDST_4x4] = x265_idst4_sse2;
 p.planecopy_sp = x265_downShift_16_sse2;
@@ -1582,6 +1581,7 @@
 p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3;
 p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , 
chroma_p2s can be replaced by luma_p2s
 
+p.dct[DCT_4x4] = x265_dct4_ssse3;
 p.dct[DST_4x4] = x265_dst4_ssse3;
 p.idct[IDCT_8x8] = x265_idct8_ssse3;
 p.count_nonzero = x265_count_nonzero_ssse3;
diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asmTue Aug 26 15:03:38 2014 -0500
+++ b/source/common/x86/dct8.asmWed Aug 27 10:35:13 2014 +0530
@@ -30,6 +30,8 @@
 
 SECTION_RODATA 32
 
+dct4_shuf:  db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
+
 tab_dct4:   times 4 dw 64, 64
 times 4 dw 83, 36
 times 4 dw 64, -64
@@ -98,7 +100,7 @@
 ;--
 ;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
 ;--
-INIT_XMM sse2
+INIT_XMM ssse3
 cglobal dct4, 3, 4, 8
 %if BIT_DEPTH == 10
   %define   DCT_SHIFT 3
@@ -112,22 +114,21 @@
 add r2d, r2d
 lea r3, [tab_dct4]
 
+movam3, [dct4_shuf]
 movam4, [r3 + 0 * 16]
 movam5, [r3 + 1 * 16]
 movam6, [r3 + 2 * 16]
 movhm0, [r0 + 0 * r2]
 movhm1, [r0 + 1 * r2]
 punpcklqdq  m0, m1
-pshufd  m0, m0, 0xD8
-pshufhw m0, m0, 0xB1
+pshufb  m0, m3
 
 lea r0, [r0 + 2 * r2]
 movhm1, [r0]
 movhm2, [r0 + r2]
 punpcklqdq  m1, m2
-pshufd  m1, m1, 0xD8
-pshufhw m1, m1, 0xB1
 
+pshufb  m1, m3
 punpcklqdq  m2, m0, m1
 punpckhqdq  m0, m1
 
@@ -140,8 +141,8 @@
 paddd   m3, m7
 psrad   m3, DCT_SHIFT
 packssdwm0, m3
-pshufd  m0, m0, 0xD8
-pshufhw m0, m0, 0xB1
+movam3, [dct4_shuf]
+pshufb  m0, m3
 pmaddwd m1, m6
 paddd   m1, m7
 psrad   m1, DCT_SHIFT
@@ -149,9 +150,8 @@
 paddd   m2, m7
 psrad   m2, DCT_SHIFT
 packssdwm1, m2
-pshufd  m1, m1, 0xD8
-pshufhw m1, m1, 0xB1
 
+pshufb  m1, m3
 punpcklqdq  m2, m0, m1
 punpckhqdq  m0, m1
 
diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h  Tue Aug 26 15:03:38 2014 -0500
+++ b/source/common/x86/dct8.h  Wed Aug 27 10:35:13 2014 +0530
@@ -24,7 +24,7 @@
 #ifndef X265_DCT8_H
 #define X265_DCT8_H
 
-void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
+void x265_dct4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
 void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
 void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel