Re: [x265] [PATCH] TComYuv::addAvg, primitive function for luma and chroma loops

2013-11-18 Thread chen
@@ -640,26 +621,9 @@
 width  >>= m_hChromaShift;
 height >>= m_vChromaShift;
 
-for (y = height - 1; y = 0; y--)
-{
-for (x = width - 1; x = 0; )
-{
-// note: chroma min width is 2
-dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset)  shiftNum);
-dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset)  shiftNum);
-x--;
-dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset)  shiftNum);
-dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset)  shiftNum);
-x--;
-}
-
-srcU0 += src0Stride;
-srcU1 += src1Stride;
-srcV0 += src0Stride;
-srcV1 += src1Stride;
-dstU  += dststride;
-dstV  += dststride;
-}
+int part = partitionFromSizes(width, height);
You use the chroma size to get the partition index; I think that is an error.
 
+primitives.chroma_addAvg[part](dstU, dststride, srcU0, src0Stride, 
srcU1, src1Stride);
+primitives.chroma_addAvg[part](dstV, dststride, srcV0, src0Stride, 
srcV1, src1Stride);
 }
 }
___
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel


[x265] [PATCH] TComYuv::addAvg, primitive function for luma and chroma loops

2013-11-18 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1384768323 -19800
#  Mon Nov 18 15:22:03 2013 +0530
# Node ID cdd54aa200bd635395c01bbb07c156be4edbf7b1
# Parent  ac9e64d8a80bffe33fdaa0a9b83fdbe84f39d0b0
TComYuv::addAvg, primitive function for luma and chroma loops

diff -r ac9e64d8a80b -r cdd54aa200bd source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp Mon Nov 18 12:26:44 2013 +0530
+++ b/source/Lib/TLibCommon/TComYuv.cpp Mon Nov 18 15:22:03 2013 +0530
@@ -589,9 +589,7 @@
 
 void TComYuv::addAvg(TShortYUV* srcYuv0, TShortYUV* srcYuv1, uint32_t 
partUnitIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
 {
-int x, y;
 uint32_t src0Stride, src1Stride, dststride;
-int shiftNum, offset;
 
 int16_t* srcY0 = srcYuv0-getLumaAddr(partUnitIdx);
 int16_t* srcU0 = srcYuv0-getCbAddr(partUnitIdx);
@@ -605,61 +603,24 @@
 Pel* dstU = getCbAddr(partUnitIdx);
 Pel* dstV = getCrAddr(partUnitIdx);
 
+int part = partitionFromSizes(width, height);
+
 if (bLuma)
 {
 src0Stride = srcYuv0-m_width;
 src1Stride = srcYuv1-m_width;
 dststride  = getStride();
-shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
-offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
 
-for (y = 0; y  height; y++)
-{
-for (x = 0; x  width; x += 4)
-{
-dstY[x + 0] = ClipY((srcY0[x + 0] + srcY1[x + 0] + offset)  
shiftNum);
-dstY[x + 1] = ClipY((srcY0[x + 1] + srcY1[x + 1] + offset)  
shiftNum);
-dstY[x + 2] = ClipY((srcY0[x + 2] + srcY1[x + 2] + offset)  
shiftNum);
-dstY[x + 3] = ClipY((srcY0[x + 3] + srcY1[x + 3] + offset)  
shiftNum);
-}
-
-srcY0 += src0Stride;
-srcY1 += src1Stride;
-dstY  += dststride;
-}
+primitives.luma_addAvg[part](dstY, dststride, srcY0, src0Stride, 
srcY1, src1Stride);
 }
 if (bChroma)
 {
-shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
-offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
-
 src0Stride = srcYuv0-m_cwidth;
 src1Stride = srcYuv1-m_cwidth;
 dststride  = getCStride();
 
-width  >>= m_hChromaShift;
-height >>= m_vChromaShift;
-
-for (y = height - 1; y = 0; y--)
-{
-for (x = width - 1; x = 0; )
-{
-// note: chroma min width is 2
-dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset)  shiftNum);
-dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset)  shiftNum);
-x--;
-dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset)  shiftNum);
-dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset)  shiftNum);
-x--;
-}
-
-srcU0 += src0Stride;
-srcU1 += src1Stride;
-srcV0 += src0Stride;
-srcV1 += src1Stride;
-dstU  += dststride;
-dstV  += dststride;
-}
+primitives.chroma_addAvg[part](dstU, dststride, srcU0, src0Stride, 
srcU1, src1Stride);
+primitives.chroma_addAvg[part](dstV, dststride, srcV0, src0Stride, 
srcV1, src1Stride);
 }
 }
 
diff -r ac9e64d8a80b -r cdd54aa200bd source/common/pixel.cpp
--- a/source/common/pixel.cpp   Mon Nov 18 12:26:44 2013 +0530
+++ b/source/common/pixel.cpp   Mon Nov 18 15:22:03 2013 +0530
@@ -794,6 +794,27 @@
 a += dstride;
 }
 }
+
+template<int bx, int by>
+void addAvg(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t 
src0Stride, int16_t* src1, intptr_t src1Stride)
+{
+int shiftNum, offset;
+shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
+offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
+
+for (int y = 0; y < by; y++)
+{
+for (int x = 0; x < bx; x += 2)
+{
+dst[x + 0] = ClipY((src0[x + 0] + src1[x + 0] + offset) >> 
shiftNum);
+dst[x + 1] = ClipY((src0[x + 1] + src1[x + 1] + offset) >> 
shiftNum);
+}
+
+src0 += src0Stride;
+src1 += src1Stride;
+dst  += dstStride;
+}
+}
 }  // end anonymous namespace
 
 namespace x265 {
@@ -835,12 +856,14 @@
 p.satd[LUMA_16x64] = satd816, 64;
 
 #define CHROMA(W, H) \
+p.chroma_addAvg[CHROMA_ ## W ## x ## H]  = addAvgW, H; \
 p.chroma_copy_pp[CSP_I420][CHROMA_ ## W ## x ## H] = blockcopy_pp_cW, H; 
\
 p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_cW, H; \
 p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_cW, H;\
 p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_cW, H;
 
 #define LUMA(W, H) \
+p.luma_addAvg[LUMA_ ## W ## x ## H]  = addAvgW, H; \
 p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_cW, H; \
 p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_cW, H; \
 p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_cW, H;\
diff -r ac9e64d8a80b -r cdd54aa200bd source/common/primitives.h
--- 

Re: [x265] [PATCH] TComYuv::addAvg, primitive function for luma and chroma loops

2013-11-18 Thread Deepthi Nandakumar
Pushed. But next time, please organize your patches more clearly.

1. Add C primitive, if it does not exist.
2. Add the function pointer declarations and new primitive declarations to
EncoderPrimitives struct.
3. Add testbench code for primitives.
4. Add asm code.

Once all of the above patches have been reviewed, pushed, and tested on all
platforms, you can then integrate them into the actual encoder.




On Mon, Nov 18, 2013 at 3:23 PM, dnyanesh...@multicorewareinc.com wrote:

 # HG changeset patch
 # User Dnyaneshwar G dnyanesh...@multicorewareinc.com
 # Date 1384768323 -19800
 #  Mon Nov 18 15:22:03 2013 +0530
 # Node ID cdd54aa200bd635395c01bbb07c156be4edbf7b1
 # Parent  ac9e64d8a80bffe33fdaa0a9b83fdbe84f39d0b0
 TComYuv::addAvg, primitive function for luma and chroma loops

 diff -r ac9e64d8a80b -r cdd54aa200bd source/Lib/TLibCommon/TComYuv.cpp
 --- a/source/Lib/TLibCommon/TComYuv.cpp Mon Nov 18 12:26:44 2013 +0530
 +++ b/source/Lib/TLibCommon/TComYuv.cpp Mon Nov 18 15:22:03 2013 +0530
 @@ -589,9 +589,7 @@

  void TComYuv::addAvg(TShortYUV* srcYuv0, TShortYUV* srcYuv1, uint32_t
 partUnitIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
  {
 -int x, y;
  uint32_t src0Stride, src1Stride, dststride;
 -int shiftNum, offset;

  int16_t* srcY0 = srcYuv0-getLumaAddr(partUnitIdx);
  int16_t* srcU0 = srcYuv0-getCbAddr(partUnitIdx);
 @@ -605,61 +603,24 @@
  Pel* dstU = getCbAddr(partUnitIdx);
  Pel* dstV = getCrAddr(partUnitIdx);

 +int part = partitionFromSizes(width, height);
 +
  if (bLuma)
  {
  src0Stride = srcYuv0-m_width;
  src1Stride = srcYuv1-m_width;
  dststride  = getStride();
 -shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
 -offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;

 -for (y = 0; y  height; y++)
 -{
 -for (x = 0; x  width; x += 4)
 -{
 -dstY[x + 0] = ClipY((srcY0[x + 0] + srcY1[x + 0] +
 offset)  shiftNum);
 -dstY[x + 1] = ClipY((srcY0[x + 1] + srcY1[x + 1] +
 offset)  shiftNum);
 -dstY[x + 2] = ClipY((srcY0[x + 2] + srcY1[x + 2] +
 offset)  shiftNum);
 -dstY[x + 3] = ClipY((srcY0[x + 3] + srcY1[x + 3] +
 offset)  shiftNum);
 -}
 -
 -srcY0 += src0Stride;
 -srcY1 += src1Stride;
 -dstY  += dststride;
 -}
 +primitives.luma_addAvg[part](dstY, dststride, srcY0, src0Stride,
 srcY1, src1Stride);
  }
  if (bChroma)
  {
 -shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
 -offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
 -
  src0Stride = srcYuv0-m_cwidth;
  src1Stride = srcYuv1-m_cwidth;
  dststride  = getCStride();

 -width  = m_hChromaShift;
 -height = m_vChromaShift;
 -
 -for (y = height - 1; y = 0; y--)
 -{
 -for (x = width - 1; x = 0; )
 -{
 -// note: chroma min width is 2
 -dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset) 
 shiftNum);
 -dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset) 
 shiftNum);
 -x--;
 -dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset) 
 shiftNum);
 -dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset) 
 shiftNum);
 -x--;
 -}
 -
 -srcU0 += src0Stride;
 -srcU1 += src1Stride;
 -srcV0 += src0Stride;
 -srcV1 += src1Stride;
 -dstU  += dststride;
 -dstV  += dststride;
 -}
 +primitives.chroma_addAvg[part](dstU, dststride, srcU0,
 src0Stride, srcU1, src1Stride);
 +primitives.chroma_addAvg[part](dstV, dststride, srcV0,
 src0Stride, srcV1, src1Stride);
  }
  }

 diff -r ac9e64d8a80b -r cdd54aa200bd source/common/pixel.cpp
 --- a/source/common/pixel.cpp   Mon Nov 18 12:26:44 2013 +0530
 +++ b/source/common/pixel.cpp   Mon Nov 18 15:22:03 2013 +0530
 @@ -794,6 +794,27 @@
  a += dstride;
  }
  }
 +
 +templateint bx, int by
 +void addAvg(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t
 src0Stride, int16_t* src1, intptr_t src1Stride)
 +{
 +int shiftNum, offset;
 +shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
 +offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
 +
 +for (int y = 0; y  by; y++)
 +{
 +for (int x = 0; x  bx; x += 2)
 +{
 +dst[x + 0] = ClipY((src0[x + 0] + src1[x + 0] + offset) 
 shiftNum);
 +dst[x + 1] = ClipY((src0[x + 1] + src1[x + 1] + offset) 
 shiftNum);
 +}
 +
 +src0 += src0Stride;
 +src1 += src1Stride;
 +dst  += dstStride;
 +}
 +}
  }  // end anonymous namespace

  namespace x265 {
 @@ -835,12 +856,14 @@
  p.satd[LUMA_16x64] = satd816, 64;

  #define CHROMA(W, H) \
 +p.chroma_addAvg[CHROMA_ ## W ## x ## H]  = addAvgW, H; \
  p.chroma_copy_pp[CSP_I420][CHROMA_ ## W ## x ## 

[x265] [PATCH] TComYuv::addAvg, primitive function for luma and chroma loops

2013-11-17 Thread dnyaneshwar
# HG changeset patch
# User Dnyaneshwar Gorade dnyanesh...@multicorewareinc.com
# Date 1384758687 -19800
#  Mon Nov 18 12:41:27 2013 +0530
# Node ID ee062baf96b18ab2ecd64a2e4219b2a5a3c09e5d
# Parent  e2895ce7bbeb2c3d845fee2578758d0012fa2cb4
TComYuv::addAvg, primitive function for luma and chroma loops

diff -r e2895ce7bbeb -r ee062baf96b1 source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp Sun Nov 17 11:24:13 2013 -0600
+++ b/source/Lib/TLibCommon/TComYuv.cpp Mon Nov 18 12:41:27 2013 +0530
@@ -589,9 +589,7 @@
 
 void TComYuv::addAvg(TShortYUV* srcYuv0, TShortYUV* srcYuv1, uint32_t 
partUnitIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
 {
-int x, y;
 uint32_t src0Stride, src1Stride, dststride;
-int shiftNum, offset;
 
 int16_t* srcY0 = srcYuv0-getLumaAddr(partUnitIdx);
 int16_t* srcU0 = srcYuv0-getCbAddr(partUnitIdx);
@@ -610,29 +608,12 @@
 src0Stride = srcYuv0-m_width;
 src1Stride = srcYuv1-m_width;
 dststride  = getStride();
-shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
-offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
 
-for (y = 0; y  height; y++)
-{
-for (x = 0; x  width; x += 4)
-{
-dstY[x + 0] = ClipY((srcY0[x + 0] + srcY1[x + 0] + offset)  
shiftNum);
-dstY[x + 1] = ClipY((srcY0[x + 1] + srcY1[x + 1] + offset)  
shiftNum);
-dstY[x + 2] = ClipY((srcY0[x + 2] + srcY1[x + 2] + offset)  
shiftNum);
-dstY[x + 3] = ClipY((srcY0[x + 3] + srcY1[x + 3] + offset)  
shiftNum);
-}
-
-srcY0 += src0Stride;
-srcY1 += src1Stride;
-dstY  += dststride;
-}
+int part = partitionFromSizes(width, height);
+primitives.luma_addAvg[part](dstY, dststride, srcY0, src0Stride, 
srcY1, src1Stride);
 }
 if (bChroma)
 {
-shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
-offset = (1  (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
-
 src0Stride = srcYuv0-m_cwidth;
 src1Stride = srcYuv1-m_cwidth;
 dststride  = getCStride();
@@ -640,26 +621,9 @@
 width  >>= m_hChromaShift;
 height >>= m_vChromaShift;
 
-for (y = height - 1; y = 0; y--)
-{
-for (x = width - 1; x = 0; )
-{
-// note: chroma min width is 2
-dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset)  shiftNum);
-dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset)  shiftNum);
-x--;
-dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset)  shiftNum);
-dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset)  shiftNum);
-x--;
-}
-
-srcU0 += src0Stride;
-srcU1 += src1Stride;
-srcV0 += src0Stride;
-srcV1 += src1Stride;
-dstU  += dststride;
-dstV  += dststride;
-}
+int part = partitionFromSizes(width, height);
+primitives.chroma_addAvg[part](dstU, dststride, srcU0, src0Stride, 
srcU1, src1Stride);
+primitives.chroma_addAvg[part](dstV, dststride, srcV0, src0Stride, 
srcV1, src1Stride);
 }
 }
 
diff -r e2895ce7bbeb -r ee062baf96b1 source/common/pixel.cpp
--- a/source/common/pixel.cpp   Sun Nov 17 11:24:13 2013 -0600
+++ b/source/common/pixel.cpp   Mon Nov 18 12:41:27 2013 +0530
@@ -794,6 +794,27 @@
 a += dstride;
 }
 }
+
+template<int bx, int by>
+void addAvg(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t 
src0Stride, int16_t* src1, intptr_t src1Stride)
+{
+int shiftNum, offset;
+shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
+offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
+
+for (int y = 0; y < by; y++)
+{
+for (int x = 0; x < bx; x += 2)
+{
+dst[x + 0] = ClipY((src0[x + 0] + src1[x + 0] + offset) >> 
shiftNum);
+dst[x + 1] = ClipY((src0[x + 1] + src1[x + 1] + offset) >> 
shiftNum);
+}
+
+src0 += src0Stride;
+src1 += src1Stride;
+dst  += dstStride;
+}
+}
 }  // end anonymous namespace
 
 namespace x265 {
@@ -835,12 +856,14 @@
 p.satd[LUMA_16x64] = satd816, 64;
 
 #define CHROMA(W, H) \
+p.chroma_addAvg[CHROMA_ ## W ## x ## H]  = addAvgW, H; \
 p.chroma_copy_pp[CSP_I420][CHROMA_ ## W ## x ## H] = blockcopy_pp_cW, H; 
\
 p.chroma_copy_sp[CHROMA_ ## W ## x ## H] = blockcopy_sp_cW, H; \
 p.chroma_copy_ps[CHROMA_ ## W ## x ## H] = blockcopy_ps_cW, H;\
 p.chroma_sub_ps[CHROMA_ ## W ## x ## H] = pixel_sub_ps_cW, H;
 
 #define LUMA(W, H) \
+p.luma_addAvg[LUMA_ ## W ## x ## H]  = addAvgW, H; \
 p.luma_copy_pp[LUMA_ ## W ## x ## H] = blockcopy_pp_cW, H; \
 p.luma_copy_sp[LUMA_ ## W ## x ## H] = blockcopy_sp_cW, H; \
 p.luma_copy_ps[LUMA_ ## W ## x ## H] = blockcopy_ps_cW, H;\
diff -r e2895ce7bbeb -r ee062baf96b1 source/common/primitives.h
--- a/source/common/primitives.h