Pushed. But next time, please organize your patches more clearly.
1. Add the C primitive, if it does not already exist.
2. Add the function pointer declarations and the new primitive declarations to
the EncoderPrimitives struct.
3. Add testbench code for the primitives.
4. Add the asm code.
Once all of the above patches have been reviewed, pushed, and tested on all
platforms, you can integrate the new primitive with the actual encoder.
On Mon, Nov 18, 2013 at 3:23 PM, dnyanesh...@multicorewareinc.com wrote:
# HG changeset patch
# User Dnyaneshwar G dnyanesh...@multicorewareinc.com
# Date 1384768323 -19800
# Mon Nov 18 15:22:03 2013 +0530
# Node ID cdd54aa200bd635395c01bbb07c156be4edbf7b1
# Parent ac9e64d8a80bffe33fdaa0a9b83fdbe84f39d0b0
TComYuv::addAvg, primitive function for luma and chroma loops
diff -r ac9e64d8a80b -r cdd54aa200bd source/Lib/TLibCommon/TComYuv.cpp
--- a/source/Lib/TLibCommon/TComYuv.cpp Mon Nov 18 12:26:44 2013 +0530
+++ b/source/Lib/TLibCommon/TComYuv.cpp Mon Nov 18 15:22:03 2013 +0530
@@ -589,9 +589,7 @@
void TComYuv::addAvg(TShortYUV* srcYuv0, TShortYUV* srcYuv1, uint32_t
partUnitIdx, uint32_t width, uint32_t height, bool bLuma, bool bChroma)
{
-int x, y;
uint32_t src0Stride, src1Stride, dststride;
-int shiftNum, offset;
int16_t* srcY0 = srcYuv0->getLumaAddr(partUnitIdx);
int16_t* srcU0 = srcYuv0->getCbAddr(partUnitIdx);
@@ -605,61 +603,24 @@
Pel* dstU = getCbAddr(partUnitIdx);
Pel* dstV = getCrAddr(partUnitIdx);
+int part = partitionFromSizes(width, height);
+
if (bLuma)
{
src0Stride = srcYuv0->m_width;
src1Stride = srcYuv1->m_width;
dststride = getStride();
-shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
-offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
-for (y = 0; y < height; y++)
-{
-for (x = 0; x < width; x += 4)
-{
-dstY[x + 0] = ClipY((srcY0[x + 0] + srcY1[x + 0] +
offset) >> shiftNum);
-dstY[x + 1] = ClipY((srcY0[x + 1] + srcY1[x + 1] +
offset) >> shiftNum);
-dstY[x + 2] = ClipY((srcY0[x + 2] + srcY1[x + 2] +
offset) >> shiftNum);
-dstY[x + 3] = ClipY((srcY0[x + 3] + srcY1[x + 3] +
offset) >> shiftNum);
-}
-
-srcY0 += src0Stride;
-srcY1 += src1Stride;
-dstY += dststride;
-}
+primitives.luma_addAvg[part](dstY, dststride, srcY0, src0Stride,
srcY1, src1Stride);
}
if (bChroma)
{
-shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
-offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
-
src0Stride = srcYuv0->m_cwidth;
src1Stride = srcYuv1->m_cwidth;
dststride = getCStride();
-width >>= m_hChromaShift;
-height >>= m_vChromaShift;
-
-for (y = height - 1; y >= 0; y--)
-{
-for (x = width - 1; x >= 0; )
-{
-// note: chroma min width is 2
-dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset)
>> shiftNum);
-dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset)
>> shiftNum);
-x--;
-dstU[x] = ClipC((srcU0[x] + srcU1[x] + offset)
>> shiftNum);
-dstV[x] = ClipC((srcV0[x] + srcV1[x] + offset)
>> shiftNum);
-x--;
-}
-
-srcU0 += src0Stride;
-srcU1 += src1Stride;
-srcV0 += src0Stride;
-srcV1 += src1Stride;
-dstU += dststride;
-dstV += dststride;
-}
+primitives.chroma_addAvg[part](dstU, dststride, srcU0,
src0Stride, srcU1, src1Stride);
+primitives.chroma_addAvg[part](dstV, dststride, srcV0,
src0Stride, srcV1, src1Stride);
}
}
diff -r ac9e64d8a80b -r cdd54aa200bd source/common/pixel.cpp
--- a/source/common/pixel.cpp Mon Nov 18 12:26:44 2013 +0530
+++ b/source/common/pixel.cpp Mon Nov 18 15:22:03 2013 +0530
@@ -794,6 +794,27 @@
a += dstride;
}
}
+
+template<int bx, int by>
+void addAvg(pixel* dst, intptr_t dstStride, int16_t* src0, intptr_t
src0Stride, int16_t* src1, intptr_t src1Stride)
+{
+int shiftNum, offset;
+shiftNum = IF_INTERNAL_PREC + 1 - X265_DEPTH;
+offset = (1 << (shiftNum - 1)) + 2 * IF_INTERNAL_OFFS;
+
+for (int y = 0; y < by; y++)
+{
+for (int x = 0; x < bx; x += 2)
+{
+dst[x + 0] = ClipY((src0[x + 0] + src1[x + 0] + offset)
>> shiftNum);
+dst[x + 1] = ClipY((src0[x + 1] + src1[x + 1] + offset)
>> shiftNum);
+}
+
+src0 += src0Stride;
+src1 += src1Stride;
+dst += dstStride;
+}
+}
} // end anonymous namespace
namespace x265 {
@@ -835,12 +856,14 @@
p.satd[LUMA_16x64] = satd8<16, 64>;
#define CHROMA(W, H) \
+p.chroma_addAvg[CHROMA_ ## W ## x ## H] = addAvg<W, H>; \
p.chroma_copy_pp[CSP_I420][CHROMA_ ## W ## x ##