# HG changeset patch
# User Dnyaneshwar G <dnyanesh...@multicorewareinc.com>
# Date 1444121396 -19800
#      Tue Oct 06 14:19:56 2015 +0530
# Node ID 38e4b94377fa6ffe57472c49ecff6c909ed4f6dc
# Parent  f8ad1ff7074aab85a6cf376886014c88f46b7275
asm: separated deblocking filter into horizontal & vertical primitives for asm

diff -r f8ad1ff7074a -r 38e4b94377fa source/common/deblock.cpp
--- a/source/common/deblock.cpp	Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/deblock.cpp	Tue Oct 06 14:19:56 2015 +0530
@@ -280,31 +280,6 @@
  * \param maskQ indicator to enable filtering on partQ
  * \param maskP1 decision weak filter/no filter for partP
  * \param maskQ1 decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    int32_t tc2 = 2 * tc;
-    int32_t tcP = (tc2 & maskP);
-    int32_t tcQ = (tc2 & maskQ);
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4 = (int16_t)src[0];
-        int16_t m3 = (int16_t)src[-offset];
-        int16_t m5 = (int16_t)src[offset];
-        int16_t m2 = (int16_t)src[-offset * 2];
-        int16_t m6 = (int16_t)src[offset * 2];
-        int16_t m1 = (int16_t)src[-offset * 3];
-        int16_t m7 = (int16_t)src[offset * 3];
-        int16_t m0 = (int16_t)src[-offset * 4];
-        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
-        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
-        src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
-        src[0] = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
-        src[offset] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
-        src[offset * 2] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
-    }
-}
-
-/* Weak filter */
 static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
                                  int32_t maskP1, int32_t maskQ1)
 {
@@ -446,7 +421,12 @@
                        useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
 
             if (sw)
-                pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+            {
+                int32_t tc2 = 2 * tc;
+                int32_t tcP = (tc2 & maskP);
+                int32_t tcQ = (tc2 & maskQ);
+                primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ);
+            }
             else
             {
                 int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/loopfilter.cpp	Tue Oct 06 14:19:56 2015 +0530
@@ -137,6 +137,27 @@
         rec += stride;
     }
 }
+
+void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ)
+{
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
+    {
+        int16_t m4 = (int16_t)src[0];
+        int16_t m3 = (int16_t)src[-offset];
+        int16_t m5 = (int16_t)src[offset];
+        int16_t m2 = (int16_t)src[-offset * 2];
+        int16_t m6 = (int16_t)src[offset * 2];
+        int16_t m1 = (int16_t)src[-offset * 3];
+        int16_t m7 = (int16_t)src[offset * 3];
+        int16_t m0 = (int16_t)src[-offset * 4];
+        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
+        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
+        src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+        src[0] = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
+        src[offset] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
+        src[offset * 2] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
+    }
+}
 }
 
 namespace X265_NS {
@@ -151,5 +172,9 @@
     p.saoCuOrgE3[1] = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
+
+    // C code is same for EDGE_VER and EDGE_HOR only asm code is different
+    p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
+    p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
 }
 }
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/primitives.h
--- a/source/common/primitives.h	Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/primitives.h	Tue Oct 06 14:19:56 2015 +0530
@@ -196,6 +196,8 @@
 typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
 
+typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -330,6 +332,7 @@
     costCoeffRemain_t costCoeffRemain;
     costC1C2Flag_t costC1C2Flag;
 
+    pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel