Unrolling provides a 9-14% performance uplift on Arm Neoverse server platforms. --- source/common/loopfilter.cpp | 46 +++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-)
diff --git a/source/common/loopfilter.cpp b/source/common/loopfilter.cpp index f4cd65389..3aad542b8 100644 --- a/source/common/loopfilter.cpp +++ b/source/common/loopfilter.cpp @@ -172,6 +172,50 @@ static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int src[0] = x265_clip(m4 - (delta & maskQ)); } } + +void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc, + int32_t maskP, int32_t maskQ) +{ + assert(offset == 1); + (void)offset; + + int16_t m2 = (int16_t)src[0 * srcStep - 2]; + int16_t m3 = (int16_t)src[0 * srcStep - 1]; + int16_t m4 = (int16_t)src[0 * srcStep + 0]; + int16_t m5 = (int16_t)src[0 * srcStep + 1]; + + int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3); + src[0 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); + src[0 * srcStep + 0] = x265_clip(m4 - (delta & maskQ)); + + m2 = (int16_t)src[1 * srcStep - 2]; + m3 = (int16_t)src[1 * srcStep - 1]; + m4 = (int16_t)src[1 * srcStep + 0]; + m5 = (int16_t)src[1 * srcStep + 1]; + + delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3); + src[1 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); + src[1 * srcStep + 0] = x265_clip(m4 - (delta & maskQ)); + + m2 = (int16_t)src[2 * srcStep - 2]; + m3 = (int16_t)src[2 * srcStep - 1]; + m4 = (int16_t)src[2 * srcStep + 0]; + m5 = (int16_t)src[2 * srcStep + 1]; + + delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3); + src[2 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); + src[2 * srcStep + 0] = x265_clip(m4 - (delta & maskQ)); + + m2 = (int16_t)src[3 * srcStep - 2]; + m3 = (int16_t)src[3 * srcStep - 1]; + m4 = (int16_t)src[3 * srcStep + 0]; + m5 = (int16_t)src[3 * srcStep + 1]; + + delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3); + src[3 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); + src[3 * srcStep + 0] = x265_clip(m4 - (delta & maskQ)); +} + } namespace X265_NS { @@ -190,7 +234,7 @@ void setupLoopFilterPrimitives_c(EncoderPrimitives &p) // C code is 
same for EDGE_VER and EDGE_HOR only asm code is different p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c; p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c; - p.pelFilterChroma[0] = pelFilterChroma_c; + p.pelFilterChroma[0] = pelFilterChroma_V_c; p.pelFilterChroma[1] = pelFilterChroma_c; } } -- 2.34.1
>From a5042902b2d65a0d0328cd700d6112f00798d298 Mon Sep 17 00:00:00 2001 Message-Id: <a5042902b2d65a0d0328cd700d6112f00798d298.1739282617.git.microdaryl.rob...@arm.com> In-Reply-To: <cover.1739282617.git.microdaryl.rob...@arm.com> References: <cover.1739282617.git.microdaryl.rob...@arm.com> From: Micro Daryl Robles <microdaryl.rob...@arm.com> Date: Mon, 19 Aug 2024 14:45:07 +0100 Subject: [PATCH 4/5] Unroll C implementation of pelFilterChroma_V Unrolling provides 9-14% uplift on Arm Neoverse server platforms. --- source/common/loopfilter.cpp | 46 +++++++++++++++++++++++++++++++++++- 1 file changed, 45 insertions(+), 1 deletion(-) diff --git a/source/common/loopfilter.cpp b/source/common/loopfilter.cpp index f4cd65389..3aad542b8 100644 --- a/source/common/loopfilter.cpp +++ b/source/common/loopfilter.cpp @@ -172,6 +172,50 @@ static void pelFilterChroma_c(pixel* src, intptr_t srcStep, intptr_t offset, int src[0] = x265_clip(m4 - (delta & maskQ)); } } + +void pelFilterChroma_V_c(pixel *src, intptr_t srcStep, intptr_t offset, int32_t tc, + int32_t maskP, int32_t maskQ) +{ + assert(offset == 1); + (void)offset; + + int16_t m2 = (int16_t)src[0 * srcStep - 2]; + int16_t m3 = (int16_t)src[0 * srcStep - 1]; + int16_t m4 = (int16_t)src[0 * srcStep + 0]; + int16_t m5 = (int16_t)src[0 * srcStep + 1]; + + int32_t delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3); + src[0 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); + src[0 * srcStep + 0] = x265_clip(m4 - (delta & maskQ)); + + m2 = (int16_t)src[1 * srcStep - 2]; + m3 = (int16_t)src[1 * srcStep - 1]; + m4 = (int16_t)src[1 * srcStep + 0]; + m5 = (int16_t)src[1 * srcStep + 1]; + + delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3); + src[1 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); + src[1 * srcStep + 0] = x265_clip(m4 - (delta & maskQ)); + + m2 = (int16_t)src[2 * srcStep - 2]; + m3 = (int16_t)src[2 * srcStep - 1]; + m4 = (int16_t)src[2 * srcStep + 0]; + m5 = (int16_t)src[2 * srcStep + 1]; + + 
delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3); + src[2 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); + src[2 * srcStep + 0] = x265_clip(m4 - (delta & maskQ)); + + m2 = (int16_t)src[3 * srcStep - 2]; + m3 = (int16_t)src[3 * srcStep - 1]; + m4 = (int16_t)src[3 * srcStep + 0]; + m5 = (int16_t)src[3 * srcStep + 1]; + + delta = x265_clip3(-tc, tc, ((m4 - m3) * 4 + m2 - m5 + 4) >> 3); + src[3 * srcStep - 1] = x265_clip(m3 + (delta & maskP)); + src[3 * srcStep + 0] = x265_clip(m4 - (delta & maskQ)); +} + } namespace X265_NS { @@ -190,7 +234,7 @@ void setupLoopFilterPrimitives_c(EncoderPrimitives &p) // C code is same for EDGE_VER and EDGE_HOR only asm code is different p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c; p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c; - p.pelFilterChroma[0] = pelFilterChroma_c; + p.pelFilterChroma[0] = pelFilterChroma_V_c; p.pelFilterChroma[1] = pelFilterChroma_c; } } -- 2.34.1
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel