From b8d26d05e75e64129c92a67a0c63ffe432856341 Mon Sep 17 00:00:00 2001
Message-Id: <b8d26d05e75e64129c92a67a0c63ffe432856341.1739282617.git.microdaryl.rob...@arm.com>
In-Reply-To: <cover.1739282617.git.microdaryl.rob...@arm.com>
References: <cover.1739282617.git.microdaryl.rob...@arm.com>
From: Micro Daryl Robles <microdaryl.rob...@arm.com>
Date: Wed, 14 Aug 2024 01:22:44 +0100
Subject: [PATCH 2/5] AArch64: Add Neon implementation of pelFilterLumaStrong_H

Relative performance compared to scalar C:

 Neoverse N1: 3.67x
 Neoverse N2: 2.57x
 Neoverse V1: 2.31x
 Neoverse V2: 2.35x
---
 source/common/aarch64/loopfilter-prim.cpp | 75 +++++++++++++++++++++++
 1 file changed, 75 insertions(+)
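The new routine filters all four columns of the 4-sample segment at once: the three P-side outputs are packed in lanes 0-3 (clamped by tcP) and the three Q-side outputs in lanes 4-7 (clamped by tcQ), which is why tc_vec is built as vcombine_s16(vdup_n_s16(tcP), vdup_n_s16(tcQ)). For reference, here is a rough scalar sketch of the per-column strong-filter computation being vectorised; the clip3 helper, the function name and the m0..m7 naming are illustrative assumptions for the 8-bit build only, not the exact upstream scalar primitive:

    #include <cstdint>
    #include <cstddef>

    // Reading aid only: hypothetical scalar equivalent of the Neon routine.
    static inline int clip3(int lo, int hi, int v)
    {
        return v < lo ? lo : (v > hi ? hi : v);
    }

    // One 4-column segment of a horizontal edge: 'offset' is the picture
    // stride, and consecutive columns are one byte apart (srcStep == 1).
    static void pelFilterLumaStrong_H_sketch(uint8_t *src, ptrdiff_t offset,
                                             int tcP, int tcQ)
    {
        for (int col = 0; col < 4; col++, src++)
        {
            const int m0 = src[-4 * offset], m1 = src[-3 * offset];
            const int m2 = src[-2 * offset], m3 = src[-1 * offset];
            const int m4 = src[0],           m5 = src[1 * offset];
            const int m6 = src[2 * offset],  m7 = src[3 * offset];

            // P side: deltas clamped to [-tcP, tcP] (lanes 0-3 of tc_vec).
            src[-3 * offset] = (uint8_t)(m1 + clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1));
            src[-2 * offset] = (uint8_t)(m2 + clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2));
            src[-1 * offset] = (uint8_t)(m3 + clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3));

            // Q side: deltas clamped to [-tcQ, tcQ] (lanes 4-7 of tc_vec).
            src[0]          = (uint8_t)(m4 + clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4));
            src[1 * offset] = (uint8_t)(m5 + clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5));
            src[2 * offset] = (uint8_t)(m6 + clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6));
        }
    }

Each output is formed as original + clip(filtered - original), which maps directly onto the vsubw_u8, vmax/vmin against +/-tc, and vadd_u8 sequence used for each output pair in the Neon code; the rounding terms (+4 >> 3, +2 >> 2) are provided by vrshrq_n_u16.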
diff --git a/source/common/aarch64/loopfilter-prim.cpp b/source/common/aarch64/loopfilter-prim.cpp
index 9242c859e..7fef25cbb 100644
--- a/source/common/aarch64/loopfilter-prim.cpp
+++ b/source/common/aarch64/loopfilter-prim.cpp
@@ -1,5 +1,6 @@
 #include "common.h"
 #include "loopfilter-prim.h"
+#include "mem-neon.h"
 
 #define PIXEL_MIN 0
 
@@ -311,6 +312,79 @@ void pelFilterLumaStrong_V_neon(pixel *src, intptr_t srcStep, intptr_t offset,
     }
 }
 
+void pelFilterLumaStrong_H_neon(pixel *src, intptr_t srcStep, intptr_t offset,
+                                int32_t tcP, int32_t tcQ)
+{
+    assert(UNIT_SIZE == 4 && srcStep == 1);
+    (void)srcStep;
+
+    const int16x8_t tc_vec = vcombine_s16(vdup_n_s16(tcP), vdup_n_s16(tcQ));
+    const int16x8_t neg_tc_vec = vnegq_s16(tc_vec);
+
+    uint8x8_t m0 = vld1_u8(src - 4 * offset);
+    uint8x8_t m1 = vld1_u8(src - 3 * offset);
+    uint8x8_t m2 = vld1_u8(src - 2 * offset);
+    uint8x8_t m3 = vld1_u8(src - 1 * offset);
+    uint8x8_t m4 = vld1_u8(src - 0 * offset);
+    uint8x8_t m5 = vld1_u8(src + 1 * offset);
+    uint8x8_t m6 = vld1_u8(src + 2 * offset);
+    uint8x8_t m7 = vld1_u8(src + 3 * offset);
+
+    uint8x8_t m12 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m1), vreinterpret_u32_u8(m2)));
+    uint8x8_t m23 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m2), vreinterpret_u32_u8(m3)));
+    uint8x8_t m34 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m3), vreinterpret_u32_u8(m4)));
+    uint8x8_t m45 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m4), vreinterpret_u32_u8(m5)));
+    uint8x8_t m56 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m5), vreinterpret_u32_u8(m6)));
+
+    // src[-1 * offset], src[0 * offset]
+    uint16x8_t p0 = vaddl_u8(m23, m34);
+    p0 = vaddw_u8(p0, m45);
+    uint16x8_t t0 = vshlq_n_u16(p0, 1);
+    uint16x8_t t1 = vaddl_u8(m12, m56);
+    uint16x8_t t01 = vaddq_u16(t0, t1);
+    t01 = vrshrq_n_u16(t01, 3);
+    t01 = vsubw_u8(t01, m34);
+    t01 = vreinterpretq_u16_s16(
+        vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t01))));
+    uint8x8_t d01 = vmovn_u16(t01);
+    d01 = vadd_u8(d01, m34);
+    store_u8x4_strided_xN<2>(&src[-1 * offset], 1 * offset, &d01);
+
+    uint8x8_t m16 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m1), vreinterpret_u32_u8(m6)));
+    uint8x8_t m25 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m2), vreinterpret_u32_u8(m5)));
+
+    // src[-2 * offset], src[1 * offset]
+    uint16x8_t p1 = vaddw_u8(p0, m16);
+    uint16x8_t t23 = vrshrq_n_u16(p1, 2);
+    t23 = vsubw_u8(t23, m25);
+    t23 = vreinterpretq_u16_s16(
+        vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t23))));
+    uint8x8_t d23 = vmovn_u16(t23);
+    d23 = vadd_u8(d23, m25);
+    store_u8x4_strided_xN<2>(&src[-2 * offset], 3 * offset, &d23);
+
+    uint8x8_t m07 =
+        vreinterpret_u8_u32(vzip1_u32(vreinterpret_u32_u8(m0), vreinterpret_u32_u8(m7)));
+
+    // src[-3 * offset], src[2 * offset]
+    uint16x8_t p2 = vaddl_u8(m07, m16);
+    uint16x8_t t45 = vmlaq_n_u16(p1, p2, 2);
+    t45 = vrshrq_n_u16(t45, 3);
+    t45 = vsubw_u8(t45, m16);
+    t45 = vreinterpretq_u16_s16(
+        vminq_s16(tc_vec, vmaxq_s16(neg_tc_vec, vreinterpretq_s16_u16(t45))));
+    uint8x8_t d45 = vmovn_u16(t45);
+    d45 = vadd_u8(d45, m16);
+    store_u8x4_strided_xN<2>(&src[-3 * offset], 5 * offset, &d45);
+}
+
 } // namespace
 
 namespace X265_NS
@@ -328,6 +402,7 @@ void setupLoopFilterPrimitives_neon(EncoderPrimitives &p)
 
     p.sign = calSign_neon;
 
     p.pelFilterLumaStrong[0] = pelFilterLumaStrong_V_neon;
+    p.pelFilterLumaStrong[1] = pelFilterLumaStrong_H_neon;
 }
 
-- 
2.34.1