Add a standard bit depth Neon asm implementation for pixel_ssd_s for block size 64x64.
A high bit depth Neon asm implementation already exists, so just enable it. --- source/common/aarch64/asm-primitives.cpp | 2 ++ source/common/aarch64/ssd-a.S | 35 ++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index c1f41cb3c..c1317eb74 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -557,11 +557,13 @@ void setupNeonPrimitives(EncoderPrimitives &p) p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon); p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon); p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon); + p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_neon); p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_neon); p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_neon); p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon); p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon); + p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_neon); #if !HIGH_BIT_DEPTH // pixel_avg_pp diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S index f4d612137..e5da67679 100644 --- a/source/common/aarch64/ssd-a.S +++ b/source/common/aarch64/ssd-a.S @@ -406,6 +406,41 @@ function PFX(pixel_ssd_s_32x32_neon) ret_v0_w0 endfunc +function PFX(pixel_ssd_s_64x64_neon) + add x1, x1, x1 + sub x1, x1, #64 + sub x3, x3, #64 + + mov w12, #32 + movi v0.16b, #0 + movi v1.16b, #0 +.Loop_ssd_ss_64: + sub w12, w12, #1 +.rept 2 + ld1 {v16.16b-v19.16b}, [x0], #64 + ld1 {v20.16b-v23.16b}, [x0], x1 + smlal v0.4s, v16.4h, v16.4h + smlal2 v1.4s, v16.8h, v16.8h + smlal v0.4s, v17.4h, v17.4h + smlal2 v1.4s, v17.8h, v17.8h + smlal v0.4s, v18.4h, v18.4h + smlal2 v1.4s, v18.8h, v18.8h + smlal v0.4s, v19.4h, v19.4h + smlal2 v1.4s, v19.8h, v19.8h + smlal v0.4s, v20.4h, v20.4h + smlal2 v1.4s, v20.8h, v20.8h + smlal v0.4s, v21.4h, v21.4h + smlal2 v1.4s, 
v21.8h, v21.8h + smlal v0.4s, v22.4h, v22.4h + smlal2 v1.4s, v22.8h, v22.8h + smlal v0.4s, v23.4h, v23.4h + smlal2 v1.4s, v23.8h, v23.8h +.endr + cbnz w12, .Loop_ssd_ss_64 + add v0.4s, v0.4s, v1.4s + ret_v0_w0 +endfunc + #else // HIGH_BIT_DEPTH .macro SSE_PP_4x2 -- 2.39.5 (Apple Git-154)
From 338fa1e80701f5c1cc874e95ec90afc81a7af3e1 Mon Sep 17 00:00:00 2001 Message-Id: <338fa1e80701f5c1cc874e95ec90afc81a7af3e1.1740155166.git.gerdazsejke.m...@arm.com> In-Reply-To: <cover.1740155166.git.gerdazsejke.m...@arm.com> References: <cover.1740155166.git.gerdazsejke.m...@arm.com> From: Gerda Zsejke More <gerdazsejke.m...@arm.com> Date: Tue, 4 Feb 2025 17:28:37 +0100 Subject: [PATCH v2 2/2] AArch64: Add Neon asm impl. for SBD ssd_s block size 64x64 Add a standard bit depth Neon asm implementation for pixel_ssd_s for block size 64x64. A high bit depth Neon asm implementation already exists, so just enable it. --- source/common/aarch64/asm-primitives.cpp | 2 ++ source/common/aarch64/ssd-a.S | 35 ++++++++++++++++++++++++ 2 files changed, 37 insertions(+) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index c1f41cb3c..c1317eb74 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -557,11 +557,13 @@ void setupNeonPrimitives(EncoderPrimitives &p) p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon); p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon); p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon); + p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_neon); p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_neon); p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_neon); p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon); p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon); + p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_neon); #if !HIGH_BIT_DEPTH // pixel_avg_pp diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S index f4d612137..e5da67679 100644 --- a/source/common/aarch64/ssd-a.S +++ b/source/common/aarch64/ssd-a.S @@ -406,6 +406,41 @@ function PFX(pixel_ssd_s_32x32_neon) ret_v0_w0 endfunc +function PFX(pixel_ssd_s_64x64_neon) + add x1, x1, x1 + sub x1, x1, 
#64 + sub x3, x3, #64 + + mov w12, #32 + movi v0.16b, #0 + movi v1.16b, #0 +.Loop_ssd_ss_64: + sub w12, w12, #1 +.rept 2 + ld1 {v16.16b-v19.16b}, [x0], #64 + ld1 {v20.16b-v23.16b}, [x0], x1 + smlal v0.4s, v16.4h, v16.4h + smlal2 v1.4s, v16.8h, v16.8h + smlal v0.4s, v17.4h, v17.4h + smlal2 v1.4s, v17.8h, v17.8h + smlal v0.4s, v18.4h, v18.4h + smlal2 v1.4s, v18.8h, v18.8h + smlal v0.4s, v19.4h, v19.4h + smlal2 v1.4s, v19.8h, v19.8h + smlal v0.4s, v20.4h, v20.4h + smlal2 v1.4s, v20.8h, v20.8h + smlal v0.4s, v21.4h, v21.4h + smlal2 v1.4s, v21.8h, v21.8h + smlal v0.4s, v22.4h, v22.4h + smlal2 v1.4s, v22.8h, v22.8h + smlal v0.4s, v23.4h, v23.4h + smlal2 v1.4s, v23.8h, v23.8h +.endr + cbnz w12, .Loop_ssd_ss_64 + add v0.4s, v0.4s, v1.4s + ret_v0_w0 +endfunc + #else // HIGH_BIT_DEPTH .macro SSE_PP_4x2 -- 2.39.5 (Apple Git-154)
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel