Add an SVE asm implementation of high bitdepth SSD_S functions for all block sizes. This implementation is 42-45% faster on Neoverse platforms compared to the existing Neon asm implementation.
Change-Id: Ibedb5fa7f30c88523fb0388ccaf24a8f3ae87a06
---
 source/common/aarch64/asm-primitives.cpp |  14 +++
 source/common/aarch64/ssd-a-sve.S        | 127 +++++++++++++++++++++++
 2 files changed, 141 insertions(+)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index a9076509c..f88fdc000 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -879,6 +879,20 @@ void setupSvePrimitives(EncoderPrimitives &p)
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_sse_pp_8x16_sve);
     p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_sve);
     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_sve);
+
+    // ssd_s
+    p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+    p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+    p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+    p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+    p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
+    p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+    p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+    p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+    p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+    p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
 #endif // !HIGH_BIT_DEPTH
 }
 #endif // defined(HAVE_SVE2) || defined(HAVE_SVE)
diff --git a/source/common/aarch64/ssd-a-sve.S b/source/common/aarch64/ssd-a-sve.S
index c1f745947..dbb750e17 100644
--- a/source/common/aarch64/ssd-a-sve.S
+++ b/source/common/aarch64/ssd-a-sve.S
@@ -213,6 +213,133 @@ function PFX(pixel_sse_pp_64x64_sve)
     ret
 endfunc
 
+// Sum of squares of a 4x4 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 16 squared values
+function PFX(pixel_ssd_s_4x4_sve)
+    movi            v0.4s, #0                   // accumulator = 0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+    ldr             d16, [x0]                   // row 0: 4 x int16 in the low 64 bits
+    ldr             d17, [x0, x1]               // row 1
+    sdot            z0.d, z16.h, z16.h          // acc += row . row (64-bit lanes)
+    sdot            z0.d, z17.h, z17.h
+    add             x0, x0, x1, lsl #1          // advance two rows
+    ldr             d16, [x0]                   // row 2
+    ldr             d17, [x0, x1]               // row 3
+    sdot            z0.d, z16.h, z16.h
+    sdot            z0.d, z17.h, z17.h
+
+    fmov            x0, d0                      // whole 64-bit sum, as the larger sizes do
+    ret
+endfunc
+
+// Sum of squares of an 8x8 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 64 squared values
+function PFX(pixel_ssd_s_8x8_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+.rept 4
+    ld1             {v16.8h}, [x0], x1          // even row, then step to the next row
+    sdot            z0.d, z16.h, z16.h
+    ld1             {v17.8h}, [x0], x1          // odd row, separate accumulator
+    sdot            z1.d, z17.h, z17.h
+.endr
+
+    add             v0.2d, v0.2d, v1.2d         // merge the two accumulators
+    addp            d0, v0.2d                   // add the two 64-bit lanes
+    fmov            x0, d0                      // whole 64-bit sum, as the larger sizes do
+    ret
+endfunc
+
+// Sum of squares of a 16x16 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 256 squared values
+function PFX(pixel_ssd_s_16x16_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+    mov             w12, #16                    // rows remaining
+.Loop_ssd_s_16:
+    sub             w12, w12, #1
+
+    ld1             {v16.8h-v17.8h}, [x0], x1   // one row, then step to the next row
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    cbnz            w12, .Loop_ssd_s_16
+
+    add             v0.2d, v0.2d, v1.2d         // merge the two accumulators
+    addp            d0, v0.2d                   // add the two 64-bit lanes
+    fmov            x0, d0
+    ret
+endfunc
+
+// Sum of squares of a 32x32 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 1024 squared values
+function PFX(pixel_ssd_s_32x32_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+    mov             w12, #32                    // rows remaining
+.Loop_ssd_s_32:
+    sub             w12, w12, #1
+
+    ldp             q16, q17, [x0]              // columns 0-15
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #32]         // columns 16-31
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+
+    add             x0, x0, x1                  // next row
+    cbnz            w12, .Loop_ssd_s_32
+
+    add             v0.2d, v0.2d, v1.2d         // merge the two accumulators
+    addp            d0, v0.2d                   // add the two 64-bit lanes
+    fmov            x0, d0
+    ret
+endfunc
+
+// Sum of squares of a 64x64 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 4096 squared values
+function PFX(pixel_ssd_s_64x64_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+    mov             w12, #64                    // rows remaining
+.Loop_ssd_s_64:
+    sub             w12, w12, #1
+
+    ldp             q16, q17, [x0]              // columns 0-15
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #32]         // columns 16-31
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #64]         // columns 32-47
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #96]         // columns 48-63
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+
+    add             x0, x0, x1                  // next row
+    cbnz            w12, .Loop_ssd_s_64
+
+    add             v0.2d, v0.2d, v1.2d         // merge the two accumulators
+    addp            d0, v0.2d                   // add the two 64-bit lanes
+    fmov            x0, d0
+    ret
+endfunc
+
 #endif // HIGH_BIT_DEPTH
 
 .macro SSE_SS_4x2
-- 
2.39.5 (Apple Git-154)

>From 61fb770f867aa547b41ad0adafa82e3135e19017 Mon Sep 17 00:00:00 2001
Message-Id: <61fb770f867aa547b41ad0adafa82e3135e19017.1733846134.git.gerdazsejke.m...@arm.com>
In-Reply-To: <cover.1733846134.git.gerdazsejke.m...@arm.com>
References: <cover.1733846134.git.gerdazsejke.m...@arm.com>
From: Gerda Zsejke More <gerdazsejke.m...@arm.com>
Date: Sat, 7 Dec 2024 13:05:13 +0100
Subject: [PATCH 09/11] AArch64: Add SVE asm implementation of HBD SSD_S

Add an SVE asm implementation of high bitdepth SSD_S functions for
all block sizes. This implementation is 42-45% faster on Neoverse
platforms compared to the existing Neon asm implementation.

Change-Id: Ibedb5fa7f30c88523fb0388ccaf24a8f3ae87a06
---
 source/common/aarch64/asm-primitives.cpp |  14 +++
 source/common/aarch64/ssd-a-sve.S        | 127 +++++++++++++++++++++++
 2 files changed, 141 insertions(+)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index a9076509c..f88fdc000 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -879,6 +879,20 @@ void setupSvePrimitives(EncoderPrimitives &p)
     p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_sse_pp_8x16_sve);
     p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_sve);
     p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_sve);
+
+    // ssd_s
+    p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+    p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+    p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+    p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+    p.cu[BLOCK_64x64].ssd_s[ALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
+    p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_sve);
+    p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_sve);
+    p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_sve);
+    p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_sve);
+    p.cu[BLOCK_64x64].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_64x64_sve);
+
 #endif // !HIGH_BIT_DEPTH
 }
 #endif // defined(HAVE_SVE2) || defined(HAVE_SVE)
diff --git a/source/common/aarch64/ssd-a-sve.S b/source/common/aarch64/ssd-a-sve.S
index c1f745947..dbb750e17 100644
--- a/source/common/aarch64/ssd-a-sve.S
+++ b/source/common/aarch64/ssd-a-sve.S
@@ -213,6 +213,133 @@ function PFX(pixel_sse_pp_64x64_sve)
     ret
 endfunc
 
+// Sum of squares of a 4x4 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 16 squared values
+function PFX(pixel_ssd_s_4x4_sve)
+    movi            v0.4s, #0                   // accumulator = 0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+    ldr             d16, [x0]                   // row 0: 4 x int16 in the low 64 bits
+    ldr             d17, [x0, x1]               // row 1
+    sdot            z0.d, z16.h, z16.h          // acc += row . row (64-bit lanes)
+    sdot            z0.d, z17.h, z17.h
+    add             x0, x0, x1, lsl #1          // advance two rows
+    ldr             d16, [x0]                   // row 2
+    ldr             d17, [x0, x1]               // row 3
+    sdot            z0.d, z16.h, z16.h
+    sdot            z0.d, z17.h, z17.h
+
+    fmov            x0, d0                      // whole 64-bit sum, as the larger sizes do
+    ret
+endfunc
+
+// Sum of squares of an 8x8 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 64 squared values
+function PFX(pixel_ssd_s_8x8_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+.rept 4
+    ld1             {v16.8h}, [x0], x1          // even row, then step to the next row
+    sdot            z0.d, z16.h, z16.h
+    ld1             {v17.8h}, [x0], x1          // odd row, separate accumulator
+    sdot            z1.d, z17.h, z17.h
+.endr
+
+    add             v0.2d, v0.2d, v1.2d         // merge the two accumulators
+    addp            d0, v0.2d                   // add the two 64-bit lanes
+    fmov            x0, d0                      // whole 64-bit sum, as the larger sizes do
+    ret
+endfunc
+
+// Sum of squares of a 16x16 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 256 squared values
+function PFX(pixel_ssd_s_16x16_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+    mov             w12, #16                    // rows remaining
+.Loop_ssd_s_16:
+    sub             w12, w12, #1
+
+    ld1             {v16.8h-v17.8h}, [x0], x1   // one row, then step to the next row
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    cbnz            w12, .Loop_ssd_s_16
+
+    add             v0.2d, v0.2d, v1.2d         // merge the two accumulators
+    addp            d0, v0.2d                   // add the two 64-bit lanes
+    fmov            x0, d0
+    ret
+endfunc
+
+// Sum of squares of a 32x32 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 1024 squared values
+function PFX(pixel_ssd_s_32x32_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+    mov             w12, #32                    // rows remaining
+.Loop_ssd_s_32:
+    sub             w12, w12, #1
+
+    ldp             q16, q17, [x0]              // columns 0-15
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #32]         // columns 16-31
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+
+    add             x0, x0, x1                  // next row
+    cbnz            w12, .Loop_ssd_s_32
+
+    add             v0.2d, v0.2d, v1.2d         // merge the two accumulators
+    addp            d0, v0.2d                   // add the two 64-bit lanes
+    fmov            x0, d0
+    ret
+endfunc
+
+// Sum of squares of a 64x64 block of signed 16-bit values.
+//   x0 = address of the first row, x1 = row stride in 16-bit elements
+//   x0 (return) = sum of the 4096 squared values
+function PFX(pixel_ssd_s_64x64_sve)
+    movi            v0.4s, #0
+    movi            v1.4s, #0
+    add             x1, x1, x1                  // stride: elements -> bytes
+
+    mov             w12, #64                    // rows remaining
+.Loop_ssd_s_64:
+    sub             w12, w12, #1
+
+    ldp             q16, q17, [x0]              // columns 0-15
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #32]         // columns 16-31
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #64]         // columns 32-47
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+    ldp             q16, q17, [x0, #96]         // columns 48-63
+    sdot            z0.d, z16.h, z16.h
+    sdot            z1.d, z17.h, z17.h
+
+    add             x0, x0, x1                  // next row
+    cbnz            w12, .Loop_ssd_s_64
+
+    add             v0.2d, v0.2d, v1.2d         // merge the two accumulators
+    addp            d0, v0.2d                   // add the two 64-bit lanes
+    fmov            x0, d0
+    ret
+endfunc
+
 #endif // HIGH_BIT_DEPTH
 
 .macro SSE_SS_4x2
-- 
2.39.5 (Apple Git-154)
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel