Add SVE intrinsics implementation for high bit-depth pixel_var functions making use of the 16-bit dot product instruction.
This implementation is 1.0x-1.7x faster than the existing Armv8.0 Neon implementation depending on the block sizes. --- source/common/CMakeLists.txt | 2 +- source/common/aarch64/asm-primitives.cpp | 1 + source/common/aarch64/neon-sve-bridge.h | 7 ++ source/common/aarch64/pixel-prim-sve.cpp | 137 +++++++++++++++++++++++ source/common/aarch64/pixel-prim.h | 3 + 5 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 source/common/aarch64/pixel-prim-sve.cpp diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 14a837429..fdb15e756 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -107,7 +107,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h) set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp pixel-prim-neon-dotprod.cpp) set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp) - set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp) + set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp pixel-prim-sve.cpp) set(C_SRCS_SVE2 sao-prim-sve2.cpp) enable_language(ASM) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index e3f8788dd..49d980616 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -791,6 +791,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask) setupFilterPrimitives_sve(p); setupSaoPrimitives_sve(p); setupDCTPrimitives_sve(p); + setupPixelPrimitives_sve(p); } #endif #if defined(HAVE_SVE2) && HAVE_SVE_BRIDGE diff --git a/source/common/aarch64/neon-sve-bridge.h b/source/common/aarch64/neon-sve-bridge.h index 48f89ea6e..6b450474a 100644 --- a/source/common/aarch64/neon-sve-bridge.h +++ 
b/source/common/aarch64/neon-sve-bridge.h @@ -58,6 +58,13 @@ static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) svset_neonq_s16(svundef_s16(), s0), \ svset_neonq_s16(svundef_s16(), f), lane)) +static inline uint64x2_t x265_udotq_u16(uint64x2_t acc, uint16x8_t x, uint16x8_t y) +{ + return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), + svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), y))); +} + static inline uint16x8_t x265_tblq_u16(uint16x8_t x, uint16x8_t idx) { return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), x), diff --git a/source/common/aarch64/pixel-prim-sve.cpp b/source/common/aarch64/pixel-prim-sve.cpp new file mode 100644 index 000000000..5740b2083 --- /dev/null +++ b/source/common/aarch64/pixel-prim-sve.cpp @@ -0,0 +1,137 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Li Zhang <li.zha...@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "pixel-prim.h" +#include "mem-neon.h" +#include "neon-sve-bridge.h" + +#include <arm_neon.h> + +namespace +{ +#if HIGH_BIT_DEPTH +template<int size> +uint64_t pixel_var_sve(const uint16_t *pix, intptr_t i_stride) +{ + if (size > 16) + { + uint64x2_t sum[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + uint64x2_t sqr[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + for (int h = 0; h < size; ++h) + { + for (int w = 0; w + 16 <= size; w += 16) + { + uint16x8_t s[2]; + load_u16x8xn<2>(pix + w, 8, s); + + sum[0] = x265_udotq_u16(sum[0], s[0], vdupq_n_u16(1)); + sum[1] = x265_udotq_u16(sum[1], s[1], vdupq_n_u16(1)); + + sqr[0] = x265_udotq_u16(sqr[0], s[0], s[0]); + sqr[1] = x265_udotq_u16(sqr[1], s[1], s[1]); + } + + pix += i_stride; + } + + sum[0] = vaddq_u64(sum[0], sum[1]); + sqr[0] = vaddq_u64(sqr[0], sqr[1]); + + return vaddvq_u64(sum[0]) + (vaddvq_u64(sqr[0]) << 32); + } + if (size == 16) + { + uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + uint64x2_t sqr[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + for (int h = 0; h < size; ++h) + { + uint16x8_t s[2]; + load_u16x8xn<2>(pix, 8, s); + + sum[0] = vaddq_u16(sum[0], s[0]); + sum[1] = vaddq_u16(sum[1], s[1]); + + sqr[0] = x265_udotq_u16(sqr[0], s[0], s[0]); + sqr[1] = x265_udotq_u16(sqr[1], s[1], s[1]); + + pix += i_stride; + } + + uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sqr[0] = vaddq_u64(sqr[0], sqr[1]); + + return vaddvq_u32(sum_u32) + (vaddvq_u64(sqr[0]) << 32); + } + if (size == 8) + { + uint16x8_t sum = vdupq_n_u16(0); + uint64x2_t sqr = vdupq_n_u64(0); + + for (int h = 0; h < size; ++h) + { + uint16x8_t s = vld1q_u16(pix); + + sum = vaddq_u16(sum, s); + sqr = x265_udotq_u16(sqr, s, s); + + pix += i_stride; + } + + return vaddlvq_u16(sum) + (vaddvq_u64(sqr) << 32); + } + if (size == 4) { + uint16x4_t sum = vdup_n_u16(0); + uint32x4_t sqr = vdupq_n_u32(0); + + for (int h = 0; 
h < size; ++h) + { + uint16x4_t s = vld1_u16(pix); + + sum = vadd_u16(sum, s); + sqr = vmlal_u16(sqr, s, s); + + pix += i_stride; + } + + return vaddv_u16(sum) + (vaddlvq_u32(sqr) << 32); + } +} +#endif // HIGH_BIT_DEPTH +} + +namespace X265_NS +{ +void setupPixelPrimitives_sve(EncoderPrimitives &p) +{ +#if HIGH_BIT_DEPTH + p.cu[BLOCK_4x4].var = pixel_var_sve<4>; + p.cu[BLOCK_8x8].var = pixel_var_sve<8>; + p.cu[BLOCK_16x16].var = pixel_var_sve<16>; + p.cu[BLOCK_32x32].var = pixel_var_sve<32>; + p.cu[BLOCK_64x64].var = pixel_var_sve<64>; +#endif // HIGH_BIT_DEPTH +} +} diff --git a/source/common/aarch64/pixel-prim.h b/source/common/aarch64/pixel-prim.h index 74271b10c..dac00995e 100644 --- a/source/common/aarch64/pixel-prim.h +++ b/source/common/aarch64/pixel-prim.h @@ -19,6 +19,9 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p); void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p); #endif +#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE +void setupPixelPrimitives_sve(EncoderPrimitives &p); +#endif } -- 2.39.5 (Apple Git-154)
>From 8a7b4f427135b84a0e7139728bd5dbf2bd877821 Mon Sep 17 00:00:00 2001 Message-Id: <8a7b4f427135b84a0e7139728bd5dbf2bd877821.1750183023.git.li.zha...@arm.com> In-Reply-To: <cover.1750183023.git.li.zha...@arm.com> References: <cover.1750183023.git.li.zha...@arm.com> From: Li Zhang <li.zha...@arm.com> Date: Mon, 16 Jun 2025 17:23:00 +0200 Subject: [PATCH 4/4] AArch64: Add HBD pixel_var SVE intrinsics implementations Add SVE intrinsics implementation for standard bit-depth pixel_var functions making use of the 16-bit dot product instruction. This implementation is 1.0x-1.7x faster than the existing Armv8.0 Neon implementation depending on the block sizes. --- source/common/CMakeLists.txt | 2 +- source/common/aarch64/asm-primitives.cpp | 1 + source/common/aarch64/neon-sve-bridge.h | 7 ++ source/common/aarch64/pixel-prim-sve.cpp | 137 +++++++++++++++++++++++ source/common/aarch64/pixel-prim.h | 3 + 5 files changed, 149 insertions(+), 1 deletion(-) create mode 100644 source/common/aarch64/pixel-prim-sve.cpp diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 14a837429..fdb15e756 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -107,7 +107,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h) set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp pixel-prim-neon-dotprod.cpp) set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp) - set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp) + set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp pixel-prim-sve.cpp) set(C_SRCS_SVE2 sao-prim-sve2.cpp) enable_language(ASM) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index 
e3f8788dd..49d980616 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -791,6 +791,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask) setupFilterPrimitives_sve(p); setupSaoPrimitives_sve(p); setupDCTPrimitives_sve(p); + setupPixelPrimitives_sve(p); } #endif #if defined(HAVE_SVE2) && HAVE_SVE_BRIDGE diff --git a/source/common/aarch64/neon-sve-bridge.h b/source/common/aarch64/neon-sve-bridge.h index 48f89ea6e..6b450474a 100644 --- a/source/common/aarch64/neon-sve-bridge.h +++ b/source/common/aarch64/neon-sve-bridge.h @@ -58,6 +58,13 @@ static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) svset_neonq_s16(svundef_s16(), s0), \ svset_neonq_s16(svundef_s16(), f), lane)) +static inline uint64x2_t x265_udotq_u16(uint64x2_t acc, uint16x8_t x, uint16x8_t y) +{ + return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), + svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), y))); +} + static inline uint16x8_t x265_tblq_u16(uint16x8_t x, uint16x8_t idx) { return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), x), diff --git a/source/common/aarch64/pixel-prim-sve.cpp b/source/common/aarch64/pixel-prim-sve.cpp new file mode 100644 index 000000000..5740b2083 --- /dev/null +++ b/source/common/aarch64/pixel-prim-sve.cpp @@ -0,0 +1,137 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Li Zhang <li.zha...@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#include "pixel-prim.h" +#include "mem-neon.h" +#include "neon-sve-bridge.h" + +#include <arm_neon.h> + +namespace +{ +#if HIGH_BIT_DEPTH +template<int size> +uint64_t pixel_var_sve(const uint16_t *pix, intptr_t i_stride) +{ + if (size > 16) + { + uint64x2_t sum[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + uint64x2_t sqr[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + for (int h = 0; h < size; ++h) + { + for (int w = 0; w + 16 <= size; w += 16) + { + uint16x8_t s[2]; + load_u16x8xn<2>(pix + w, 8, s); + + sum[0] = x265_udotq_u16(sum[0], s[0], vdupq_n_u16(1)); + sum[1] = x265_udotq_u16(sum[1], s[1], vdupq_n_u16(1)); + + sqr[0] = x265_udotq_u16(sqr[0], s[0], s[0]); + sqr[1] = x265_udotq_u16(sqr[1], s[1], s[1]); + } + + pix += i_stride; + } + + sum[0] = vaddq_u64(sum[0], sum[1]); + sqr[0] = vaddq_u64(sqr[0], sqr[1]); + + return vaddvq_u64(sum[0]) + (vaddvq_u64(sqr[0]) << 32); + } + if (size == 16) + { + uint16x8_t sum[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + uint64x2_t sqr[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + for (int h = 0; h < size; ++h) + { + uint16x8_t s[2]; + load_u16x8xn<2>(pix, 8, s); + + sum[0] = vaddq_u16(sum[0], s[0]); + sum[1] = vaddq_u16(sum[1], s[1]); + + sqr[0] = x265_udotq_u16(sqr[0], s[0], s[0]); + sqr[1] = x265_udotq_u16(sqr[1], s[1], s[1]); + + pix += i_stride; + } + 
+ uint32x4_t sum_u32 = vpaddlq_u16(sum[0]); + sum_u32 = vpadalq_u16(sum_u32, sum[1]); + sqr[0] = vaddq_u64(sqr[0], sqr[1]); + + return vaddvq_u32(sum_u32) + (vaddvq_u64(sqr[0]) << 32); + } + if (size == 8) + { + uint16x8_t sum = vdupq_n_u16(0); + uint64x2_t sqr = vdupq_n_u64(0); + + for (int h = 0; h < size; ++h) + { + uint16x8_t s = vld1q_u16(pix); + + sum = vaddq_u16(sum, s); + sqr = x265_udotq_u16(sqr, s, s); + + pix += i_stride; + } + + return vaddlvq_u16(sum) + (vaddvq_u64(sqr) << 32); + } + if (size == 4) { + uint16x4_t sum = vdup_n_u16(0); + uint32x4_t sqr = vdupq_n_u32(0); + + for (int h = 0; h < size; ++h) + { + uint16x4_t s = vld1_u16(pix); + + sum = vadd_u16(sum, s); + sqr = vmlal_u16(sqr, s, s); + + pix += i_stride; + } + + return vaddv_u16(sum) + (vaddlvq_u32(sqr) << 32); + } +} +#endif // HIGH_BIT_DEPTH +} + +namespace X265_NS +{ +void setupPixelPrimitives_sve(EncoderPrimitives &p) +{ +#if HIGH_BIT_DEPTH + p.cu[BLOCK_4x4].var = pixel_var_sve<4>; + p.cu[BLOCK_8x8].var = pixel_var_sve<8>; + p.cu[BLOCK_16x16].var = pixel_var_sve<16>; + p.cu[BLOCK_32x32].var = pixel_var_sve<32>; + p.cu[BLOCK_64x64].var = pixel_var_sve<64>; +#endif // HIGH_BIT_DEPTH +} +} diff --git a/source/common/aarch64/pixel-prim.h b/source/common/aarch64/pixel-prim.h index 74271b10c..dac00995e 100644 --- a/source/common/aarch64/pixel-prim.h +++ b/source/common/aarch64/pixel-prim.h @@ -19,6 +19,9 @@ void setupPixelPrimitives_neon(EncoderPrimitives &p); void setupPixelPrimitives_neon_dotprod(EncoderPrimitives &p); #endif +#if defined(HAVE_SVE) && HAVE_SVE_BRIDGE +void setupPixelPrimitives_sve(EncoderPrimitives &p); +#endif } -- 2.39.5 (Apple Git-154)
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel