Add SVE implementation of HBD interp_horiz_pp for LUMA filtering. An implementation was added for block sizes with width equal to 4 for both the 10-bit and 12-bit builds, but for bigger block sizes the SVE implementation was only enabled for the 12-bit build.
This implementation gives up to 9% uplift compared to the existing Neon implementation. --- source/common/CMakeLists.txt | 2 +- source/common/aarch64/asm-primitives.cpp | 2 + source/common/aarch64/filter-prim-sve.cpp | 317 ++++++++++++++++++++++ source/common/aarch64/filter-prim-sve.h | 37 +++ source/common/aarch64/neon-sve-bridge.h | 12 + 5 files changed, 369 insertions(+), 1 deletion(-) create mode 100644 source/common/aarch64/filter-prim-sve.cpp create mode 100644 source/common/aarch64/filter-prim-sve.h diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 744fc21de..a6f56c8c8 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -107,7 +107,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h) set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp) set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp) - set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp) + set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp) set(C_SRCS_SVE2 sao-prim-sve2.cpp) enable_language(ASM) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index c1317eb74..b295d419b 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -380,6 +380,7 @@ extern "C" { #include "pixel-prim.h" #include "filter-prim.h" +#include "filter-prim-sve.h" #include "dct-prim.h" #include "loopfilter-prim.h" #include "intrapred-prim.h" @@ -1075,6 +1076,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask) #if defined(HAVE_SVE) && HAVE_SVE_BRIDGE if (cpuMask & X265_CPU_SVE) { + setupFilterPrimitives_sve(p); setupSaoPrimitives_sve(p); setupDCTPrimitives_sve(p); } diff --git 
a/source/common/aarch64/filter-prim-sve.cpp b/source/common/aarch64/filter-prim-sve.cpp new file mode 100644 index 000000000..267619af0 --- /dev/null +++ b/source/common/aarch64/filter-prim-sve.cpp @@ -0,0 +1,317 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Gerda Zsejke More <gerdazsejke.m...@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "filter-prim-sve.h" +#include "mem-neon.h" +#include "neon-sve-bridge.h" + +#include <arm_neon.h> + +#if HIGH_BIT_DEPTH +static const uint16_t dotprod_h_permute_tbl[32] = { + // clang-format off + 0, 1, 2, 3, 1, 2, 3, 4, + 2, 3, 4, 5, 3, 4, 5, 6, + 3, 2, 1, 0, 4, 3, 2, 1, + 5, 4, 3, 2, 6, 5, 4, 3, + // clang-format on +}; + +template<bool coeff2> +void inline filter8_u16x4(const uint16x8_t *s, uint16x4_t &d, int16x8_t filter, + uint16x4_t maxVal) +{ + if (coeff2) + { + int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1])); + int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3])); + + int64x2_t sum_lo = x265_sdotq_lane_s16(vdupq_n_s64(0), sum01, filter, 0); + int64x2_t sum_hi = x265_sdotq_lane_s16(vdupq_n_s64(0), sum23, filter, 0); + + int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi)); + + d = vqrshrun_n_s32(sum, IF_FILTER_PREC); + d = vmin_u16(d, maxVal); + } + else + { + int64x2_t sum_lo = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[0]), filter, 0); + int64x2_t sum_hi = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[2]), filter, 0); + + sum_lo = x265_sdotq_lane_s16(sum_lo, vreinterpretq_s16_u16(s[1]), filter, 1); + sum_hi = x265_sdotq_lane_s16(sum_hi, vreinterpretq_s16_u16(s[3]), filter, 1); + + int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi)); + + d = vqrshrun_n_s32(sum, IF_FILTER_PREC); + d = vmin_u16(d, maxVal); + } +} + +template<bool coeff2> +void inline filter8_u16x8(uint16x8_t *s, uint16x8_t &d, int16x8_t filter, + uint16x8_t maxVal) +{ + if (coeff2) + { + int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1])); + int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3])); + int16x8_t sum45 = vreinterpretq_s16_u16(vaddq_u16(s[4], s[5])); + int16x8_t sum67 = vreinterpretq_s16_u16(vaddq_u16(s[6], s[7])); + + int64x2_t sum0 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum01, 
filter, 0); + int64x2_t sum1 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum23, filter, 0); + int64x2_t sum2 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum45, filter, 0); + int64x2_t sum3 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum67, filter, 0); + + int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); + int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum2), vmovn_s64(sum3)); + + uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC); + uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC); + + d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal); + } + else + { + int64x2_t sum0 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[0]), filter, 0); + int64x2_t sum1 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[1]), filter, 0); + int64x2_t sum2 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[2]), filter, 0); + int64x2_t sum3 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[3]), filter, 0); + + sum0 = x265_sdotq_lane_s16(sum0, vreinterpretq_s16_u16(s[4]), filter, 1); + sum1 = x265_sdotq_lane_s16(sum1, vreinterpretq_s16_u16(s[5]), filter, 1); + sum2 = x265_sdotq_lane_s16(sum2, vreinterpretq_s16_u16(s[6]), filter, 1); + sum3 = x265_sdotq_lane_s16(sum3, vreinterpretq_s16_u16(s[7]), filter, 1); + + int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum2)); + int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum1), vmovn_s64(sum3)); + + uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC); + uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC); + + d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal); + } +} + +template<bool coeff2> +void inline setup_s_hpp_x4(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t *idx) +{ + if (coeff2) + { + d[0] = x265_tblq_u16(s0, idx[0]); + d[1] = x265_tblq_u16(s1, idx[2]); + d[2] = x265_tblq_u16(s0, idx[1]); + d[3] = x265_tblq_u16(s1, idx[3]); + } + else + { + d[0] = x265_tblq_u16(s0, idx[0]); + d[1] = x265_tblq_u16(s1, idx[0]); + d[2] = x265_tblq_u16(s0, idx[1]); + d[3] = 
x265_tblq_u16(s1, idx[1]); + } +} + +template<bool coeff2> +void inline setup_s_hpp_x8(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t s2, + uint16x8_t *idx) +{ + if (coeff2) + { + d[0] = x265_tblq_u16(s0, idx[0]); + d[1] = x265_tblq_u16(s1, idx[2]); + d[2] = x265_tblq_u16(s0, idx[1]); + d[3] = x265_tblq_u16(s1, idx[3]); + d[4] = x265_tblq_u16(s1, idx[0]); + d[5] = x265_tblq_u16(s2, idx[2]); + d[6] = x265_tblq_u16(s1, idx[1]); + d[7] = x265_tblq_u16(s2, idx[3]); + } + else + { + d[0] = x265_tblq_u16(s0, idx[0]); + d[1] = x265_tblq_u16(s1, idx[0]); + d[2] = x265_tblq_u16(s0, idx[1]); + d[3] = x265_tblq_u16(s1, idx[1]); + d[4] = d[1]; + d[5] = x265_tblq_u16(s2, idx[0]); + d[6] = d[3]; + d[7] = x265_tblq_u16(s2, idx[1]); + } +} + +template<bool coeff2, int width, int height> +void inline interp8_hpp_sve(const pixel *src, intptr_t srcStride, + pixel *dst, intptr_t dstStride, int coeffIdx) +{ + const int N_TAPS = 8; + const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1); + const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdx]); + uint16x8_t idx[4]; + + idx[0] = vld1q_u16(dotprod_h_permute_tbl + 0); + idx[1] = vld1q_u16(dotprod_h_permute_tbl + 8); + if (coeff2) + { + idx[2] = vld1q_u16(dotprod_h_permute_tbl + 16); + idx[3] = vld1q_u16(dotprod_h_permute_tbl + 24); + } + + src -= N_TAPS / 2 - 1; + + for (int row = 0; row < height; row++) + { + if (width % 16 == 0 || width == 24) + { + int col = 0; + for (; col <= width - 16; col += 16) + { + uint16x8_t s[5]; + load_u16x8xn<5>(src + col, 4, s); + + uint16x8_t s0[N_TAPS], s1[N_TAPS]; + setup_s_hpp_x8<coeff2>(s0, s[0], s[1], s[2], idx); + setup_s_hpp_x8<coeff2>(s1, s[2], s[3], s[4], idx); + + uint16x8_t d0, d1; + filter8_u16x8<coeff2>(s0, d0, filter, maxVal); + filter8_u16x8<coeff2>(s1, d1, filter, maxVal); + + vst1q_u16(dst + col + 0, d0); + vst1q_u16(dst + col + 8, d1); + } + + if (width == 24) + { + uint16x8_t s[3]; + load_u16x8xn<3>(src + col, 4, s); + + uint16x8_t s0[N_TAPS]; + 
setup_s_hpp_x8<coeff2>(s0, s[0], s[1], s[2], idx); + + uint16x8_t d0; + filter8_u16x8<coeff2>(s0, d0, filter, maxVal); + + vst1q_u16(dst + col, d0); + } + } + else if (width == 4) + { + uint16x8_t s[2]; + load_u16x8xn<2>(src, 4, s); + + uint16x8_t s0[N_TAPS]; + setup_s_hpp_x4<coeff2>(s0, s[0], s[1], idx); + + uint16x4_t d0; + filter8_u16x4<coeff2>(s0, d0, filter, vget_low_u16(maxVal)); + + vst1_u16(dst, d0); + } + + src += srcStride; + dst += dstStride; + } +} + +namespace X265_NS { +// Declaration for use in interp8_horiz_pp_sve(). +template<int N, int width, int height> +void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, + intptr_t dstStride, int coeffIdx); + +template<int width, int height> +void interp8_horiz_pp_sve(const pixel *src, intptr_t srcStride, pixel *dst, + intptr_t dstStride, int coeffIdx) +{ + switch (coeffIdx) + { + case 1: + if (width <= 16) + { + return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } + else + { + return interp8_hpp_sve<false, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } + case 2: + if (width > 4) + { + return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } + else + { + return interp8_hpp_sve<true, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } + case 3: + return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } +} + +void setupFilterPrimitives_sve(EncoderPrimitives &p) +{ + p.pu[LUMA_4x4].luma_hpp = interp8_horiz_pp_sve<4, 4>; + p.pu[LUMA_4x8].luma_hpp = interp8_horiz_pp_sve<4, 8>; + p.pu[LUMA_4x16].luma_hpp = interp8_horiz_pp_sve<4, 16>; +#if X265_DEPTH == 12 + p.pu[LUMA_16x4].luma_hpp = interp8_horiz_pp_sve<16, 4>; + p.pu[LUMA_16x8].luma_hpp = interp8_horiz_pp_sve<16, 8>; + p.pu[LUMA_16x12].luma_hpp = interp8_horiz_pp_sve<16, 12>; + p.pu[LUMA_16x16].luma_hpp = interp8_horiz_pp_sve<16, 16>; + p.pu[LUMA_16x32].luma_hpp = interp8_horiz_pp_sve<16, 32>; + 
p.pu[LUMA_16x64].luma_hpp = interp8_horiz_pp_sve<16, 64>; + p.pu[LUMA_24x32].luma_hpp = interp8_horiz_pp_sve<24, 32>; + p.pu[LUMA_32x8].luma_hpp = interp8_horiz_pp_sve<32, 8>; + p.pu[LUMA_32x16].luma_hpp = interp8_horiz_pp_sve<32, 16>; + p.pu[LUMA_32x24].luma_hpp = interp8_horiz_pp_sve<32, 24>; + p.pu[LUMA_32x32].luma_hpp = interp8_horiz_pp_sve<32, 32>; + p.pu[LUMA_32x64].luma_hpp = interp8_horiz_pp_sve<32, 64>; + p.pu[LUMA_48x64].luma_hpp = interp8_horiz_pp_sve<48, 64>; + p.pu[LUMA_64x16].luma_hpp = interp8_horiz_pp_sve<64, 16>; + p.pu[LUMA_64x32].luma_hpp = interp8_horiz_pp_sve<64, 32>; + p.pu[LUMA_64x48].luma_hpp = interp8_horiz_pp_sve<64, 48>; + p.pu[LUMA_64x64].luma_hpp = interp8_horiz_pp_sve<64, 64>; +#endif // X265_DEPTH == 12 +} +} // namespace X265_NS +#else // !HIGH_BIT_DEPTH +namespace X265_NS { +void setupFilterPrimitives_sve(EncoderPrimitives &) +{ +} +} +#endif // HIGH_BIT_DEPTH diff --git a/source/common/aarch64/filter-prim-sve.h b/source/common/aarch64/filter-prim-sve.h new file mode 100644 index 000000000..382a1adbd --- /dev/null +++ b/source/common/aarch64/filter-prim-sve.h @@ -0,0 +1,37 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Gerda Zsejke More <gerdazsejke.m...@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_FILTER_PRIM_SVE_H +#define X265_FILTER_PRIM_SVE_H + +#if defined(HAVE_SVE) + +#include "primitives.h" + +namespace X265_NS { +void setupFilterPrimitives_sve(EncoderPrimitives &p); +} + +#endif // defined(HAVE_SVE) + +#endif // X265_FILTER_PRIM_SVE_H diff --git a/source/common/aarch64/neon-sve-bridge.h b/source/common/aarch64/neon-sve-bridge.h index dad5fa909..48f89ea6e 100644 --- a/source/common/aarch64/neon-sve-bridge.h +++ b/source/common/aarch64/neon-sve-bridge.h @@ -3,6 +3,7 @@ * * Authors: Hari Limaye <hari.lim...@arm.com> * Jonathan Wright <jonathan.wri...@arm.com> + * Gerda Zsejke More <gerdazsejke.m...@arm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -52,6 +53,17 @@ static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) svset_neonq_s16(svundef_s16(), y))); } +#define x265_sdotq_lane_s16(sum, s0, f, lane) \ + svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), sum), \ + svset_neonq_s16(svundef_s16(), s0), \ + svset_neonq_s16(svundef_s16(), f), lane)) + +static inline uint16x8_t x265_tblq_u16(uint16x8_t x, uint16x8_t idx) +{ + return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), idx))); +} + static inline int8x16_t x265_sve_mask(const int x, const int endX, const int8x16_t in) { -- 2.39.5 (Apple Git-154)
>From 5ca9958faf11e0e4cf6de7acf6001933878da884 Mon Sep 17 00:00:00 2001 Message-Id: <5ca9958faf11e0e4cf6de7acf6001933878da884.1745588006.git.gerdazsejke.m...@arm.com> In-Reply-To: <cover.1745588006.git.gerdazsejke.m...@arm.com> References: <cover.1745588006.git.gerdazsejke.m...@arm.com> From: Gerda Zsejke More <gerdazsejke.m...@arm.com> Date: Mon, 10 Mar 2025 13:22:27 +0100 Subject: [PATCH v3 1/4] AArch64: Add SVE implementation of HBD interp_horiz_pp Add SVE implementation of HBD interp_horiz_pp for LUMA filtering. An implementation was added for block sizes with width equal to 4 for both 10-bit and 12-bit build, but for bigger block sizes the SVE implementation was only enabled for 12-bit build. This implementation gives up to 9% uplift compared to the existing Neon implementation. --- source/common/CMakeLists.txt | 2 +- source/common/aarch64/asm-primitives.cpp | 2 + source/common/aarch64/filter-prim-sve.cpp | 317 ++++++++++++++++++++++ source/common/aarch64/filter-prim-sve.h | 37 +++ source/common/aarch64/neon-sve-bridge.h | 12 + 5 files changed, 369 insertions(+), 1 deletion(-) create mode 100644 source/common/aarch64/filter-prim-sve.cpp create mode 100644 source/common/aarch64/filter-prim-sve.h diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 744fc21de..a6f56c8c8 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -107,7 +107,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h) set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp) set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp) - set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp) + set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp filter-prim-sve.h filter-prim-sve.cpp) set(C_SRCS_SVE2 sao-prim-sve2.cpp) 
enable_language(ASM) diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp index c1317eb74..b295d419b 100644 --- a/source/common/aarch64/asm-primitives.cpp +++ b/source/common/aarch64/asm-primitives.cpp @@ -380,6 +380,7 @@ extern "C" { #include "pixel-prim.h" #include "filter-prim.h" +#include "filter-prim-sve.h" #include "dct-prim.h" #include "loopfilter-prim.h" #include "intrapred-prim.h" @@ -1075,6 +1076,7 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask) #if defined(HAVE_SVE) && HAVE_SVE_BRIDGE if (cpuMask & X265_CPU_SVE) { + setupFilterPrimitives_sve(p); setupSaoPrimitives_sve(p); setupDCTPrimitives_sve(p); } diff --git a/source/common/aarch64/filter-prim-sve.cpp b/source/common/aarch64/filter-prim-sve.cpp new file mode 100644 index 000000000..267619af0 --- /dev/null +++ b/source/common/aarch64/filter-prim-sve.cpp @@ -0,0 +1,317 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Gerda Zsejke More <gerdazsejke.m...@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#include "filter-prim-sve.h" +#include "mem-neon.h" +#include "neon-sve-bridge.h" + +#include <arm_neon.h> + +#if HIGH_BIT_DEPTH +static const uint16_t dotprod_h_permute_tbl[32] = { + // clang-format off + 0, 1, 2, 3, 1, 2, 3, 4, + 2, 3, 4, 5, 3, 4, 5, 6, + 3, 2, 1, 0, 4, 3, 2, 1, + 5, 4, 3, 2, 6, 5, 4, 3, + // clang-format on +}; + +template<bool coeff2> +void inline filter8_u16x4(const uint16x8_t *s, uint16x4_t &d, int16x8_t filter, + uint16x4_t maxVal) +{ + if (coeff2) + { + int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1])); + int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3])); + + int64x2_t sum_lo = x265_sdotq_lane_s16(vdupq_n_s64(0), sum01, filter, 0); + int64x2_t sum_hi = x265_sdotq_lane_s16(vdupq_n_s64(0), sum23, filter, 0); + + int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi)); + + d = vqrshrun_n_s32(sum, IF_FILTER_PREC); + d = vmin_u16(d, maxVal); + } + else + { + int64x2_t sum_lo = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[0]), filter, 0); + int64x2_t sum_hi = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[2]), filter, 0); + + sum_lo = x265_sdotq_lane_s16(sum_lo, vreinterpretq_s16_u16(s[1]), filter, 1); + sum_hi = x265_sdotq_lane_s16(sum_hi, vreinterpretq_s16_u16(s[3]), filter, 1); + + int32x4_t sum = vcombine_s32(vmovn_s64(sum_lo), vmovn_s64(sum_hi)); + + d = vqrshrun_n_s32(sum, IF_FILTER_PREC); + d = vmin_u16(d, maxVal); + } +} + +template<bool coeff2> +void inline filter8_u16x8(uint16x8_t *s, uint16x8_t &d, int16x8_t filter, + uint16x8_t maxVal) +{ + if (coeff2) + { + int16x8_t sum01 = vreinterpretq_s16_u16(vaddq_u16(s[0], s[1])); + int16x8_t sum23 = vreinterpretq_s16_u16(vaddq_u16(s[2], s[3])); + int16x8_t sum45 = vreinterpretq_s16_u16(vaddq_u16(s[4], s[5])); + int16x8_t sum67 = vreinterpretq_s16_u16(vaddq_u16(s[6], s[7])); + + int64x2_t sum0 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum01, 
filter, 0); + int64x2_t sum1 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum23, filter, 0); + int64x2_t sum2 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum45, filter, 0); + int64x2_t sum3 = x265_sdotq_lane_s16(vdupq_n_s64(0), sum67, filter, 0); + + int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum1)); + int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum2), vmovn_s64(sum3)); + + uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC); + uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC); + + d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal); + } + else + { + int64x2_t sum0 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[0]), filter, 0); + int64x2_t sum1 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[1]), filter, 0); + int64x2_t sum2 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[2]), filter, 0); + int64x2_t sum3 = + x265_sdotq_lane_s16(vdupq_n_s64(0), vreinterpretq_s16_u16(s[3]), filter, 0); + + sum0 = x265_sdotq_lane_s16(sum0, vreinterpretq_s16_u16(s[4]), filter, 1); + sum1 = x265_sdotq_lane_s16(sum1, vreinterpretq_s16_u16(s[5]), filter, 1); + sum2 = x265_sdotq_lane_s16(sum2, vreinterpretq_s16_u16(s[6]), filter, 1); + sum3 = x265_sdotq_lane_s16(sum3, vreinterpretq_s16_u16(s[7]), filter, 1); + + int32x4_t sum_lo = vcombine_s32(vmovn_s64(sum0), vmovn_s64(sum2)); + int32x4_t sum_hi = vcombine_s32(vmovn_s64(sum1), vmovn_s64(sum3)); + + uint16x4_t d_lo = vqrshrun_n_s32(sum_lo, IF_FILTER_PREC); + uint16x4_t d_hi = vqrshrun_n_s32(sum_hi, IF_FILTER_PREC); + + d = vminq_u16(vcombine_u16(d_lo, d_hi), maxVal); + } +} + +template<bool coeff2> +void inline setup_s_hpp_x4(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t *idx) +{ + if (coeff2) + { + d[0] = x265_tblq_u16(s0, idx[0]); + d[1] = x265_tblq_u16(s1, idx[2]); + d[2] = x265_tblq_u16(s0, idx[1]); + d[3] = x265_tblq_u16(s1, idx[3]); + } + else + { + d[0] = x265_tblq_u16(s0, idx[0]); + d[1] = x265_tblq_u16(s1, idx[0]); + d[2] = x265_tblq_u16(s0, idx[1]); + d[3] = 
x265_tblq_u16(s1, idx[1]); + } +} + +template<bool coeff2> +void inline setup_s_hpp_x8(uint16x8_t *d, uint16x8_t s0, uint16x8_t s1, uint16x8_t s2, + uint16x8_t *idx) +{ + if (coeff2) + { + d[0] = x265_tblq_u16(s0, idx[0]); + d[1] = x265_tblq_u16(s1, idx[2]); + d[2] = x265_tblq_u16(s0, idx[1]); + d[3] = x265_tblq_u16(s1, idx[3]); + d[4] = x265_tblq_u16(s1, idx[0]); + d[5] = x265_tblq_u16(s2, idx[2]); + d[6] = x265_tblq_u16(s1, idx[1]); + d[7] = x265_tblq_u16(s2, idx[3]); + } + else + { + d[0] = x265_tblq_u16(s0, idx[0]); + d[1] = x265_tblq_u16(s1, idx[0]); + d[2] = x265_tblq_u16(s0, idx[1]); + d[3] = x265_tblq_u16(s1, idx[1]); + d[4] = d[1]; + d[5] = x265_tblq_u16(s2, idx[0]); + d[6] = d[3]; + d[7] = x265_tblq_u16(s2, idx[1]); + } +} + +template<bool coeff2, int width, int height> +void inline interp8_hpp_sve(const pixel *src, intptr_t srcStride, + pixel *dst, intptr_t dstStride, int coeffIdx) +{ + const int N_TAPS = 8; + const uint16x8_t maxVal = vdupq_n_u16((1 << X265_DEPTH) - 1); + const int16x8_t filter = vld1q_s16(X265_NS::g_lumaFilter[coeffIdx]); + uint16x8_t idx[4]; + + idx[0] = vld1q_u16(dotprod_h_permute_tbl + 0); + idx[1] = vld1q_u16(dotprod_h_permute_tbl + 8); + if (coeff2) + { + idx[2] = vld1q_u16(dotprod_h_permute_tbl + 16); + idx[3] = vld1q_u16(dotprod_h_permute_tbl + 24); + } + + src -= N_TAPS / 2 - 1; + + for (int row = 0; row < height; row++) + { + if (width % 16 == 0 || width == 24) + { + int col = 0; + for (; col <= width - 16; col += 16) + { + uint16x8_t s[5]; + load_u16x8xn<5>(src + col, 4, s); + + uint16x8_t s0[N_TAPS], s1[N_TAPS]; + setup_s_hpp_x8<coeff2>(s0, s[0], s[1], s[2], idx); + setup_s_hpp_x8<coeff2>(s1, s[2], s[3], s[4], idx); + + uint16x8_t d0, d1; + filter8_u16x8<coeff2>(s0, d0, filter, maxVal); + filter8_u16x8<coeff2>(s1, d1, filter, maxVal); + + vst1q_u16(dst + col + 0, d0); + vst1q_u16(dst + col + 8, d1); + } + + if (width == 24) + { + uint16x8_t s[3]; + load_u16x8xn<3>(src + col, 4, s); + + uint16x8_t s0[N_TAPS]; + 
setup_s_hpp_x8<coeff2>(s0, s[0], s[1], s[2], idx); + + uint16x8_t d0; + filter8_u16x8<coeff2>(s0, d0, filter, maxVal); + + vst1q_u16(dst + col, d0); + } + } + else if (width == 4) + { + uint16x8_t s[2]; + load_u16x8xn<2>(src, 4, s); + + uint16x8_t s0[N_TAPS]; + setup_s_hpp_x4<coeff2>(s0, s[0], s[1], idx); + + uint16x4_t d0; + filter8_u16x4<coeff2>(s0, d0, filter, vget_low_u16(maxVal)); + + vst1_u16(dst, d0); + } + + src += srcStride; + dst += dstStride; + } +} + +namespace X265_NS { +// Declaration for use in interp8_horiz_pp_sve(). +template<int N, int width, int height> +void interp_horiz_pp_neon(const pixel *src, intptr_t srcStride, pixel *dst, + intptr_t dstStride, int coeffIdx); + +template<int width, int height> +void interp8_horiz_pp_sve(const pixel *src, intptr_t srcStride, pixel *dst, + intptr_t dstStride, int coeffIdx) +{ + switch (coeffIdx) + { + case 1: + if (width <= 16) + { + return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } + else + { + return interp8_hpp_sve<false, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } + case 2: + if (width > 4) + { + return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } + else + { + return interp8_hpp_sve<true, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } + case 3: + return interp_horiz_pp_neon<8, width, height>(src, srcStride, dst, + dstStride, coeffIdx); + } +} + +void setupFilterPrimitives_sve(EncoderPrimitives &p) +{ + p.pu[LUMA_4x4].luma_hpp = interp8_horiz_pp_sve<4, 4>; + p.pu[LUMA_4x8].luma_hpp = interp8_horiz_pp_sve<4, 8>; + p.pu[LUMA_4x16].luma_hpp = interp8_horiz_pp_sve<4, 16>; +#if X265_DEPTH == 12 + p.pu[LUMA_16x4].luma_hpp = interp8_horiz_pp_sve<16, 4>; + p.pu[LUMA_16x8].luma_hpp = interp8_horiz_pp_sve<16, 8>; + p.pu[LUMA_16x12].luma_hpp = interp8_horiz_pp_sve<16, 12>; + p.pu[LUMA_16x16].luma_hpp = interp8_horiz_pp_sve<16, 16>; + p.pu[LUMA_16x32].luma_hpp = interp8_horiz_pp_sve<16, 32>; + 
p.pu[LUMA_16x64].luma_hpp = interp8_horiz_pp_sve<16, 64>; + p.pu[LUMA_24x32].luma_hpp = interp8_horiz_pp_sve<24, 32>; + p.pu[LUMA_32x8].luma_hpp = interp8_horiz_pp_sve<32, 8>; + p.pu[LUMA_32x16].luma_hpp = interp8_horiz_pp_sve<32, 16>; + p.pu[LUMA_32x24].luma_hpp = interp8_horiz_pp_sve<32, 24>; + p.pu[LUMA_32x32].luma_hpp = interp8_horiz_pp_sve<32, 32>; + p.pu[LUMA_32x64].luma_hpp = interp8_horiz_pp_sve<32, 64>; + p.pu[LUMA_48x64].luma_hpp = interp8_horiz_pp_sve<48, 64>; + p.pu[LUMA_64x16].luma_hpp = interp8_horiz_pp_sve<64, 16>; + p.pu[LUMA_64x32].luma_hpp = interp8_horiz_pp_sve<64, 32>; + p.pu[LUMA_64x48].luma_hpp = interp8_horiz_pp_sve<64, 48>; + p.pu[LUMA_64x64].luma_hpp = interp8_horiz_pp_sve<64, 64>; +#endif // X265_DEPTH == 12 +} +} // namespace X265_NS +#else // !HIGH_BIT_DEPTH +namespace X265_NS { +void setupFilterPrimitives_sve(EncoderPrimitives &) +{ +} +} +#endif // HIGH_BIT_DEPTH diff --git a/source/common/aarch64/filter-prim-sve.h b/source/common/aarch64/filter-prim-sve.h new file mode 100644 index 000000000..382a1adbd --- /dev/null +++ b/source/common/aarch64/filter-prim-sve.h @@ -0,0 +1,37 @@ +/***************************************************************************** + * Copyright (C) 2025 MulticoreWare, Inc + * + * Authors: Gerda Zsejke More <gerdazsejke.m...@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. + *****************************************************************************/ + +#ifndef X265_FILTER_PRIM_SVE_H +#define X265_FILTER_PRIM_SVE_H + +#if defined(HAVE_SVE) + +#include "primitives.h" + +namespace X265_NS { +void setupFilterPrimitives_sve(EncoderPrimitives &p); +} + +#endif // defined(HAVE_SVE) + +#endif // X265_FILTER_PRIM_SVE_H diff --git a/source/common/aarch64/neon-sve-bridge.h b/source/common/aarch64/neon-sve-bridge.h index dad5fa909..48f89ea6e 100644 --- a/source/common/aarch64/neon-sve-bridge.h +++ b/source/common/aarch64/neon-sve-bridge.h @@ -3,6 +3,7 @@ * * Authors: Hari Limaye <hari.lim...@arm.com> * Jonathan Wright <jonathan.wri...@arm.com> + * Gerda Zsejke More <gerdazsejke.m...@arm.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -52,6 +53,17 @@ static inline int64x2_t x265_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) svset_neonq_s16(svundef_s16(), y))); } +#define x265_sdotq_lane_s16(sum, s0, f, lane) \ + svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), sum), \ + svset_neonq_s16(svundef_s16(), s0), \ + svset_neonq_s16(svundef_s16(), f), lane)) + +static inline uint16x8_t x265_tblq_u16(uint16x8_t x, uint16x8_t idx) +{ + return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), idx))); +} + static inline int8x16_t x265_sve_mask(const int x, const int endX, const int8x16_t in) { -- 2.39.5 (Apple Git-154)
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel