Add implementations of luma_hpp primitives using Neon I8MM instructions, which are mandatory from Armv8.6.
Luma filters 1 and 3 are actually 7-tap filters 0-padded to 8 taps. We can
use this fact to accelerate these cases using the Armv8.6 USMMLA matrix
multiply instructions - which do twice as much work as the equivalent USDOT
dot product instructions.

Geomean uplift across all block sizes for luma filters, relative to Armv8.4
Neon DotProd implementations:

  Neoverse N2: 1.481x
  Neoverse V1: 1.337x
  Neoverse V2: 1.399x
---
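Notes (not part of the commit message or the patch):

The USMMLA arrangement may be easier to review with a scalar model in hand.
The sketch below is illustrative only - kFilterRows and usmmla_model are not
names from the patch; the real code builds the same two filter rows in
matmul_luma_filter and the two sample rows via matmul_permute_tbl. Luma
filter 1 is shown; filter 3 works the same way with the permute offset by one.

    #include <cstdint>

    // USMMLA filter operand: the 8 taps in row 0, and the same taps shifted
    // by one in row 1. The shift is only safe because the 8th tap is zero
    // (a 7-tap filter padded to 8), so no coefficient falls off the end.
    static const int8_t kFilterRows[2][8] = {
        { -1, 4, -10, 58, 17, -5, 1, 0 },   // g_lumaFilter[1]
        { 0, -1, 4, -10, 58, 17, -5, 1 }    // g_lumaFilter[1] shifted by one
    };

    // USMMLA sample operand rows are src[i..i+7] and src[i+2..i+9]. The 2x2
    // result holds all four row-by-row dot products, i.e. filtered pixels
    // i, i+1, i+2 and i+3 (before rounding and narrowing).
    static void usmmla_model(const uint8_t *src, int i, int32_t out[4])
    {
        const uint8_t *sampleRows[2] = { src + i, src + i + 2 };
        for (int r = 0; r < 2; r++)         // sample matrix row
            for (int c = 0; c < 2; c++)     // filter matrix row
            {
                int32_t acc = 0;
                for (int k = 0; k < 8; k++)
                    acc += (int32_t)sampleRows[r][k] * (int32_t)kFilterRows[c][k];
                out[2 * r + c] = acc;
            }
    }

A single USMMLA therefore produces four 8-tap results (32 multiply-accumulates),
where a single USDOT produces four 4-tap partial sums (16 multiply-accumulates) -
the "twice as much work" referred to above.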
 source/common/CMakeLists.txt               |   7 +
 source/common/aarch64/asm-primitives.cpp   |   7 +
 source/common/aarch64/filter-neon-i8mm.cpp | 341 +++++++++++++++++++++
 source/common/aarch64/filter-neon-i8mm.h   |  37 +++
 4 files changed, 392 insertions(+)
 create mode 100644 source/common/aarch64/filter-neon-i8mm.cpp
 create mode 100644 source/common/aarch64/filter-neon-i8mm.h

diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
index f8167121e..4b7145132 100644
--- a/source/common/CMakeLists.txt
+++ b/source/common/CMakeLists.txt
@@ -105,6 +105,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
     set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h)
     set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp)
+    set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp)
     set(C_SRCS_SVE sao-prim-sve.cpp dct-prim-sve.cpp)
     set(C_SRCS_SVE2 sao-prim-sve2.cpp)
     enable_language(ASM)
@@ -124,6 +125,12 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
         endforeach()
+    if(CPU_HAS_NEON_I8MM)
+        foreach(SRC ${C_SRCS_NEON_I8MM})
+            set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
+        endforeach()
+    endif()
+
     if(CPU_HAS_NEON_DOTPROD)
         foreach(SRC ${C_SRCS_NEON_DOTPROD})
             set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
         endforeach()
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index e67901ca2..dd3c2a4ba 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -385,6 +385,7 @@ extern "C" {
 #include "intrapred-prim.h"
 #include "sao-prim.h"
 #include "filter-neon-dotprod.h"
+#include "filter-neon-i8mm.h"

 namespace X265_NS
 {
@@ -1046,6 +1047,12 @@ void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
         setupFilterPrimitives_neon_dotprod(p);
     }
 #endif
+#ifdef HAVE_NEON_I8MM
+    if (cpuMask & X265_CPU_NEON_I8MM)
+    {
+        setupFilterPrimitives_neon_i8mm(p);
+    }
+#endif
 #if defined(HAVE_SVE) && HAVE_SVE_BRIDGE
     if (cpuMask & X265_CPU_SVE)
     {
diff --git a/source/common/aarch64/filter-neon-i8mm.cpp b/source/common/aarch64/filter-neon-i8mm.cpp
new file mode 100644
index 000000000..c19592fa1
--- /dev/null
+++ b/source/common/aarch64/filter-neon-i8mm.cpp
@@ -0,0 +1,341 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.lim...@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#if defined(HAVE_NEON_I8MM)
+#include "filter-neon-i8mm.h"
+#if !HIGH_BIT_DEPTH
+
+#include "mem-neon.h"
+
+#include <arm_neon.h>
+
+namespace {
+static const uint8_t dotprod_permute_tbl[48] = {
+    0, 1, 2,  3, 1, 2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6,
+    4, 5, 6,  7, 5, 6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10,
+    8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
+};
+
+static const uint8_t matmul_permute_tbl[2][32] = {
+    // Permute for luma filter 1.
+    { 0, 1, 2, 3, 4, 5, 6,  7,  2, 3, 4, 5,  6,  7,  8,  9,
+      4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 },
+    // Permute for luma filter 3.
+    { 1, 2, 3, 4, 5, 6,  7,  8,  3, 4, 5,  6,  7,  8,  9, 10,
+      5, 6, 7, 8, 9, 10, 11, 12, 7, 8, 9, 10, 11, 12, 13, 14 }
+};
+
+static const int8_t matmul_luma_filter[2][16] = {
+    { -1, 4, -10, 58, 17, -5, 1, 0, 0, -1, 4, -10, 58, 17, -5, 1 },
+    { 1, -5, 17, 58, -10, 4, -1, 0, 0, 1, -5, 17, 58, -10, 4, -1 }
+};
+
+uint8x8_t inline filter8_8_pp(uint8x16_t samples, const int8x8_t filter,
+                              const uint8x16x3_t tbl)
+{
+    // Permute input samples for dot product.
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
+    // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
+    // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+    uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val[2]);
+
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
+
+    // Narrow and combine.
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
+                                     vmovn_s32(dotprod_hi));
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
+}
+
+void inline init_sample_permute(uint8x8_t *samples, const uint8x16x3_t tbl,
+                                uint8x16_t *d)
+{
+    // Permute input samples for dot product.
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+    d[0] = vqtbl1q_u8(vcombine_u8(samples[0], vdup_n_u8(0)), tbl.val[0]);
+    d[1] = vqtbl1q_u8(vcombine_u8(samples[1], vdup_n_u8(0)), tbl.val[0]);
+    d[2] = vqtbl1q_u8(vcombine_u8(samples[2], vdup_n_u8(0)), tbl.val[0]);
+    d[3] = vqtbl1q_u8(vcombine_u8(samples[3], vdup_n_u8(0)), tbl.val[0]);
+}
+
+uint8x8_t inline filter8_8_pp_reuse(uint8x16_t samples, const int8x8_t filter,
+                                    const uint8x16x3_t tbl, uint8x16_t &perm_s0)
+{
+    // Permute input samples for dot product.
+    // { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }
+    // Already in perm_s0.
+    // { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
+    // { 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }
+    uint8x16_t perm_s2 = vqtbl1q_u8(samples, tbl.val[2]);
+
+    int32x4_t dotprod_lo = vusdotq_lane_s32(vdupq_n_s32(0), perm_s0, filter, 0);
+    dotprod_lo = vusdotq_lane_s32(dotprod_lo, perm_s1, filter, 1);
+    int32x4_t dotprod_hi = vusdotq_lane_s32(vdupq_n_s32(0), perm_s1, filter, 0);
+    dotprod_hi = vusdotq_lane_s32(dotprod_hi, perm_s2, filter, 1);
+
+    // Save for re-use in next iteration.
+    perm_s0 = perm_s2;
+
+    // Narrow and combine.
+    int16x8_t dotprod = vcombine_s16(vmovn_s32(dotprod_lo),
+                                     vmovn_s32(dotprod_hi));
+    return vqrshrun_n_s16(dotprod, IF_FILTER_PREC);
+}
+
+uint8x8_t inline filter8_8_pp_matmul(uint8x16_t samples, const int8x16_t filter,
+                                     const uint8x16x2_t tbl)
+{
+    // Permute input samples for 8x2 by 2x8 matrix multiply.
+    uint8x16_t perm_s0 = vqtbl1q_u8(samples, tbl.val[0]);
+    uint8x16_t perm_s1 = vqtbl1q_u8(samples, tbl.val[1]);
+
+    int32x4_t matmul_lo = vusmmlaq_s32(vdupq_n_s32(0), perm_s0, filter);
+    int32x4_t matmul_hi = vusmmlaq_s32(vdupq_n_s32(0), perm_s1, filter);
+
+    // Narrow and combine.
+    int16x8_t matmul = vcombine_s16(vmovn_s32(matmul_lo), vmovn_s32(matmul_hi));
+    return vqrshrun_n_s16(matmul, IF_FILTER_PREC);
+}
+} // Unnamed namespace.
+
+namespace X265_NS {
+template<int width, int height>
+void inline interp8_horiz_pp_dotprod(const uint8_t *src, intptr_t srcStride,
+                                     uint8_t *dst, intptr_t dstStride,
+                                     int coeffIdx)
+{
+    const int N_TAPS = 8;
+    src -= N_TAPS / 2 - 1;
+
+    const uint8x16x3_t tbl = vld1q_u8_x3(dotprod_permute_tbl);
+    const int8x8_t filter = vmovn_s16(vld1q_s16(g_lumaFilter[coeffIdx]));
+
+    for (int row = 0; row < height; row += 4)
+    {
+        int col = 0;
+        if (width >= 32)
+        {
+            // Peel first sample permute to enable passing between iterations.
+            uint8x8_t s0[4];
+            load_u8x8xn<4>(src, srcStride, s0);
+            uint8x16_t ps0[4];
+            init_sample_permute(s0, tbl, ps0);
+
+            for (; (col + 16) <= width; col += 16)
+            {
+                uint8x16_t s_lo[4], s_hi[4];
+                load_u8x16xn<4>(src + col + 0, srcStride, s_lo);
+                load_u8x16xn<4>(src + col + 8, srcStride, s_hi);
+
+                uint8x8_t d_lo[4];
+                d_lo[0] = filter8_8_pp_reuse(s_lo[0], filter, tbl, ps0[0]);
+                d_lo[1] = filter8_8_pp_reuse(s_lo[1], filter, tbl, ps0[1]);
+                d_lo[2] = filter8_8_pp_reuse(s_lo[2], filter, tbl, ps0[2]);
+                d_lo[3] = filter8_8_pp_reuse(s_lo[3], filter, tbl, ps0[3]);
+
+                uint8x8_t d_hi[4];
+                d_hi[0] = filter8_8_pp_reuse(s_hi[0], filter, tbl, ps0[0]);
+                d_hi[1] = filter8_8_pp_reuse(s_hi[1], filter, tbl, ps0[1]);
+                d_hi[2] = filter8_8_pp_reuse(s_hi[2], filter, tbl, ps0[2]);
+                d_hi[3] = filter8_8_pp_reuse(s_hi[3], filter, tbl, ps0[3]);
+
+                store_u8x8xn<4>(dst + col + 0, dstStride, d_lo);
+                store_u8x8xn<4>(dst + col + 8, dstStride, d_hi);
+            }
+        }
+        else
+        {
+            for (; col + 8 <= width; col += 8)
+            {
+                uint8x16_t s[4];
+                load_u8x16xn<4>(src + col, srcStride, s);
+
+                uint8x8_t d[4];
+                d[0] = filter8_8_pp(s[0], filter, tbl);
+                d[1] = filter8_8_pp(s[1], filter, tbl);
+                d[2] = filter8_8_pp(s[2], filter, tbl);
+                d[3] = filter8_8_pp(s[3], filter, tbl);
+
+                store_u8x8xn<4>(dst + col, dstStride, d);
+            }
+        }
+        for (; col < width; col += 4)
+        {
+            uint8x16_t s[4];
+            load_u8x16xn<4>(src + col, srcStride, s);
+
+            uint8x8_t d[4];
+            d[0] = filter8_8_pp(s[0], filter, tbl);
+            d[1] = filter8_8_pp(s[1], filter, tbl);
+            d[2] = filter8_8_pp(s[2], filter, tbl);
+            d[3] = filter8_8_pp(s[3], filter, tbl);
+
+            store_u8x4xn<4>(dst + col, dstStride, d);
+        }
+
+        src += 4 * srcStride;
+        dst += 4 * dstStride;
+    }
+}
+
+template<int coeffIdx, int width, int height>
+void inline interp8_horiz_pp_matmul(const uint8_t *src, intptr_t srcStride,
+                                    uint8_t *dst, intptr_t dstStride)
+{
+    const int N_TAPS = 8;
+    src -= N_TAPS / 2 - 1;
+
+    // coeffIdx is 1 or 3 for g_lumaFilter index.
+    // Select filter and permute table from the first or second array indices.
+    const int index = coeffIdx >> 1;
+    const uint8x16x2_t tbl = vld1q_u8_x2(matmul_permute_tbl[index]);
+    const int8x16_t filter = vld1q_s8(matmul_luma_filter[index]);
+
+    for (int row = 0; row < height; row += 4)
+    {
+        int col = 0;
+        if (width >= 32)
+        {
+            for (; (col + 16) <= width; col += 16)
+            {
+                uint8x16_t s_lo[4], s_hi[4];
+                load_u8x16xn<4>(src + col + 0, srcStride, s_lo);
+                load_u8x16xn<4>(src + col + 8, srcStride, s_hi);
+
+                uint8x8_t d_lo[4];
+                d_lo[0] = filter8_8_pp_matmul(s_lo[0], filter, tbl);
+                d_lo[1] = filter8_8_pp_matmul(s_lo[1], filter, tbl);
+                d_lo[2] = filter8_8_pp_matmul(s_lo[2], filter, tbl);
+                d_lo[3] = filter8_8_pp_matmul(s_lo[3], filter, tbl);
+
+                uint8x8_t d_hi[4];
+                d_hi[0] = filter8_8_pp_matmul(s_hi[0], filter, tbl);
+                d_hi[1] = filter8_8_pp_matmul(s_hi[1], filter, tbl);
+                d_hi[2] = filter8_8_pp_matmul(s_hi[2], filter, tbl);
+                d_hi[3] = filter8_8_pp_matmul(s_hi[3], filter, tbl);
+
+                store_u8x8xn<4>(dst + col + 0, dstStride, d_lo);
+                store_u8x8xn<4>(dst + col + 8, dstStride, d_hi);
+            }
+        }
+        else
+        {
+            for (; col + 8 <= width; col += 8)
+            {
+                uint8x16_t s[4];
+                load_u8x16xn<4>(src + col, srcStride, s);
+
+                uint8x8_t d[4];
+                d[0] = filter8_8_pp_matmul(s[0], filter, tbl);
+                d[1] = filter8_8_pp_matmul(s[1], filter, tbl);
+                d[2] = filter8_8_pp_matmul(s[2], filter, tbl);
+                d[3] = filter8_8_pp_matmul(s[3], filter, tbl);
+
+                store_u8x8xn<4>(dst + col, dstStride, d);
+            }
+        }
+        for (; col < width; col += 4)
+        {
+            uint8x16_t s[4];
+            load_u8x16xn<4>(src + col, srcStride, s);
+
+            uint8x8_t d[4];
+            d[0] = filter8_8_pp_matmul(s[0], filter, tbl);
+            d[1] = filter8_8_pp_matmul(s[1], filter, tbl);
+            d[2] = filter8_8_pp_matmul(s[2], filter, tbl);
+            d[3] = filter8_8_pp_matmul(s[3], filter, tbl);
+
+            store_u8x4xn<4>(dst + col, dstStride, d);
+        }
+
+        src += 4 * srcStride;
+        dst += 4 * dstStride;
+    }
+}
+
+template<int width, int height>
+void interp8_horiz_pp_i8mm(const uint8_t *src, intptr_t srcStride, uint8_t *dst,
+                           intptr_t dstStride, int coeffIdx)
+{
+    switch (coeffIdx)
+    {
+    case 1:
+        return interp8_horiz_pp_matmul<1, width, height>(src, srcStride, dst,
+                                                         dstStride);
+    case 2:
+        return interp8_horiz_pp_dotprod<width, height>(src, srcStride, dst,
+                                                       dstStride, coeffIdx);
+    case 3:
+        return interp8_horiz_pp_matmul<3, width, height>(src, srcStride, dst,
+                                                         dstStride);
+    }
+}
+
+#define LUMA_I8MM(W, H) \
+    p.pu[LUMA_ ## W ## x ## H].luma_hpp = interp8_horiz_pp_i8mm<W, H>;
+
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &p)
+{
+    LUMA_I8MM(4, 4);
+    LUMA_I8MM(4, 8);
+    LUMA_I8MM(4, 16);
+    LUMA_I8MM(12, 16);
+    LUMA_I8MM(8, 4);
+    LUMA_I8MM(8, 8);
+    LUMA_I8MM(8, 16);
+    LUMA_I8MM(8, 32);
+    LUMA_I8MM(16, 4);
+    LUMA_I8MM(16, 8);
+    LUMA_I8MM(16, 12);
+    LUMA_I8MM(16, 16);
+    LUMA_I8MM(16, 32);
+    LUMA_I8MM(16, 64);
+    LUMA_I8MM(24, 32);
+    LUMA_I8MM(32, 8);
+    LUMA_I8MM(32, 16);
+    LUMA_I8MM(32, 24);
+    LUMA_I8MM(32, 32);
+    LUMA_I8MM(32, 64);
+    LUMA_I8MM(48, 64);
+    LUMA_I8MM(64, 16);
+    LUMA_I8MM(64, 32);
+    LUMA_I8MM(64, 48);
+    LUMA_I8MM(64, 64);
+}
+}
+
+#else // if !HIGH_BIT_DEPTH
+namespace X265_NS {
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &)
+{
+}
+}
+#endif // !HIGH_BIT_DEPTH
+
+#endif // defined(HAVE_NEON_I8MM)
diff --git a/source/common/aarch64/filter-neon-i8mm.h b/source/common/aarch64/filter-neon-i8mm.h
new file mode 100644
index 000000000..aa9cd8225
--- /dev/null
+++ b/source/common/aarch64/filter-neon-i8mm.h
@@ -0,0 +1,37 @@
+/*****************************************************************************
+ * Copyright (C) 2024 MulticoreWare, Inc
+ *
+ * Authors: Hari Limaye <hari.lim...@arm.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at license @ x265.com.
+ *****************************************************************************/
+
+#ifndef X265_FILTER_NEON_I8MM_H
+#define X265_FILTER_NEON_I8MM_H
+
+#if defined(HAVE_NEON_I8MM)
+
+#include "primitives.h"
+
+namespace X265_NS {
+void setupFilterPrimitives_neon_i8mm(EncoderPrimitives &p);
+}
+
+#endif // defined(HAVE_NEON_I8MM)
+
+#endif // X265_FILTER_NEON_I8MM_H
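One more reviewer note, not part of the patch: the filter 2 case keeps the
existing DotProd scheme, where each vusdotq_lane_s32 lane accumulates a 4-tap
partial sum and every 8-tap output combines one lane from two of the three
permuted sample vectors. A rough scalar equivalent (dot4 and dotprod_model are
illustrative names only, not symbols from the patch):

    #include <cstdint>

    // One lane of vusdotq_lane_s32: a 4-tap partial sum.
    static int32_t dot4(const uint8_t *s, const int8_t *f)
    {
        return s[0] * f[0] + s[1] * f[1] + s[2] * f[2] + s[3] * f[3];
    }

    // The eight 8-tap outputs produced by one filter8_8_pp call, each built
    // from a low-half (filter lane 0) and a high-half (filter lane 1) sum.
    static void dotprod_model(const uint8_t *src, const int8_t filter[8],
                              int32_t out[8])
    {
        for (int x = 0; x < 8; x++)
            out[x] = dot4(src + x, filter) + dot4(src + x + 4, filter + 4);
    }

The re-use in filter8_8_pp_reuse then falls out of the permute tables: bytes
8..14 of one 8-pixel column are exactly bytes 0..6 of the next, so the
tbl.val[2] permute of the current call is handed forward as the next call's
tbl.val[0] permute, and init_sample_permute only has to prime it once per
group of four rows.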
--
2.42.1