Thanks for the patch, it looks good to me.
At 2025-06-20 17:04:24, "George Steed" <george.st...@arm.com> wrote:
>Use the BEXT instruction to pack bits based on a bitmask, this avoids
>the need for the loop in the prior Arm implementations.
>
>There was an existing function declaration of scanPosLast_sve2, however
>this function was never defined and unused so simply replace it with the
>new scanPosLast_sve2_bitperm declaration.
>
>Benchmarking on a Neoverse V2 machine supporting the SVE2_BitPerm
>extension, this implementation improves --preset=medium encoding speed
>by ~1.3%.
>
>Also take this opportunity to reorder the ARM_ASMS extension list in
>CMakeLists.txt to be in architecture order to match elsewhere.
>---
> source/CMakeLists.txt                        |  12 ++
> source/common/CMakeLists.txt                 |   8 +-
> source/common/aarch64/asm-primitives.cpp     |  13 ++
> source/common/aarch64/fun-decls.h            |   3 +-
> .../common/aarch64/pixel-util-sve2-bitperm.S | 125 ++++++++++++++++++
> 5 files changed, 157 insertions(+), 4 deletions(-)
> create mode 100644 source/common/aarch64/pixel-util-sve2-bitperm.S
>
>diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
>index c4253b723..4160514b9 100755
>--- a/source/CMakeLists.txt
>+++ b/source/CMakeLists.txt
>@@ -835,6 +835,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY)
>                     DEPENDS ${ASM_SRC})
>             endforeach()
>         endif()
>+        if(CPU_HAS_SVE2_BITPERM)
>+            foreach(ASM ${ARM_ASMS_SVE2_BITPERM})
>+                set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM})
>+                list(APPEND ASM_SRCS ${ASM_SRC})
>+                list(APPEND ASM_OBJS ${ASM}.${SUFFIX})
>+                add_custom_command(
>+                    OUTPUT ${ASM}.${SUFFIX}
>+                    COMMAND ${CMAKE_CXX_COMPILER}
>+                    ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_BITPERM_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX}
>+                    DEPENDS ${ASM_SRC})
>+            endforeach()
>+        endif()
>     elseif(X86)
>         # compile X86 arch asm files here
>         foreach(ASM ${MSVC_ASMS})
>diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt
>index 405ec0b2d..37f3e462c 100644
>--- a/source/common/CMakeLists.txt
>+++ b/source/common/CMakeLists.txt
>@@ -116,12 +116,14 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64))
>     set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S)
>     set(A_SRCS_SVE asm-sve.S p2s-sve.S pixel-util-sve.S ssd-a-sve.S)
>     set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S)
>+    set(A_SRCS_SVE2_BITPERM pixel-util-sve2-bitperm.S)
>     set(VEC_PRIMITIVES)
>
>-    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources")
>-    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "ARM Assembly Sources that use SVE instruction set")
>-    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "ARM Assembly Sources that use SVE2 instruction set")
>+    set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "Arm Assembly Sources")
>     set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension")
>+    set(ARM_ASMS_SVE "${A_SRCS_SVE}" CACHE INTERNAL "Arm Assembly Sources that use the SVE extension")
>+    set(ARM_ASMS_SVE2 "${A_SRCS_SVE2}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 extension")
>+    set(ARM_ASMS_SVE2_BITPERM "${A_SRCS_SVE2_BITPERM}" CACHE INTERNAL "Arm Assembly Sources that use the SVE2 BitPerm extension")
>     foreach(SRC ${C_SRCS_NEON})
>         set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC})
>         set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG})
>diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
>index 5ce9352bd..b3c89b370 100644
>--- a/source/common/aarch64/asm-primitives.cpp
>+++ b/source/common/aarch64/asm-primitives.cpp
>@@ -721,6 +721,13 @@ void setupSve2Primitives(EncoderPrimitives &p)
> }
> #endif // defined(HAVE_SVE2)
>
>+#if defined(HAVE_SVE2_BITPERM)
>+void setupSve2BitPermPrimitives(EncoderPrimitives &p)
>+{
>+    p.scanPosLast = PFX(scanPosLast_sve2_bitperm);
>+}
>+#endif // defined(HAVE_SVE2_BITPERM)
>+
> #ifdef HAVE_NEON_DOTPROD
> #if !HIGH_BIT_DEPTH
> void setupNeonDotProdPrimitives(EncoderPrimitives &p)
>@@ -771,6 +778,12 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
>         setupSve2Primitives(p);
>     }
> #endif
>+#ifdef HAVE_SVE2_BITPERM
>+    if (cpuMask & X265_CPU_SVE2_BITPERM)
>+    {
>+        setupSve2BitPermPrimitives(p);
>+    }
>+#endif
> }
>
> void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
>diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
>index 12383b573..56a434d34 100644
>--- a/source/common/aarch64/fun-decls.h
>+++ b/source/common/aarch64/fun-decls.h
>@@ -255,4 +255,5 @@ void PFX(ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel*
>
> int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
> void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
>-int PFX(scanPosLast_sve2)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
>+
>+int PFX(scanPosLast_sve2_bitperm)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
>diff --git a/source/common/aarch64/pixel-util-sve2-bitperm.S b/source/common/aarch64/pixel-util-sve2-bitperm.S
>new file mode 100644
>index 000000000..5d7828317
>--- /dev/null
>+++ b/source/common/aarch64/pixel-util-sve2-bitperm.S
>@@ -0,0 +1,125 @@
>+/*****************************************************************************
>+ * Copyright (C) 2025 MulticoreWare, Inc
>+ *
>+ * Authors: George Steed <george.st...@arm.com>
>+ *
>+ * This program is free software; you can redistribute it and/or modify
>+ * it under the terms of the GNU General Public License as published by
>+ * the Free Software Foundation; either version 2 of the License, or
>+ * (at your option) any later version.
>+ *
>+ * This program is distributed in the hope that it will be useful,
>+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
>+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
>+ * GNU General Public License for more details.
>+ *
>+ * You should have received a copy of the GNU General Public License
>+ * along with this program; if not, write to the Free Software
>+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
>+ *
>+ * This program is also available under a commercial proprietary license.
>+ * For more information, contact us at license @ x265.com.
>+ *****************************************************************************/
>+
>+#include "asm-sve.S"
>+#include "pixel-util-common.S"
>+
>+.arch armv8-a+sve2+sve2-bitperm
>+
>+#ifdef __APPLE__
>+.section __RODATA,__rodata
>+#else
>+.section .rodata
>+#endif
>+
>+.align 4
>+
>+.text
>+
>+// int scanPosLast(
>+//     const uint16_t *scan,      // x0
>+//     const coeff_t *coeff,      // x1
>+//     uint16_t *coeffSign,       // x2
>+//     uint16_t *coeffFlag,       // x3
>+//     uint8_t *coeffNum,         // x4
>+//     int numSig,                // x5
>+//     const uint16_t* scanCG4x4, // x6
>+//     const int trSize)          // x7
>+function PFX(scanPosLast_sve2_bitperm)
>+    // Convert unit of trSize stride from elements (int16) to bytes.
>+    add             x7, x7, x7
>+
>+    // Load scan table and convert to bytes.
>+    ldp             q0, q1, [x6]
>+    uzp1            v0.16b, v0.16b, v1.16b  // v0 - Zigzag scan table.
>+
>+    movrel          x10, g_SPL_and_mask
>+    ldr             q28, [x10]              // v28 = mask for pmovmskb.
>+    add             x10, x7, x7             // 2*x7
>+    add             x11, x7, x7, lsl #1     // 3*x7
>+    add             x9, x4, #1              // CG count
>+
>+1:
>+    // Position of current CG.
>+    ldrh            w6, [x0], #32
>+    add             x6, x1, x6, lsl #1
>+
>+    // Loading current CG and saturate to bytes.
>+    ldr             d2, [x6]
>+    ldr             d3, [x6, x7]
>+    ldr             d4, [x6, x10]
>+    ldr             d5, [x6, x11]
>+    mov             v2.d[1], v3.d[0]
>+    mov             v4.d[1], v5.d[0]
>+    sqxtn           v2.8b, v2.8h
>+    sqxtn2          v2.16b, v4.8h
>+
>+    // Apply zigzag.
>+    tbl             v3.16b, {v2.16b}, v0.16b
>+
>+    // Get zero/sign.
>+    cmeq            v5.16b, v3.16b, #0  // v5 = zero
>+    cmlt            v3.16b, v3.16b, #0  // v3 = negative
>+
>+    // val:  v3.h[0] = pmovmskb(v3).
>+    // mask: v3.h[1] = pmovmskb(v4).
>+    and             v3.16b, v3.16b, v28.16b
>+    bic             v4.16b, v28.16b, v5.16b
>+    addp            v3.16b, v3.16b, v4.16b
>+    addp            v3.16b, v3.16b, v3.16b
>+    addp            v3.16b, v3.16b, v3.16b
>+    fmov            w15, s3
>+
>+    // coeffNum = addv(v3 != 0) = 16 - addv(v5).
>+    addv            b5, v5.16b
>+    smov            w6, v5.b[0]
>+    add             w6, w6, #16
>+    sub             x5, x5, x6
>+    strb            w6, [x4], #1
>+
>+    // coeffFlag = reverse_bit(w15) in 16-bit.
>+    rbit            w12, w15
>+    strh            w12, [x3], #2
>+
>+    // Pack bits from z3.h[0] into z30.h[0], based on z3.h[1] mask.
>+    mov             h31, v3.h[1]
>+    bext            z30.h, z3.h, z31.h
>+    str             h30, [x2], #2
>+
>+    cbnz            x5, 1b
>+
>+    // Count trailing zeros in (reversed) coeffFlag.
>+    clz             w13, w15
>+    lsr             w12, w12, w13
>+    strh            w12, [x3, #-2]
>+
>+    // Get last pos.
>+    sub             x9, x4, x9
>+    eor             w13, w13, #15
>+    add             x0, x13, x9, lsl #4
>+    ret
>+endfunc
>+
>+const g_SPL_and_mask, align=8
>+.byte 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80, 0x1, 0x2, 0x4, 0x8, 0x10, 0x20, 0x40, 0x80
>+endconst
>--
>2.43.0
>
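For anyone not familiar with the SVE2 BEXT instruction the patch relies on: per lane, BEXT gathers the bits of the first source that are selected by the mask in the second source and packs them contiguously into the low-order bits of the result. Below is a minimal scalar C model of that single-lane behaviour; it is illustrative only, not code from the patch, and the example bit patterns are made up.

#include <stdint.h>
#include <stdio.h>

// Scalar model of one 16-bit lane of the SVE2 BEXT instruction:
// collect the bits of 'val' at positions where 'mask' is set and
// pack them contiguously into the low-order bits of the result.
static uint16_t bext16_model(uint16_t val, uint16_t mask)
{
    uint16_t out = 0;
    unsigned pos = 0;

    for (unsigned bit = 0; bit < 16; bit++)
    {
        if (mask & (1u << bit))
        {
            if (val & (1u << bit))
                out |= (uint16_t)(1u << pos);
            pos++;
        }
    }

    return out;
}

int main(void)
{
    // Hypothetical values for one 4x4 coefficient group: sign bits of the
    // coefficients and a mask of the non-zero coefficients. BEXT packs the
    // sign bits of just the non-zero coefficients, which the earlier Arm
    // implementations computed with a per-coefficient loop.
    uint16_t signBits    = 0x00a2;
    uint16_t nonZeroBits = 0x00f3;

    printf("packed signs: 0x%04x\n", bext16_model(signBits, nonZeroBits));
    return 0;
}

In the new scanPosLast kernel, the sign and non-zero masks for each coefficient group come straight out of the NEON compares (cmlt/cmeq) and the pmovmskb-style reduction, so a single BEXT then produces the packed coeffSign value per group.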
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel