Add run-time CPU feature detection for AArch64 ISA extensions on Linux, enabled by the CMake option `AARCH64_RUNTIME_CPU_DETECT`. This option is enabled by default; on platforms with no detection method implemented, it is switched off automatically and the build falls back to compile-time CPU feature detection.
Also add logic to testbench to handle the case where the --cpuid parameter conflicts with the feature detection, to fail gracefully rather than SIGILL. --- source/CMakeLists.txt | 55 ++++++++++------ source/common/CMakeLists.txt | 18 ++++-- source/common/aarch64/cpu.h | 120 +++++++++++++++++++++++++++++++++++ source/common/cpu.cpp | 17 +---- source/common/param.cpp | 2 + source/test/testbench.cpp | 6 ++ 6 files changed, 177 insertions(+), 41 deletions(-) create mode 100644 source/common/aarch64/cpu.h diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 908980675..f177c4522 100755 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -87,12 +87,28 @@ elseif(ARM64MATCH GREATER "-1") option(AARCH64_WARNINGS_AS_ERRORS "Build with -Werror for AArch64 Intrinsics files" OFF) + option(AARCH64_RUNTIME_CPU_DETECT "Enable AArch64 run-time CPU feature detection" ON) + if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux") + set(AARCH64_RUNTIME_CPU_DETECT OFF CACHE BOOL "" FORCE) + message(STATUS "Run-time CPU feature detection unsupported on this platform") + endif() + # Options for manually enabling/disabling AArch64 SIMD extensions. option(ENABLE_NEON "Enable Neon" ON) option(ENABLE_NEON_DOTPROD "Enable Neon DotProd" ON) option(ENABLE_NEON_I8MM "Enable Neon I8MM" ON) option(ENABLE_SVE "Enable SVE" ON) option(ENABLE_SVE2 "Enable SVE2" ON) + + # Compiler flags for AArch64 extensions. + set(AARCH64_NEON_FLAG "-march=armv8-a") + # Neon DotProd is mandatory from Armv8.4. + set(AARCH64_NEON_DOTPROD_FLAG "-march=armv8.2-a+dotprod") + # Neon I8MM is mandatory from Armv8.6. + set(AARCH64_NEON_I8MM_FLAG "-march=armv8.2-a+dotprod+i8mm") + set(AARCH64_SVE_FLAG "-march=armv8.2-a+dotprod+i8mm+sve") + # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod and +sve. 
+ set(AARCH64_SVE2_FLAG "-march=armv9-a+i8mm+sve2") else() message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown") message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}") @@ -268,7 +284,13 @@ if(GCC) set(CPU_HAS_NEON 1) add_definitions(-DX265_ARCH_ARM64=1) - if(CROSS_COMPILE_ARM64) + if (AARCH64_RUNTIME_CPU_DETECT) + add_definitions(-DAARCH64_RUNTIME_CPU_DETECT=1) + message(STATUS "Configuring build for run-time CPU feature detection") + endif() + + if(AARCH64_RUNTIME_CPU_DETECT OR CROSS_COMPILE_ARM64) + # Add all extensions when compiling for run-time CPU feature detection or cross compiling. set(CPU_HAS_NEON_DOTPROD 1) set(CPU_HAS_NEON_I8MM 1) set(CPU_HAS_SVE 1) @@ -280,7 +302,7 @@ if(GCC) find_package(SVE) find_package(SVE2) else() - message(STATUS "Compile time feature detection unsupported on this platform") + message(STATUS "Compile-time CPU feature detection unsupported on this platform") endif() endif() @@ -312,33 +334,25 @@ if(GCC) if(CPU_HAS_NEON) message(STATUS "Found Neon") - set(ARM_ARGS -O3 -march=armv8-a) add_definitions(-DHAVE_NEON=1) endif() if(CPU_HAS_NEON_DOTPROD) - # Neon DotProd is mandatory from Armv8.4. message(STATUS "Found Neon DotProd") - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod) add_definitions(-DHAVE_NEON_DOTPROD=1) endif() if(CPU_HAS_NEON_I8MM) - # Neon I8MM is mandatory from Armv8.6. 
message(STATUS "Found Neon I8MM") - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm) add_definitions(-DHAVE_NEON_I8MM=1) endif() if(CPU_HAS_SVE) message(STATUS "Found SVE") - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve) add_definitions(-DHAVE_SVE=1) endif() if(CPU_HAS_SVE2) message(STATUS "Found SVE2") - # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2) add_definitions(-DHAVE_SVE2=1) endif() - set(ARM_ARGS ${ARM_ARGS} -fPIC) + set(ARM_ARGS -O3 -fPIC) # Do not allow implicit vector type conversions in Clang builds (this # is already the default in GCC builds). check_cxx_compiler_flag(-flax-vector-conversions=none CC_HAS_FLAX_VEC_CONV_NONE) @@ -356,7 +370,8 @@ int main() { return 0; }") set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) # CMAKE_REQUIRED_FLAGS requires a space-delimited string, whereas # ARM_ARGS is defined and used elsewhere as a ;-list. - foreach(ARM_ARG ${ARM_ARGS}) + # Add `-march=...+sve` so the test functions correctly with Clang. 
+ foreach(ARM_ARG ${ARM_ARGS} ${AARCH64_SVE_FLAG}) string(APPEND CMAKE_REQUIRED_FLAGS " ${ARM_ARG}") endforeach() check_c_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED) @@ -705,18 +720,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_NEON_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() - if(CPU_HAS_SVE2) - foreach(ASM ${ARM_ASMS_SVE2}) + if(CPU_HAS_NEON_DOTPROD) + foreach(ASM ${ARM_ASMS_NEON_DOTPROD}) set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM}) list(APPEND ASM_SRCS ${ASM_SRC}) list(APPEND ASM_OBJS ${ASM}.${SUFFIX}) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_NEON_DOTPROD_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() @@ -728,19 +743,19 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() - if(CPU_HAS_NEON_DOTPROD) - foreach(ASM ${ARM_ASMS_NEON_DOTPROD}) + if(CPU_HAS_SVE2) + foreach(ASM ${ARM_ASMS_SVE2}) set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM}) list(APPEND ASM_SRCS ${ASM_SRC}) list(APPEND ASM_OBJS ${ASM}.${SUFFIX}) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 
dc4a74107..aacc0ef62 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -103,6 +103,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) add_definitions(-DAUTO_VECTORIZE=1) endif() + # Add Arm intrinsics files here. set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h) set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp) set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp) @@ -110,11 +111,11 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(C_SRCS_SVE2 sao-prim-sve2.cpp) enable_language(ASM) - # add ARM assembly/intrinsic files here + # Add Arm assembly files here. set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S) + set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S) set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S) set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S) - set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S) set(VEC_PRIMITIVES) set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") @@ -123,29 +124,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension") foreach(SRC ${C_SRCS_NEON}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG}) endforeach() - if(CPU_HAS_NEON_I8MM) - foreach(SRC ${C_SRCS_NEON_I8MM}) + if(CPU_HAS_NEON_DOTPROD) + foreach(SRC ${C_SRCS_NEON_DOTPROD}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_DOTPROD_FLAG}) endforeach() endif() - 
if(CPU_HAS_NEON_DOTPROD) - foreach(SRC ${C_SRCS_NEON_DOTPROD}) + if(CPU_HAS_NEON_I8MM) + foreach(SRC ${C_SRCS_NEON_I8MM}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_I8MM_FLAG}) endforeach() endif() if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE) foreach(SRC ${C_SRCS_SVE}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_SVE_FLAG}) endforeach() endif() if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE) foreach(SRC ${C_SRCS_SVE2}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_SVE2_FLAG}) endforeach() endif() diff --git a/source/common/aarch64/cpu.h b/source/common/aarch64/cpu.h new file mode 100644 index 000000000..88ce2e310 --- /dev/null +++ b/source/common/aarch64/cpu.h @@ -0,0 +1,120 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.lim...@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_COMMON_AARCH64_CPU_H +#define X265_COMMON_AARCH64_CPU_H + +#include "x265.h" + +#if AARCH64_RUNTIME_CPU_DETECT + +#if defined(__linux__) + +#include <sys/auxv.h> + +#define X265_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define X265_AARCH64_HWCAP_SVE (1 << 22) +#define X265_AARCH64_HWCAP2_SVE2 (1 << 1) +#define X265_AARCH64_HWCAP2_I8MM (1 << 13) + +static inline int aarch64_get_cpu_flags() +{ + int flags = 0; + +#if HAVE_NEON_DOTPROD || HAVE_SVE + unsigned long hwcap = getauxval(AT_HWCAP); +#endif +#if HAVE_NEON_I8MM || HAVE_SVE2 + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#endif + +#if HAVE_NEON + flags |= X265_CPU_NEON; +#endif +#if HAVE_NEON_DOTPROD + if (hwcap & X265_AARCH64_HWCAP_ASIMDDP) flags |= X265_CPU_NEON_DOTPROD; +#endif +#if HAVE_NEON_I8MM + if (hwcap2 & X265_AARCH64_HWCAP2_I8MM) flags |= X265_CPU_NEON_I8MM; +#endif +#if HAVE_SVE + if (hwcap & X265_AARCH64_HWCAP_SVE) flags |= X265_CPU_SVE; +#endif +#if HAVE_SVE2 + if (hwcap2 & X265_AARCH64_HWCAP2_SVE2) flags |= X265_CPU_SVE2; +#endif + + return flags; +} + +#else // defined(__linux__) +#error \ + "Run-time CPU feature detection selected, but no detection method" \ + "available for your platform. Rerun cmake configure with" \ + "-DAARCH64_RUNTIME_CPU_DETECT=OFF." +#endif // defined(__linux__) + +static inline int aarch64_cpu_detect() +{ + int flags = aarch64_get_cpu_flags(); + + // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available. + if (!(flags & X265_CPU_NEON_DOTPROD)) flags &= ~X265_CPU_NEON_I8MM; + + // Restrict flags: SVE assumes that FEAT_{DotProd,I8MM} are available. + if (!(flags & X265_CPU_NEON_DOTPROD)) flags &= ~X265_CPU_SVE; + if (!(flags & X265_CPU_NEON_I8MM)) flags &= ~X265_CPU_SVE; + + // Restrict flags: SVE2 assumes that FEAT_SVE is available. 
+ if (!(flags & X265_CPU_SVE)) flags &= ~X265_CPU_SVE2; + + return flags; +} + +#else // if AARCH64_RUNTIME_CPU_DETECT + +static inline int aarch64_cpu_detect() +{ + int flags = 0; + +#if HAVE_NEON + flags |= X265_CPU_NEON; +#endif +#if HAVE_NEON_DOTPROD + flags |= X265_CPU_NEON_DOTPROD; +#endif +#if HAVE_NEON_I8MM + flags |= X265_CPU_NEON_I8MM; +#endif +#if HAVE_SVE + flags |= X265_CPU_SVE; +#endif +#if HAVE_SVE2 + flags |= X265_CPU_SVE2; +#endif + return flags; +} + +#endif // if AARCH64_RUNTIME_CPU_DETECT + +#endif // ifndef X265_COMMON_AARCH64_CPU_H diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp index 485aa681f..ae0907890 100644 --- a/source/common/cpu.cpp +++ b/source/common/cpu.cpp @@ -390,6 +390,7 @@ uint32_t cpu_detect(bool benableavx512) } #elif X265_ARCH_ARM64 +#include "aarch64/cpu.h" uint32_t cpu_detect(bool benableavx512) { @@ -397,21 +398,7 @@ uint32_t cpu_detect(bool benableavx512) int flags = 0; #ifdef ENABLE_ASSEMBLY - #if HAVE_NEON - flags |= X265_CPU_NEON; - #endif - #if HAVE_NEON_DOTPROD - flags |= X265_CPU_NEON_DOTPROD; - #endif - #if HAVE_NEON_I8MM - flags |= X265_CPU_NEON_I8MM; - #endif - #if HAVE_SVE - flags |= X265_CPU_SVE; - #endif - #if HAVE_SVE2 - flags |= X265_CPU_SVE2; - #endif + flags = aarch64_cpu_detect(); #endif return flags; diff --git a/source/common/param.cpp b/source/common/param.cpp index da039d914..1beb3c056 100755 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -1587,8 +1587,10 @@ int parseCpuName(const char* value, bool& bError, bool bEnableavx512) } free(buf); +#if X265_ARCH_X86 if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE2_IS_SLOW)) cpu |= X265_CPU_SSE2_IS_FAST; +#endif } return cpu; diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp index ac93e37b3..b8ef760f2 100644 --- a/source/test/testbench.cpp +++ b/source/test/testbench.cpp @@ -120,6 +120,7 @@ int main(int argc, char *argv[]) } else if (!strncmp(name, "cpuid", strlen(name))) { + int cpu_detect_cpuid = cpuid; bool 
bError = false; cpuid = parseCpuName(value, bError, enableavx512); if (bError) @@ -127,6 +128,11 @@ int main(int argc, char *argv[]) printf("Invalid CPU name: %s\n", value); return 1; } + else if ((cpuid & cpu_detect_cpuid) != cpuid) + { + printf("Feature detection conflicts with provided --cpuid: %s\n", value); + return 1; + } i += 2; } else if (!strncmp(name, "testbench", strlen(name))) -- 2.42.1
>From f06b9a6e99231e4f295ba8c31452ee2b1240223f Mon Sep 17 00:00:00 2001 Message-ID: <f06b9a6e99231e4f295ba8c31452ee2b1240223f.1730906531.git.hari.lim...@arm.com> In-Reply-To: <cover.1729809914.git.hari.lim...@arm.com> From: Hari Limaye <hari.lim...@arm.com> Date: Thu, 10 Oct 2024 10:57:12 +0100 Subject: [PATCH v2 2/6] AArch64: Add run-time CPU feature detection Add run-time CPU feature detection for AArch64 ISA extensions on Linux, enabled by the CMake option `AARCH64_RUNTIME_CPU_DETECT`. This option is enabled by default - for platforms with no detection method implemented we will fall back to compile-time CPU feature detection. Also add logic to testbench to handle the case where the --cpuid parameter conflicts with the feature detection, to fail gracefully rather than SIGILL. --- source/CMakeLists.txt | 55 ++++++++++------ source/common/CMakeLists.txt | 18 ++++-- source/common/aarch64/cpu.h | 120 +++++++++++++++++++++++++++++++++++ source/common/cpu.cpp | 17 +---- source/common/param.cpp | 2 + source/test/testbench.cpp | 6 ++ 6 files changed, 177 insertions(+), 41 deletions(-) create mode 100644 source/common/aarch64/cpu.h diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 908980675..f177c4522 100755 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -87,12 +87,28 @@ elseif(ARM64MATCH GREATER "-1") option(AARCH64_WARNINGS_AS_ERRORS "Build with -Werror for AArch64 Intrinsics files" OFF) + option(AARCH64_RUNTIME_CPU_DETECT "Enable AArch64 run-time CPU feature detection" ON) + if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux") + set(AARCH64_RUNTIME_CPU_DETECT OFF CACHE BOOL "" FORCE) + message(STATUS "Run-time CPU feature detection unsupported on this platform") + endif() + # Options for manually enabling/disabling AArch64 SIMD extensions. 
option(ENABLE_NEON "Enable Neon" ON) option(ENABLE_NEON_DOTPROD "Enable Neon DotProd" ON) option(ENABLE_NEON_I8MM "Enable Neon I8MM" ON) option(ENABLE_SVE "Enable SVE" ON) option(ENABLE_SVE2 "Enable SVE2" ON) + + # Compiler flags for AArch64 extensions. + set(AARCH64_NEON_FLAG "-march=armv8-a") + # Neon DotProd is mandatory from Armv8.4. + set(AARCH64_NEON_DOTPROD_FLAG "-march=armv8.2-a+dotprod") + # Neon I8MM is mandatory from Armv8.6. + set(AARCH64_NEON_I8MM_FLAG "-march=armv8.2-a+dotprod+i8mm") + set(AARCH64_SVE_FLAG "-march=armv8.2-a+dotprod+i8mm+sve") + # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod and +sve. + set(AARCH64_SVE2_FLAG "-march=armv9-a+i8mm+sve2") else() message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown") message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}") @@ -268,7 +284,13 @@ if(GCC) set(CPU_HAS_NEON 1) add_definitions(-DX265_ARCH_ARM64=1) - if(CROSS_COMPILE_ARM64) + if (AARCH64_RUNTIME_CPU_DETECT) + add_definitions(-DAARCH64_RUNTIME_CPU_DETECT=1) + message(STATUS "Configuring build for run-time CPU feature detection") + endif() + + if(AARCH64_RUNTIME_CPU_DETECT OR CROSS_COMPILE_ARM64) + # Add all extensions when compiling for run-time CPU feature detection or cross compiling. set(CPU_HAS_NEON_DOTPROD 1) set(CPU_HAS_NEON_I8MM 1) set(CPU_HAS_SVE 1) @@ -280,7 +302,7 @@ if(GCC) find_package(SVE) find_package(SVE2) else() - message(STATUS "Compile time feature detection unsupported on this platform") + message(STATUS "Compile-time CPU feature detection unsupported on this platform") endif() endif() @@ -312,33 +334,25 @@ if(GCC) if(CPU_HAS_NEON) message(STATUS "Found Neon") - set(ARM_ARGS -O3 -march=armv8-a) add_definitions(-DHAVE_NEON=1) endif() if(CPU_HAS_NEON_DOTPROD) - # Neon DotProd is mandatory from Armv8.4. 
message(STATUS "Found Neon DotProd") - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod) add_definitions(-DHAVE_NEON_DOTPROD=1) endif() if(CPU_HAS_NEON_I8MM) - # Neon I8MM is mandatory from Armv8.6. message(STATUS "Found Neon I8MM") - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm) add_definitions(-DHAVE_NEON_I8MM=1) endif() if(CPU_HAS_SVE) message(STATUS "Found SVE") - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve) add_definitions(-DHAVE_SVE=1) endif() if(CPU_HAS_SVE2) message(STATUS "Found SVE2") - # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2) add_definitions(-DHAVE_SVE2=1) endif() - set(ARM_ARGS ${ARM_ARGS} -fPIC) + set(ARM_ARGS -O3 -fPIC) # Do not allow implicit vector type conversions in Clang builds (this # is already the default in GCC builds). check_cxx_compiler_flag(-flax-vector-conversions=none CC_HAS_FLAX_VEC_CONV_NONE) @@ -356,7 +370,8 @@ int main() { return 0; }") set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS}) # CMAKE_REQUIRED_FLAGS requires a space-delimited string, whereas # ARM_ARGS is defined and used elsewhere as a ;-list. - foreach(ARM_ARG ${ARM_ARGS}) + # Add `-march=...+sve` so the test functions correctly with Clang. 
+ foreach(ARM_ARG ${ARM_ARGS} ${AARCH64_SVE_FLAG}) string(APPEND CMAKE_REQUIRED_FLAGS " ${ARM_ARG}") endforeach() check_c_source_compiles("${SVE_HEADER_TEST}" SVE_HEADER_C_TEST_COMPILED) @@ -705,18 +720,18 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_NEON_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() - if(CPU_HAS_SVE2) - foreach(ASM ${ARM_ASMS_SVE2}) + if(CPU_HAS_NEON_DOTPROD) + foreach(ASM ${ARM_ASMS_NEON_DOTPROD}) set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM}) list(APPEND ASM_SRCS ${ASM_SRC}) list(APPEND ASM_OBJS ${ASM}.${SUFFIX}) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_NEON_DOTPROD_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() @@ -728,19 +743,19 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() - if(CPU_HAS_NEON_DOTPROD) - foreach(ASM ${ARM_ASMS_NEON_DOTPROD}) + if(CPU_HAS_SVE2) + foreach(ASM ${ARM_ASMS_SVE2}) set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM}) list(APPEND ASM_SRCS ${ASM_SRC}) list(APPEND ASM_OBJS ${ASM}.${SUFFIX}) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ASM_FLAGS} ${AARCH64_SVE2_FLAG} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index 
dc4a74107..aacc0ef62 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -103,6 +103,7 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) add_definitions(-DAUTO_VECTORIZE=1) endif() + # Add Arm intrinsics files here. set(C_SRCS_NEON asm-primitives.cpp pixel-prim.h pixel-prim.cpp filter-prim.h filter-prim.cpp dct-prim.h dct-prim.cpp loopfilter-prim.cpp loopfilter-prim.h intrapred-prim.cpp arm64-utils.cpp arm64-utils.h fun-decls.h sao-prim.cpp mem-neon.h) set(C_SRCS_NEON_DOTPROD filter-neon-dotprod.cpp) set(C_SRCS_NEON_I8MM filter-neon-i8mm.cpp) @@ -110,11 +111,11 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(C_SRCS_SVE2 sao-prim-sve2.cpp) enable_language(ASM) - # add ARM assembly/intrinsic files here + # Add Arm assembly files here. set(A_SRCS asm.S mc-a.S mc-a-common.S sad-a.S pixel-util.S pixel-util-common.S p2s.S p2s-common.S blockcopy8.S blockcopy8-common.S ssd-a.S ssd-a-common.S intrapred.S dct.S) + set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S) set(A_SRCS_SVE asm-sve.S blockcopy8-sve.S p2s-sve.S pixel-util-sve.S) set(A_SRCS_SVE2 mc-a-sve2.S pixel-util-sve2.S ssd-a-sve2.S) - set(A_SRCS_NEON_DOTPROD sad-neon-dotprod.S ssd-neon-dotprod.S) set(VEC_PRIMITIVES) set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") @@ -123,29 +124,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension") foreach(SRC ${C_SRCS_NEON}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_FLAG}) endforeach() - if(CPU_HAS_NEON_I8MM) - foreach(SRC ${C_SRCS_NEON_I8MM}) + if(CPU_HAS_NEON_DOTPROD) + foreach(SRC ${C_SRCS_NEON_DOTPROD}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_DOTPROD_FLAG}) endforeach() endif() - 
if(CPU_HAS_NEON_DOTPROD) - foreach(SRC ${C_SRCS_NEON_DOTPROD}) + if(CPU_HAS_NEON_I8MM) + foreach(SRC ${C_SRCS_NEON_I8MM}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_NEON_I8MM_FLAG}) endforeach() endif() if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE) foreach(SRC ${C_SRCS_SVE}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_SVE_FLAG}) endforeach() endif() if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE) foreach(SRC ${C_SRCS_SVE2}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties(aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${AARCH64_SVE2_FLAG}) endforeach() endif() diff --git a/source/common/aarch64/cpu.h b/source/common/aarch64/cpu.h new file mode 100644 index 000000000..88ce2e310 --- /dev/null +++ b/source/common/aarch64/cpu.h @@ -0,0 +1,120 @@ +/***************************************************************************** + * Copyright (C) 2024 MulticoreWare, Inc + * + * Authors: Hari Limaye <hari.lim...@arm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA. + * + * This program is also available under a commercial proprietary license. + * For more information, contact us at license @ x265.com. 
+ *****************************************************************************/ + +#ifndef X265_COMMON_AARCH64_CPU_H +#define X265_COMMON_AARCH64_CPU_H + +#include "x265.h" + +#if AARCH64_RUNTIME_CPU_DETECT + +#if defined(__linux__) + +#include <sys/auxv.h> + +#define X265_AARCH64_HWCAP_ASIMDDP (1 << 20) +#define X265_AARCH64_HWCAP_SVE (1 << 22) +#define X265_AARCH64_HWCAP2_SVE2 (1 << 1) +#define X265_AARCH64_HWCAP2_I8MM (1 << 13) + +static inline int aarch64_get_cpu_flags() +{ + int flags = 0; + +#if HAVE_NEON_DOTPROD || HAVE_SVE + unsigned long hwcap = getauxval(AT_HWCAP); +#endif +#if HAVE_NEON_I8MM || HAVE_SVE2 + unsigned long hwcap2 = getauxval(AT_HWCAP2); +#endif + +#if HAVE_NEON + flags |= X265_CPU_NEON; +#endif +#if HAVE_NEON_DOTPROD + if (hwcap & X265_AARCH64_HWCAP_ASIMDDP) flags |= X265_CPU_NEON_DOTPROD; +#endif +#if HAVE_NEON_I8MM + if (hwcap2 & X265_AARCH64_HWCAP2_I8MM) flags |= X265_CPU_NEON_I8MM; +#endif +#if HAVE_SVE + if (hwcap & X265_AARCH64_HWCAP_SVE) flags |= X265_CPU_SVE; +#endif +#if HAVE_SVE2 + if (hwcap2 & X265_AARCH64_HWCAP2_SVE2) flags |= X265_CPU_SVE2; +#endif + + return flags; +} + +#else // defined(__linux__) +#error \ + "Run-time CPU feature detection selected, but no detection method" \ + "available for your platform. Rerun cmake configure with" \ + "-DAARCH64_RUNTIME_CPU_DETECT=OFF." +#endif // defined(__linux__) + +static inline int aarch64_cpu_detect() +{ + int flags = aarch64_get_cpu_flags(); + + // Restrict flags: FEAT_I8MM assumes that FEAT_DotProd is available. + if (!(flags & X265_CPU_NEON_DOTPROD)) flags &= ~X265_CPU_NEON_I8MM; + + // Restrict flags: SVE assumes that FEAT_{DotProd,I8MM} are available. + if (!(flags & X265_CPU_NEON_DOTPROD)) flags &= ~X265_CPU_SVE; + if (!(flags & X265_CPU_NEON_I8MM)) flags &= ~X265_CPU_SVE; + + // Restrict flags: SVE2 assumes that FEAT_SVE is available. 
+ if (!(flags & X265_CPU_SVE)) flags &= ~X265_CPU_SVE2; + + return flags; +} + +#else // if AARCH64_RUNTIME_CPU_DETECT + +static inline int aarch64_cpu_detect() +{ + int flags = 0; + +#if HAVE_NEON + flags |= X265_CPU_NEON; +#endif +#if HAVE_NEON_DOTPROD + flags |= X265_CPU_NEON_DOTPROD; +#endif +#if HAVE_NEON_I8MM + flags |= X265_CPU_NEON_I8MM; +#endif +#if HAVE_SVE + flags |= X265_CPU_SVE; +#endif +#if HAVE_SVE2 + flags |= X265_CPU_SVE2; +#endif + return flags; +} + +#endif // if AARCH64_RUNTIME_CPU_DETECT + +#endif // ifndef X265_COMMON_AARCH64_CPU_H diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp index 485aa681f..ae0907890 100644 --- a/source/common/cpu.cpp +++ b/source/common/cpu.cpp @@ -390,6 +390,7 @@ uint32_t cpu_detect(bool benableavx512) } #elif X265_ARCH_ARM64 +#include "aarch64/cpu.h" uint32_t cpu_detect(bool benableavx512) { @@ -397,21 +398,7 @@ uint32_t cpu_detect(bool benableavx512) int flags = 0; #ifdef ENABLE_ASSEMBLY - #if HAVE_NEON - flags |= X265_CPU_NEON; - #endif - #if HAVE_NEON_DOTPROD - flags |= X265_CPU_NEON_DOTPROD; - #endif - #if HAVE_NEON_I8MM - flags |= X265_CPU_NEON_I8MM; - #endif - #if HAVE_SVE - flags |= X265_CPU_SVE; - #endif - #if HAVE_SVE2 - flags |= X265_CPU_SVE2; - #endif + flags = aarch64_cpu_detect(); #endif return flags; diff --git a/source/common/param.cpp b/source/common/param.cpp index da039d914..1beb3c056 100755 --- a/source/common/param.cpp +++ b/source/common/param.cpp @@ -1587,8 +1587,10 @@ int parseCpuName(const char* value, bool& bError, bool bEnableavx512) } free(buf); +#if X265_ARCH_X86 if ((cpu & X265_CPU_SSSE3) && !(cpu & X265_CPU_SSE2_IS_SLOW)) cpu |= X265_CPU_SSE2_IS_FAST; +#endif } return cpu; diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp index ac93e37b3..b8ef760f2 100644 --- a/source/test/testbench.cpp +++ b/source/test/testbench.cpp @@ -120,6 +120,7 @@ int main(int argc, char *argv[]) } else if (!strncmp(name, "cpuid", strlen(name))) { + int cpu_detect_cpuid = cpuid; bool 
bError = false; cpuid = parseCpuName(value, bError, enableavx512); if (bError) @@ -127,6 +128,11 @@ int main(int argc, char *argv[]) printf("Invalid CPU name: %s\n", value); return 1; } + else if ((cpuid & cpu_detect_cpuid) != cpuid) + { + printf("Feature detection conflicts with provided --cpuid: %s\n", value); + return 1; + } i += 2; } else if (!strncmp(name, "testbench", strlen(name))) -- 2.42.1
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel