Fixed typos and updated TODO. Please find the updated patch below: From c116db02bd50faa59c3d2b1c63bd6816d6dec2a0 Mon Sep 17 00:00:00 2001 From: Logaprakash Ramajayam <logaprakash.ramaja...@multicorewareinc.com> Date: Thu, 3 Oct 2024 05:24:16 -0700 Subject: [PATCH] AArch64: Runtime CPU feature detection
--- .../make-aarch64-w64-mingw32-Makefiles.sh | 8 ++ .../msys/toolchain-aarch64-w64-mingw32.cmake | 8 ++ source/CMakeLists.txt | 21 +++-- source/common/CMakeLists.txt | 5 ++ source/common/cpu.cpp | 81 ++++++++++++++++++- 5 files changed, 114 insertions(+), 9 deletions(-) create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh new file mode 100644 index 000000000..eceffa4a9 --- /dev/null +++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh @@ -0,0 +1,8 @@ +#!/bin/sh + +# This will generate a cross-compile environment, compiling an aarch64 +# Win64 target from a 32bit MinGW32 host environment. If your MinGW +# install is 64bit, you can use the native compiler batch file: +# make-Makefiles.sh + +cmake -G "MSYS Makefiles" -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source && cmake-gui ../../source diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake b/build/msys/toolchain-aarch64-w64-mingw32.cmake new file mode 100644 index 000000000..6607bdf64 --- /dev/null +++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake @@ -0,0 +1,8 @@ +SET(CMAKE_SYSTEM_NAME Windows) +set(CMAKE_SYSTEM_PROCESSOR aarch64) +SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc) +SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++) +SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres) +SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib) +SET(CMAKE_ASM_YASM_COMPILER yasm) +SET(CROSS_COMPILE_ARM64 1) diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt index 13bc8ccfe..d1fe38559 100755 --- a/source/CMakeLists.txt +++ b/source/CMakeLists.txt @@ -303,10 +303,12 @@ if(GCC) endif() endif() + set(ARM64_ARCH_ARGS "-O3") if(CPU_HAS_NEON_DOTPROD) # Neon DotProd is mandatory from Armv8.4. 
message(STATUS "Found Neon DotProd") - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod) + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod) + set(ARM_ARGS -O3) add_definitions(-DHAVE_NEON_DOTPROD=1) endif() if(CPU_HAS_NEON_I8MM) @@ -316,7 +318,8 @@ if(GCC) if(NOT CPU_HAS_NEON_DOTPROD) message(FATAL_ERROR "Unsupported AArch64 feature combination (Neon I8MM without Neon DotProd)") endif() - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm) + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm) + set(ARM_ARGS -O3) add_definitions(-DHAVE_NEON_I8MM=1) endif() if(CPU_HAS_SVE) @@ -325,13 +328,15 @@ if(GCC) if(NOT CPU_HAS_NEON_I8MM) message(FATAL_ERROR "Unsupported AArch64 feature combination (SVE without Neon I8MM)") endif() - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve) + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve) + set(ARM_ARGS -O3) add_definitions(-DHAVE_SVE=1) endif() if(CPU_HAS_SVE2) message(STATUS "Found SVE2") # SVE2 is only available from Armv9.0, and armv9-a implies +dotprod - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2) + set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2) + set(ARM_ARGS -O3) add_definitions(-DHAVE_SVE2=1) endif() set(ARM_ARGS ${ARM_ARGS} -fPIC) @@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() if(CPU_HAS_SVE2) @@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() @@ -715,7 +720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND 
${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() @@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) add_custom_command( OUTPUT ${ASM}.${SUFFIX} COMMAND ${CMAKE_CXX_COMPILER} - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o ${ASM}.${SUFFIX} DEPENDS ${ASM_SRC}) endforeach() endif() diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt index dc4a74107..33025cada 100644 --- a/source/common/CMakeLists.txt +++ b/source/common/CMakeLists.txt @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR CROSS_COMPILE_ARM64)) set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL "Arm Assembly Sources that use the Neon DotProd extension") foreach(SRC ${C_SRCS_NEON}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) endforeach() if(CPU_HAS_NEON_I8MM) foreach(SRC ${C_SRCS_NEON_I8MM}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) endforeach() endif() if(CPU_HAS_NEON_DOTPROD) foreach(SRC ${C_SRCS_NEON_DOTPROD}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) endforeach() endif() if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE) foreach(SRC ${C_SRCS_SVE}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) endforeach() endif() if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE) foreach(SRC ${C_SRCS_SVE2}) set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) + set_source_files_properties( aarch64/${SRC} PROPERTIES COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) 
endforeach() endif() diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp index 61cdaadfb..2d4b15dc9 100644 --- a/source/common/cpu.cpp +++ b/source/common/cpu.cpp @@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512) #elif X265_ARCH_ARM64 -uint32_t cpu_detect(bool benableavx512) +#if defined(_MSC_VER) || defined(__APPLE__) +uint32_t cpu_detect(bool /*benableavx512*/) { int flags = 0; @@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512) return flags; } +// TODO: Remove isOryonCPU() once Windows defines PF_ flag for I8MM on supported ARM64 devices +#elif defined(__MINGW64__) // Windows+Aarch64 + +#include <windows.h> +#include <processthreadsapi.h> + +bool isOryonCPU() +{ + + char processorName[128]; + DWORD bufferSize = 128; + + LONG result = RegGetValue(HKEY_LOCAL_MACHINE, "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName, &bufferSize); + if (strstr(processorName, "Oryon") != NULL) + { + return true; + } + else + { + return false; + } +} +uint32_t cpu_detect(bool /*benableavx512*/) +{ + + int flags = 0; + + #if HAVE_NEON + flags |= X265_CPU_NEON; // All of ARM64 has NEON + #endif + #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) + flags |= IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ? X265_CPU_NEON_DOTPROD : 0; + #endif + #if HAVE_NEON_I8MM + flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0; + #endif + #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) + flags |= IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? X265_CPU_SVE : 0; + #endif + #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) + flags |= IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ? 
X265_CPU_SVE2 : 0; + #endif + + return flags; +} // end of Windows+Aarch64 + +#else // Linux+Aarch64 + +#include <asm/hwcap.h> +#include <sys/auxv.h> + +uint32_t cpu_detect(bool /*benableavx512*/) +{ + unsigned long hwcaps = getauxval(AT_HWCAP); + unsigned long hwcaps2 = getauxval(AT_HWCAP2); + + int flags = 0; + + #if HAVE_NEON + flags |= X265_CPU_NEON; // All of ARM64 has NEON + #endif + #if HAVE_NEON_DOTPROD + flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0); + #endif + #if HAVE_NEON_I8MM + flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0); + #endif + #if HAVE_SVE + flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0); + #endif + #if HAVE_SVE2 + flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0); + #endif + + return flags; +} +#endif // end of Linux+AArch64 + #elif X265_ARCH_POWER8 uint32_t cpu_detect(bool benableavx512) -- 2.45.2 On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh < dash.sathyanaraya...@multicorewareinc.com> wrote: > From 7d2353aaf7509721461c141f2800962c15ff440c Mon Sep 17 00:00:00 2001 > From: Logaprakash Ramajayam <logaprakash.ramaja...@multicorewareinc.com> > Date: Wed, 2 Oct 2024 21:59:59 -0700 > Subject: [PATCH] AArch64: Runtime CPU feature detection > > --- > .../make-aarch64-w64-mingw32-Makefiles.sh | 8 ++ > .../msys/toolchain-aarch64-w64-mingw32.cmake | 8 ++ > source/CMakeLists.txt | 21 +++-- > source/common/CMakeLists.txt | 5 ++ > source/common/cpu.cpp | 81 ++++++++++++++++++- > 5 files changed, 114 insertions(+), 9 deletions(-) > create mode 100644 build/msys/make-aarch64-w64-mingw32-Makefiles.sh > create mode 100644 build/msys/toolchain-aarch64-w64-mingw32.cmake > > diff --git a/build/msys/make-aarch64-w64-mingw32-Makefiles.sh > b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh > new file mode 100644 > index 000000000..eceffa4a9 > --- /dev/null > +++ b/build/msys/make-aarch64-w64-mingw32-Makefiles.sh > @@ -0,0 +1,8 @@ > +#!/bin/sh > + > +# This will generate a cross-compile environment, compiling an aarch64 > +# Win64 
target from a 32bit MinGW32 host environment. If your MinGW > +# install is 64bit, you can use the native compiler batch file: > +# make-Makefiles.sh > + > +cmake -G "MSYS Makefiles" > -DCMAKE_TOOLCHAIN_FILE=toolchain-aarch64-w64-mingw32.cmake ../../source && > cmake-gui ../../source > diff --git a/build/msys/toolchain-aarch64-w64-mingw32.cmake > b/build/msys/toolchain-aarch64-w64-mingw32.cmake > new file mode 100644 > index 000000000..6607bdf64 > --- /dev/null > +++ b/build/msys/toolchain-aarch64-w64-mingw32.cmake > @@ -0,0 +1,8 @@ > +SET(CMAKE_SYSTEM_NAME Windows) > +set(CMAKE_SYSTEM_PROCESSOR aarch64) > +SET(CMAKE_C_COMPILER aarch64-w64-mingw32-gcc) > +SET(CMAKE_CXX_COMPILER aarch64-w64-mingw32-g++) > +SET(CMAKE_RC_COMPILER aarch64-w64-mingw32-windres) > +SET(CMAKE_RANLIB aarch64-w64-mingw32-ranlib) > +SET(CMAKE_ASM_YASM_COMPILER yasm) > +SET(CROSS_COMPILE_ARM64 1) > diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt > index 13bc8ccfe..d1fe38559 100755 > --- a/source/CMakeLists.txt > +++ b/source/CMakeLists.txt > @@ -303,10 +303,12 @@ if(GCC) > endif() > endif() > > + set(ARM64_ARCH_ARGS "-O3") > if(CPU_HAS_NEON_DOTPROD) > # Neon DotProd is mandatory from Armv8.4. 
> message(STATUS "Found Neon DotProd") > - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod) > + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod) > + set(ARM_ARGS -O3) > add_definitions(-DHAVE_NEON_DOTPROD=1) > endif() > if(CPU_HAS_NEON_I8MM) > @@ -316,7 +318,8 @@ if(GCC) > if(NOT CPU_HAS_NEON_DOTPROD) > message(FATAL_ERROR "Unsupported AArch64 feature > combination (Neon I8MM without Neon DotProd)") > endif() > - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm) > + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm) > + set(ARM_ARGS -O3) > add_definitions(-DHAVE_NEON_I8MM=1) > endif() > if(CPU_HAS_SVE) > @@ -325,13 +328,15 @@ if(GCC) > if(NOT CPU_HAS_NEON_I8MM) > message(FATAL_ERROR "Unsupported AArch64 feature > combination (SVE without Neon I8MM)") > endif() > - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve) > + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve) > + set(ARM_ARGS -O3) > add_definitions(-DHAVE_SVE=1) > endif() > if(CPU_HAS_SVE2) > message(STATUS "Found SVE2") > # SVE2 is only available from Armv9.0, and armv9-a implies > +dotprod > - set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2) > + set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2) > + set(ARM_ARGS -O3) > add_definitions(-DHAVE_SVE2=1) > endif() > set(ARM_ARGS ${ARM_ARGS} -fPIC) > @@ -692,7 +697,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) > add_custom_command( > OUTPUT ${ASM}.${SUFFIX} > COMMAND ${CMAKE_CXX_COMPILER} > - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o > ${ASM}.${SUFFIX} > + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c > ${ASM_SRC} -o ${ASM}.${SUFFIX} > DEPENDS ${ASM_SRC}) > endforeach() > if(CPU_HAS_SVE2) > @@ -703,7 +708,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) > add_custom_command( > OUTPUT ${ASM}.${SUFFIX} > COMMAND ${CMAKE_CXX_COMPILER} > - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o > ${ASM}.${SUFFIX} > + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c > ${ASM_SRC} -o ${ASM}.${SUFFIX} > DEPENDS ${ASM_SRC}) > endforeach() > endif() > @@ -715,7 
+720,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) > add_custom_command( > OUTPUT ${ASM}.${SUFFIX} > COMMAND ${CMAKE_CXX_COMPILER} > - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o > ${ASM}.${SUFFIX} > + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c > ${ASM_SRC} -o ${ASM}.${SUFFIX} > DEPENDS ${ASM_SRC}) > endforeach() > endif() > @@ -727,7 +732,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) > add_custom_command( > OUTPUT ${ASM}.${SUFFIX} > COMMAND ${CMAKE_CXX_COMPILER} > - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o > ${ASM}.${SUFFIX} > + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c > ${ASM_SRC} -o ${ASM}.${SUFFIX} > DEPENDS ${ASM_SRC}) > endforeach() > endif() > diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt > index dc4a74107..33025cada 100644 > --- a/source/common/CMakeLists.txt > +++ b/source/common/CMakeLists.txt > @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR > CROSS_COMPILE_ARM64)) > set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL > "Arm Assembly Sources that use the Neon DotProd extension") > foreach(SRC ${C_SRCS_NEON}) > set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) > + set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS > ${ARM64_ARCH_ARGS} ) > endforeach() > > if(CPU_HAS_NEON_I8MM) > foreach(SRC ${C_SRCS_NEON_I8MM}) > set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) > + set_source_files_properties( aarch64/${SRC} PROPERTIES > COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) > endforeach() > endif() > > if(CPU_HAS_NEON_DOTPROD) > foreach(SRC ${C_SRCS_NEON_DOTPROD}) > set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) > + set_source_files_properties( aarch64/${SRC} PROPERTIES > COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) > endforeach() > endif() > > if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE) > foreach(SRC ${C_SRCS_SVE}) > set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) > + set_source_files_properties( aarch64/${SRC} PROPERTIES > COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) > 
endforeach() > endif() > > if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE) > foreach(SRC ${C_SRCS_SVE2}) > set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) > + set_source_files_properties( aarch64/${SRC} PROPERTIES > COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) > endforeach() > endif() > > diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp > index 61cdaadfb..a2b0ac081 100644 > --- a/source/common/cpu.cpp > +++ b/source/common/cpu.cpp > @@ -391,7 +391,8 @@ uint32_t cpu_detect(bool benableavx512) > > #elif X265_ARCH_ARM64 > > -uint32_t cpu_detect(bool benableavx512) > +#if defined(_MSC_VER) || defined(__APPLE__) > +uint32_t cpu_detect(bool /*benableavx512*/) > { > int flags = 0; > > @@ -414,6 +415,84 @@ uint32_t cpu_detect(bool benableavx512) > return flags; > } > > +// TODO: Support ARM on Windows > +#elif defined(__MINGW64__) > + > +#include <windows.h> > +#include <processthreadsapi.h> > + > +bool isOryonCPU() > +{ > + > + char processorName[128]; > + DWORD bufferSize = 128; > + > + LONG result = RegGetValue(HKEY_LOCAL_MACHINE, > "HARDWARE\\DESCRIPTION\\System\\CentralProcessor\\0", > "ProcessorNameString", RRF_RT_ANY, NULL, (PVOID)&processorName, > &bufferSize); > + if (strstr(processorName, "Oryon") != NULL) > + { > + return true; > + } > + else > + { > + return false; > + } > +} > +uint32_t cpu_detect(bool /*benableavx512*/) > +{ > + > + int flags = 0; > + > + #if HAVE_NEON > + flags |= X265_CPU_NEON; // All of ARM64 has NEON > + #endif > + #if HAVE_NEON_DOTPROD && defined(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) > + flags |= > IsProcessorFeaturePresent(PF_ARM_V82_DP_INSTRUCTIONS_AVAILABLE) ? > X265_CPU_NEON_DOTPROD : 0; > + #endif > + #if HAVE_NEON_I8MM > + flags |= isOryonCPU() ? X265_CPU_NEON_I8MM : 0; > + #endif > + #if HAVE_SVE && defined(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) > + flags |= > IsProcessorFeaturePresent(PF_ARM_SVE_INSTRUCTIONS_AVAILABLE) ? 
X265_CPU_SVE > : 0; > + #endif > + #if HAVE_SVE2 && defined(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) > + flags |= > IsProcessorFeaturePresent(PF_ARM_SVE2_INSTRUCTIONS_AVAILABLE) ? > X265_CPU_SVE2 : 0; > + #endif > + > + return flags; > +} > + > +#else // Linux+Aarch64 > + > +#include <asm/hwcap.h> > +#include <sys/auxv.h> > + > +uint32_t cpu_detect(bool /*benableavx5128*/) > +{ > + unsigned long hwcaps = getauxval(AT_HWCAP); > + unsigned long hwcaps2 = getauxval(AT_HWCAP2); > + > + int flags = 0; > + > + #if HAVE_NEON > + flags |= X265_CPU_NEON; // All of ARM64 has NEON > + #endif > + #if HAVE_NEON_DOTPROD > + flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0); > + #endif > + #if HAVE_NEON_I8MM > + flags |= (hwcaps2 & HWCAP2_I8MM ? X265_CPU_NEON_I8MM : 0); > + #endif > + #if HAVE_SVE > + flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0); > + #endif > + #if HAVE_SVE2 > + flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0); > + #endif > + > + return flags; > +} > +#endif // end of Linux+AArch64 > + > #elif X265_ARCH_POWER8 > > uint32_t cpu_detect(bool benableavx512) > -- > 2.45.2 > > > On Thu, Oct 3, 2024 at 3:26 PM Dash Santosh < > dash.sathyanaraya...@multicorewareinc.com> wrote: > >> Hi Hari, >> Thanks for spotting this. Also added support for Windows on ARM. 
Please >> find below the updated patch: >> >> >> On Thu, Sep 26, 2024 at 11:43 AM Dash Santosh < >> dash.sathyanaraya...@multicorewareinc.com> wrote: >> >>> From e9614d170f93f3ad4f01e95abfed0a260f218bd5 Mon Sep 17 00:00:00 2001 >>> From: Min Chen <chenm...@163.com> >>> Date: Sat, 14 Sep 2024 14:25:28 -0700 >>> Subject: [PATCH] AArch64: Runtime CPU feature detection >>> >>> --- >>> source/CMakeLists.txt | 20 ++++++++++++-------- >>> source/common/CMakeLists.txt | 5 +++++ >>> source/common/cpu.cpp | 33 +++++++++++++++++++++++++++++++++ >>> 3 files changed, 50 insertions(+), 8 deletions(-) >>> >>> diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt >>> index 37b83f959..32a99206f 100755 >>> --- a/source/CMakeLists.txt >>> +++ b/source/CMakeLists.txt >>> @@ -306,7 +306,8 @@ if(GCC) >>> if(CPU_HAS_NEON_DOTPROD) >>> # Neon DotProd is mandatory from Armv8.4. >>> message(STATUS "Found Neon DotProd") >>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod) >>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod) >>> + set(ARM_ARGS -O3) >>> add_definitions(-DHAVE_NEON_DOTPROD=1) >>> endif() >>> if(CPU_HAS_NEON_I8MM) >>> @@ -316,7 +317,8 @@ if(GCC) >>> if(NOT CPU_HAS_NEON_DOTPROD) >>> message(FATAL_ERROR "Unsupported AArch64 feature >>> combination (Neon I8MM without Neon DotProd)") >>> endif() >>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm) >>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm) >>> + set(ARM_ARGS -O3) >>> add_definitions(-DHAVE_NEON_I8MM=1) >>> endif() >>> if(CPU_HAS_SVE) >>> @@ -325,13 +327,15 @@ if(GCC) >>> if(NOT CPU_HAS_NEON_I8MM) >>> message(FATAL_ERROR "Unsupported AArch64 feature >>> combination (SVE without Neon I8MM)") >>> endif() >>> - set(ARM_ARGS -O3 -march=armv8.2-a+dotprod+i8mm+sve) >>> + set(ARM64_ARCH_ARGS -march=armv8.2-a+dotprod+i8mm+sve) >>> + set(ARM_ARGS -O3) >>> add_definitions(-DHAVE_SVE=1) >>> endif() >>> if(CPU_HAS_SVE2) >>> message(STATUS "Found SVE2") >>> # SVE2 is only available from Armv9.0, and armv9-a implies >>> +dotprod >>> 
- set(ARM_ARGS -O3 -march=armv9-a+i8mm+sve2) >>> + set(ARM64_ARCH_ARGS -march=armv9-a+i8mm+sve2) >>> + set(ARM_ARGS -O3) >>> add_definitions(-DHAVE_SVE2=1) >>> endif() >>> set(ARM_ARGS ${ARM_ARGS} -fPIC) >>> @@ -692,7 +696,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) >>> add_custom_command( >>> OUTPUT ${ASM}.${SUFFIX} >>> COMMAND ${CMAKE_CXX_COMPILER} >>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o >>> ${ASM}.${SUFFIX} >>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c >>> ${ASM_SRC} -o ${ASM}.${SUFFIX} >>> DEPENDS ${ASM_SRC}) >>> endforeach() >>> if(CPU_HAS_SVE2) >>> @@ -703,7 +707,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) >>> add_custom_command( >>> OUTPUT ${ASM}.${SUFFIX} >>> COMMAND ${CMAKE_CXX_COMPILER} >>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o >>> ${ASM}.${SUFFIX} >>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c >>> ${ASM_SRC} -o ${ASM}.${SUFFIX} >>> DEPENDS ${ASM_SRC}) >>> endforeach() >>> endif() >>> @@ -715,7 +719,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) >>> add_custom_command( >>> OUTPUT ${ASM}.${SUFFIX} >>> COMMAND ${CMAKE_CXX_COMPILER} >>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o >>> ${ASM}.${SUFFIX} >>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c >>> ${ASM_SRC} -o ${ASM}.${SUFFIX} >>> DEPENDS ${ASM_SRC}) >>> endforeach() >>> endif() >>> @@ -727,7 +731,7 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) >>> add_custom_command( >>> OUTPUT ${ASM}.${SUFFIX} >>> COMMAND ${CMAKE_CXX_COMPILER} >>> - ARGS ${ARM_ARGS} ${ASM_FLAGS} -c ${ASM_SRC} -o >>> ${ASM}.${SUFFIX} >>> + ARGS ${ARM_ARGS} ${ARM64_ARCH_ARGS} ${ASM_FLAGS} -c >>> ${ASM_SRC} -o ${ASM}.${SUFFIX} >>> DEPENDS ${ASM_SRC}) >>> endforeach() >>> endif() >>> diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt >>> index dc4a74107..33025cada 100644 >>> --- a/source/common/CMakeLists.txt >>> +++ b/source/common/CMakeLists.txt >>> @@ -123,29 +123,34 @@ if(ENABLE_ASSEMBLY AND (ARM64 OR >>> 
CROSS_COMPILE_ARM64)) >>> set(ARM_ASMS_NEON_DOTPROD "${A_SRCS_NEON_DOTPROD}" CACHE INTERNAL >>> "Arm Assembly Sources that use the Neon DotProd extension") >>> foreach(SRC ${C_SRCS_NEON}) >>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) >>> + set_source_files_properties( ${SRC} PROPERTIES COMPILE_FLAGS >>> ${ARM64_ARCH_ARGS} ) >>> endforeach() >>> >>> if(CPU_HAS_NEON_I8MM) >>> foreach(SRC ${C_SRCS_NEON_I8MM}) >>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) >>> + set_source_files_properties( aarch64/${SRC} PROPERTIES >>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) >>> endforeach() >>> endif() >>> >>> if(CPU_HAS_NEON_DOTPROD) >>> foreach(SRC ${C_SRCS_NEON_DOTPROD}) >>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) >>> + set_source_files_properties( aarch64/${SRC} PROPERTIES >>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) >>> endforeach() >>> endif() >>> >>> if(CPU_HAS_SVE AND HAVE_SVE_BRIDGE) >>> foreach(SRC ${C_SRCS_SVE}) >>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) >>> + set_source_files_properties( aarch64/${SRC} PROPERTIES >>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) >>> endforeach() >>> endif() >>> >>> if(CPU_HAS_SVE2 AND HAVE_SVE_BRIDGE) >>> foreach(SRC ${C_SRCS_SVE2}) >>> set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) >>> + set_source_files_properties( aarch64/${SRC} PROPERTIES >>> COMPILE_FLAGS ${ARM64_ARCH_ARGS} ) >>> endforeach() >>> endif() >>> >>> diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp >>> index 61cdaadfb..24c60ff0e 100644 >>> --- a/source/common/cpu.cpp >>> +++ b/source/common/cpu.cpp >>> @@ -391,6 +391,8 @@ uint32_t cpu_detect(bool benableavx512) >>> >>> #elif X265_ARCH_ARM64 >>> >>> +// TODO: Support ARM on Windows >>> +#if _MSC_VER >>> uint32_t cpu_detect(bool benableavx512) >>> { >>> int flags = 0; >>> @@ -413,6 +415,37 @@ uint32_t cpu_detect(bool benableavx512) >>> >>> return flags; >>> } >>> +#else // Linux+Aarch64 >>> + >>> +#include <asm/hwcap.h> >>> +#include <sys/auxv.h> >>> + >>> +uint32_t cpu_detect(bool 
benableavx512) >>> +{ >>> + unsigned long hwcaps = getauxval(AT_HWCAP); >>> + unsigned long hwcaps2 = getauxval(AT_HWCAP2); >>> + >>> + int flags = 0; >>> + >>> + #if HAVE_NEON >>> + flags |= X265_CPU_NEON; // All of ARM64 has NEON >>> + #endif >>> + #if HAVE_NEON_DOTPROD >>> + flags |= (hwcaps & HWCAP_ASIMDDP ? X265_CPU_NEON_DOTPROD : 0); >>> + #endif >>> + #if HAVE_NEON_I8MM >>> + flags |= (hwcaps2 & HWCAP2_SVEI8MM ? X265_CPU_NEON_I8MM : 0); >>> + #endif >>> + #if HAVE_SVE >>> + flags |= (hwcaps & HWCAP_SVE ? X265_CPU_SVE : 0); >>> + #endif >>> + #if HAVE_SVE2 >>> + flags |= (hwcaps2 & HWCAP2_SVE2 ? X265_CPU_SVE2 : 0); >>> + #endif >>> + >>> + return flags; >>> +} >>> +#endif // end of Linux+AArch64 >>> >>> #elif X265_ARCH_POWER8 >>> >>> -- >>> 2.43.0.windows.1 >>> >>> >>> -- >>> >>> * <https://multicorewareinc.com/>* >>> <https://www.linkedin.com/company/multicoreware-inc/> >>> <https://twitter.com/MulticoreWare> >>> <https://www.facebook.com/multicoreware> >>> <https://www.youtube.com/channel/UCXZ1A1MzS5JwBqwBkNfsBBw?sub_confirmation=1> >>> <https://www.instagram.com/multicoreware.inc/> >>> >>> *Dash Santosh* >>> >>> *Research Engineer, Video Engineering* >>> >>> Mobile: +91 78679 43737 >>> >>> IndiQube Echo Point, Avinashi Road >>> >>> Coimbatore - 641 014 >>> >>> >>> >>> >>>
v3-0001-AArch64-Runtime-CPU-feature-detection.patch
Description: Binary data
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel