On Tue, Mar 17, 2020 at 9:50 AM Suyimeng <yimeng...@huawei.com> wrote:
> > > *From:* x265-devel [mailto:x265-devel-boun...@videolan.org] *On Behalf Of > *Gopi Satykrishna Akisetty > *Sent:* Monday, March 16, 2020 9:37 PM > *To:* Development for x265 <x265-devel@videolan.org> > *Subject:* Re: [x265] [PATCH] Add aarch64 support - Part 2 > > > > > > > > On Thu, Feb 27, 2020 at 8:04 AM Xiyuan Wang <wangxiyuan1...@gmail.com> > wrote: > > From: wangxiyuan <wangxiy...@huawei.com> > > > This patch adds aarch64 build & compile support. This patch must be > merged after the patch Part 1. > --- > build/aarch64-linux/crosscompile.cmake | 15 ++ > build/aarch64-linux/make-Makefiles.bash | 4 + > source/CMakeLists.txt | 38 +++- > source/common/CMakeLists.txt | 35 ++- > source/common/arm/asm-primitives.cpp | 291 ++++++++++++------------ > source/common/cpu.cpp | 4 + > source/common/pixel.cpp | 9 + > source/common/primitives.h | 11 + > source/test/CMakeLists.txt | 16 +- > source/test/testbench.cpp | 16 ++ > source/test/testharness.h | 5 + > 11 files changed, 274 insertions(+), 170 deletions(-) > create mode 100644 build/aarch64-linux/crosscompile.cmake > create mode 100644 build/aarch64-linux/make-Makefiles.bash > > diff --git a/build/aarch64-linux/crosscompile.cmake > b/build/aarch64-linux/crosscompile.cmake > new file mode 100644 > index 000000000..41c8217f2 > --- /dev/null > +++ b/build/aarch64-linux/crosscompile.cmake > @@ -0,0 +1,15 @@ > +# CMake toolchain file for cross compiling x265 for aarch64 > +# This feature is only supported as experimental. Use with caution. > +# Please report bugs on bitbucket > +# Run cmake with: cmake -DCMAKE_TOOLCHAIN_FILE=crosscompile.cmake -G > "Unix Makefiles" ../../source && ccmake ../../source > + > +set(CROSS_COMPILE_ARM 1) > +set(CMAKE_SYSTEM_NAME Linux) > +set(CMAKE_SYSTEM_PROCESSOR aarch64) > + > +# specify the cross compiler > +set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc) > +set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++) > + > +# specify the target environment > +SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu) > diff --git a/build/aarch64-linux/make-Makefiles.bash > b/build/aarch64-linux/make-Makefiles.bash > new file mode 100644 > index 000000000..c9582da0a > --- /dev/null > +++ b/build/aarch64-linux/make-Makefiles.bash > @@ -0,0 +1,4 @@ > +#!/bin/bash > +# Run this from within a bash shell > + > +cmake -DCMAKE_TOOLCHAIN_FILE="crosscompile.cmake" -G "Unix Makefiles" > ../../source && ccmake ../../source > diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt > index 5d2474d97..7734eafbb 100644 > --- a/source/CMakeLists.txt > +++ b/source/CMakeLists.txt > @@ -40,7 +40,7 @@ SET(CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake" > "${CMAKE_MODULE_PATH}") > # System architecture detection > string(TOLOWER "${CMAKE_SYSTEM_PROCESSOR}" SYSPROC) > set(X86_ALIASES x86 i386 i686 x86_64 amd64) > -set(ARM_ALIASES armv6l armv7l) > +set(ARM_ALIASES armv6l armv7l aarch64) > list(FIND X86_ALIASES "${SYSPROC}" X86MATCH) > list(FIND ARM_ALIASES "${SYSPROC}" ARMMATCH) > set(POWER_ALIASES ppc64 ppc64le) > @@ -70,9 +70,15 @@ elseif(ARMMATCH GREATER "-1") > else() > set(CROSS_COMPILE_ARM 0) > endif() > - message(STATUS "Detected ARM target processor") > set(ARM 1) > - add_definitions(-DX265_ARCH_ARM=1 -DHAVE_ARMV6=1) > + if("${CMAKE_SIZEOF_VOID_P}" MATCHES 8) > + message(STATUS "Detected ARM64 target processor") > + set(ARM64 1) > + add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=1 > -DHAVE_ARMV6=0) > + else() > + message(STATUS "Detected ARM target processor") > + add_definitions(-DX265_ARCH_ARM=1 -DX265_ARCH_ARM64=0 > -DHAVE_ARMV6=1) > + endif() 
> else() > message(STATUS "CMAKE_SYSTEM_PROCESSOR value > `${CMAKE_SYSTEM_PROCESSOR}` is unknown") > message(STATUS "Please add this value near > ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}") > @@ -231,14 +237,24 @@ if(GCC) > endif() > endif() > if(ARM AND CROSS_COMPILE_ARM) > - set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm -fPIC) > + if(ARM64) > + set(ARM_ARGS -fPIC) > + else() > + set(ARM_ARGS -march=armv6 -mfloat-abi=soft -mfpu=vfp -marm > -fPIC) > + endif() > + message(STATUS "cross compile arm") > elseif(ARM) > - find_package(Neon) > - if(CPU_HAS_NEON) > - set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon -marm > -fPIC) > + if(ARM64) > + set(ARM_ARGS -fPIC) > add_definitions(-DHAVE_NEON) > else() > - set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp -marm) > + find_package(Neon) > + if(CPU_HAS_NEON) > + set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=neon > -marm -fPIC) > + add_definitions(-DHAVE_NEON) > + else() > + set(ARM_ARGS -mcpu=native -mfloat-abi=hard -mfpu=vfp > -marm) > + endif() > endif() > endif() > add_definitions(${ARM_ARGS}) > @@ -518,7 +534,11 @@ if((MSVC_IDE OR XCODE OR GCC) AND ENABLE_ASSEMBLY) > # compile ARM arch asm files here > enable_language(ASM) > foreach(ASM ${ARM_ASMS}) > - set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM}) > + if(ARM64) > + set(ASM_SRC > ${CMAKE_CURRENT_SOURCE_DIR}/common/aarch64/${ASM}) > + else() > + set(ASM_SRC ${CMAKE_CURRENT_SOURCE_DIR}/common/arm/${ASM}) > + endif() > list(APPEND ASM_SRCS ${ASM_SRC}) > list(APPEND ASM_OBJS ${ASM}.${SUFFIX}) > add_custom_command( > diff --git a/source/common/CMakeLists.txt b/source/common/CMakeLists.txt > index c70bb108c..c021e603e 100644 > --- a/source/common/CMakeLists.txt > +++ b/source/common/CMakeLists.txt > @@ -14,7 +14,7 @@ if(EXTRA_LIB) > endif(EXTRA_LIB) > > if(ENABLE_ASSEMBLY) > - set_source_files_properties(threading.cpp primitives.cpp PROPERTIES > COMPILE_FLAGS -DENABLE_ASSEMBLY=1) > + set_source_files_properties(threading.cpp primitives.cpp pixel.cpp > PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1) > list(APPEND VFLAGS "-DENABLE_ASSEMBLY=1") > endif(ENABLE_ASSEMBLY) > > @@ -84,16 +84,33 @@ if(ENABLE_ASSEMBLY AND X86) > endif(ENABLE_ASSEMBLY AND X86) > > if(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM)) > - set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h > dct8.h loopfilter.h) > + if(ARM64) > + if(GCC AND (CMAKE_CXX_FLAGS_RELEASE MATCHES "-O3")) > + message(STATUS "Detected CXX compiler using -O3 optimization > level") > + add_definitions(-DAUTO_VECTORIZE=1) > + endif() > + set(C_SRCS asm-primitives.cpp pixel.h ipfilter8.h) > > - # add ARM assembly/intrinsic files here > - set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S > blockcopy8.S ipfilter8.S dct-a.S) > - set(VEC_PRIMITIVES) > + # add ARM assembly/intrinsic files here > + set(A_SRCS asm.S mc-a.S sad-a.S pixel-util.S ipfilter8.S) > + set(VEC_PRIMITIVES) > > - set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") > - foreach(SRC ${C_SRCS}) > - set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC}) > - endforeach() > + set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") > + foreach(SRC ${C_SRCS}) > + set(ASM_PRIMITIVES ${ASM_PRIMITIVES} aarch64/${SRC}) > + endforeach() > + else() > + set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h > blockcopy8.h dct8.h loopfilter.h) > + > + # add ARM assembly/intrinsic files here > + set(A_SRCS asm.S cpu-a.S mc-a.S sad-a.S pixel-util.S ssd-a.S > blockcopy8.S ipfilter8.S dct-a.S) > + set(VEC_PRIMITIVES) > + > + 
set(ARM_ASMS "${A_SRCS}" CACHE INTERNAL "ARM Assembly Sources") > + foreach(SRC ${C_SRCS}) > + set(ASM_PRIMITIVES ${ASM_PRIMITIVES} arm/${SRC}) > + endforeach() > + endif() > source_group(Assembly FILES ${ASM_PRIMITIVES}) > endif(ENABLE_ASSEMBLY AND (ARM OR CROSS_COMPILE_ARM)) > > diff --git a/source/common/arm/asm-primitives.cpp > b/source/common/arm/asm-primitives.cpp > index 422217845..7f11503f9 100644 > --- a/source/common/arm/asm-primitives.cpp > +++ b/source/common/arm/asm-primitives.cpp > @@ -5,6 +5,7 @@ > * Praveen Kumar Tiwari <prav...@multicorewareinc.com> > * Min Chen <chenm...@163.com> <min.c...@multicorewareinc.com> > * Dnyaneshwar Gorade <dnyanesh...@multicorewareinc.com> > + * Hongbin Liu<liuhongb...@huawei.com> > * > * This program is free software; you can redistribute it and/or modify > * it under the terms of the GNU General Public License as published by > @@ -48,77 +49,77 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int > cpuMask) > p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon); > > // addAvg > - p.pu[LUMA_4x4].addAvg = PFX(addAvg_4x4_neon); > - p.pu[LUMA_4x8].addAvg = PFX(addAvg_4x8_neon); > - p.pu[LUMA_4x16].addAvg = PFX(addAvg_4x16_neon); > - p.pu[LUMA_8x4].addAvg = PFX(addAvg_8x4_neon); > - p.pu[LUMA_8x8].addAvg = PFX(addAvg_8x8_neon); > - p.pu[LUMA_8x16].addAvg = PFX(addAvg_8x16_neon); > - p.pu[LUMA_8x32].addAvg = PFX(addAvg_8x32_neon); > - p.pu[LUMA_12x16].addAvg = PFX(addAvg_12x16_neon); > - p.pu[LUMA_16x4].addAvg = PFX(addAvg_16x4_neon); > - p.pu[LUMA_16x8].addAvg = PFX(addAvg_16x8_neon); > - p.pu[LUMA_16x12].addAvg = PFX(addAvg_16x12_neon); > - p.pu[LUMA_16x16].addAvg = PFX(addAvg_16x16_neon); > - p.pu[LUMA_16x32].addAvg = PFX(addAvg_16x32_neon); > - p.pu[LUMA_16x64].addAvg = PFX(addAvg_16x64_neon); > - p.pu[LUMA_24x32].addAvg = PFX(addAvg_24x32_neon); > - p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_neon); > - p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_neon); > - p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_neon); > - p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_neon); > - p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_neon); > - p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_neon); > - p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_neon); > - p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_neon); > - p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_neon); > - p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_neon); > + p.pu[LUMA_4x4].addAvg[NONALIGNED] = PFX(addAvg_4x4_neon); > + p.pu[LUMA_4x8].addAvg[NONALIGNED] = PFX(addAvg_4x8_neon); > + p.pu[LUMA_4x16].addAvg[NONALIGNED] = PFX(addAvg_4x16_neon); > + p.pu[LUMA_8x4].addAvg[NONALIGNED] = PFX(addAvg_8x4_neon); > + p.pu[LUMA_8x8].addAvg[NONALIGNED] = PFX(addAvg_8x8_neon); > + p.pu[LUMA_8x16].addAvg[NONALIGNED] = PFX(addAvg_8x16_neon); > + p.pu[LUMA_8x32].addAvg[NONALIGNED] = PFX(addAvg_8x32_neon); > + p.pu[LUMA_12x16].addAvg[NONALIGNED] = PFX(addAvg_12x16_neon); > + p.pu[LUMA_16x4].addAvg[NONALIGNED] = PFX(addAvg_16x4_neon); > + p.pu[LUMA_16x8].addAvg[NONALIGNED] = PFX(addAvg_16x8_neon); > + p.pu[LUMA_16x12].addAvg[NONALIGNED] = PFX(addAvg_16x12_neon); > + p.pu[LUMA_16x16].addAvg[NONALIGNED] = PFX(addAvg_16x16_neon); > + p.pu[LUMA_16x32].addAvg[NONALIGNED] = PFX(addAvg_16x32_neon); > + p.pu[LUMA_16x64].addAvg[NONALIGNED] = PFX(addAvg_16x64_neon); > + p.pu[LUMA_24x32].addAvg[NONALIGNED] = PFX(addAvg_24x32_neon); > + p.pu[LUMA_32x8].addAvg[NONALIGNED] = PFX(addAvg_32x8_neon); > + p.pu[LUMA_32x16].addAvg[NONALIGNED] = PFX(addAvg_32x16_neon); > + p.pu[LUMA_32x24].addAvg[NONALIGNED] = PFX(addAvg_32x24_neon); > + 
p.pu[LUMA_32x32].addAvg[NONALIGNED] = PFX(addAvg_32x32_neon); > + p.pu[LUMA_32x64].addAvg[NONALIGNED] = PFX(addAvg_32x64_neon); > + p.pu[LUMA_48x64].addAvg[NONALIGNED] = PFX(addAvg_48x64_neon); > + p.pu[LUMA_64x16].addAvg[NONALIGNED] = PFX(addAvg_64x16_neon); > + p.pu[LUMA_64x32].addAvg[NONALIGNED] = PFX(addAvg_64x32_neon); > + p.pu[LUMA_64x48].addAvg[NONALIGNED] = PFX(addAvg_64x48_neon); > + p.pu[LUMA_64x64].addAvg[NONALIGNED] = PFX(addAvg_64x64_neon); > > // chroma addAvg > - p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg = > PFX(addAvg_4x2_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg = > PFX(addAvg_4x4_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg = > PFX(addAvg_4x8_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg = > PFX(addAvg_4x16_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg = > PFX(addAvg_6x8_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg = > PFX(addAvg_8x2_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg = > PFX(addAvg_8x4_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg = > PFX(addAvg_8x6_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg = > PFX(addAvg_8x8_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg = > PFX(addAvg_8x16_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg = > PFX(addAvg_8x32_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg = > PFX(addAvg_12x16_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg = > PFX(addAvg_16x4_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg = > PFX(addAvg_16x8_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg = > PFX(addAvg_16x12_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg = > PFX(addAvg_16x16_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg = > PFX(addAvg_16x32_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg = > PFX(addAvg_24x32_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = > PFX(addAvg_32x8_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = > PFX(addAvg_32x16_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = > PFX(addAvg_32x24_neon); > - p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = > PFX(addAvg_32x32_neon); > - > - p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg = > PFX(addAvg_4x8_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg = > PFX(addAvg_4x16_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg = > PFX(addAvg_4x32_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg = > PFX(addAvg_6x16_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg = > PFX(addAvg_8x4_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg = > PFX(addAvg_8x8_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg = > PFX(addAvg_8x12_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg = > PFX(addAvg_8x16_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg = > PFX(addAvg_8x32_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg = > PFX(addAvg_8x64_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg = > PFX(addAvg_12x32_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg = > PFX(addAvg_16x8_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg = > PFX(addAvg_16x16_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg = > PFX(addAvg_16x24_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg = > PFX(addAvg_16x32_neon); > - 
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg = > PFX(addAvg_16x64_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg = > PFX(addAvg_24x64_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = > PFX(addAvg_32x16_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = > PFX(addAvg_32x32_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = > PFX(addAvg_32x48_neon); > - p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = > PFX(addAvg_32x64_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].addAvg[NONALIGNED] = > PFX(addAvg_4x2_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].addAvg[NONALIGNED] = > PFX(addAvg_4x4_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].addAvg[NONALIGNED] = > PFX(addAvg_4x8_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].addAvg[NONALIGNED] = > PFX(addAvg_4x16_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].addAvg[NONALIGNED] = > PFX(addAvg_6x8_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].addAvg[NONALIGNED] = > PFX(addAvg_8x2_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].addAvg[NONALIGNED] = > PFX(addAvg_8x4_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].addAvg[NONALIGNED] = > PFX(addAvg_8x6_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].addAvg[NONALIGNED] = > PFX(addAvg_8x8_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].addAvg[NONALIGNED] = > PFX(addAvg_8x16_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].addAvg[NONALIGNED] = > PFX(addAvg_8x32_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].addAvg[NONALIGNED] = > PFX(addAvg_12x16_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].addAvg[NONALIGNED] = > PFX(addAvg_16x4_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].addAvg[NONALIGNED] = > PFX(addAvg_16x8_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].addAvg[NONALIGNED] = > PFX(addAvg_16x12_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].addAvg[NONALIGNED] = > PFX(addAvg_16x16_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].addAvg[NONALIGNED] = > PFX(addAvg_16x32_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].addAvg[NONALIGNED] = > PFX(addAvg_24x32_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg[NONALIGNED] = > PFX(addAvg_32x8_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg[NONALIGNED] = > PFX(addAvg_32x16_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg[NONALIGNED] = > PFX(addAvg_32x24_neon); > + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg[NONALIGNED] = > PFX(addAvg_32x32_neon); > + > + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].addAvg[NONALIGNED] = > PFX(addAvg_4x8_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].addAvg[NONALIGNED] = > PFX(addAvg_4x16_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].addAvg[NONALIGNED] = > PFX(addAvg_4x32_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].addAvg[NONALIGNED] = > PFX(addAvg_6x16_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].addAvg[NONALIGNED] = > PFX(addAvg_8x4_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].addAvg[NONALIGNED] = > PFX(addAvg_8x8_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].addAvg[NONALIGNED] = > PFX(addAvg_8x12_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].addAvg[NONALIGNED] = > PFX(addAvg_8x16_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].addAvg[NONALIGNED] = > PFX(addAvg_8x32_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].addAvg[NONALIGNED] = > PFX(addAvg_8x64_neon); > + 
p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].addAvg[NONALIGNED] = > PFX(addAvg_12x32_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].addAvg[NONALIGNED] = > PFX(addAvg_16x8_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].addAvg[NONALIGNED] = > PFX(addAvg_16x16_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].addAvg[NONALIGNED] = > PFX(addAvg_16x24_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].addAvg[NONALIGNED] = > PFX(addAvg_16x32_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].addAvg[NONALIGNED] = > PFX(addAvg_16x64_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].addAvg[NONALIGNED] = > PFX(addAvg_24x64_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg[NONALIGNED] = > PFX(addAvg_32x16_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg[NONALIGNED] = > PFX(addAvg_32x32_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg[NONALIGNED] = > PFX(addAvg_32x48_neon); > + p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg[NONALIGNED] = > PFX(addAvg_32x64_neon); > > // quant > p.quant = PFX(quant_neon); > @@ -402,7 +403,7 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int > cpuMask) > p.scale2D_64to32 = PFX(scale2D_64to32_neon); > > // scale1D_128to64 > - p.scale1D_128to64 = PFX(scale1D_128to64_neon); > + p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon); > > // copy_count > p.cu[BLOCK_4x4].copy_cnt = PFX(copy_cnt_4_neon); > @@ -411,37 +412,37 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, > int cpuMask) > p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_neon); > > // filterPixelToShort > - p.pu[LUMA_4x4].convert_p2s = PFX(filterPixelToShort_4x4_neon); > - p.pu[LUMA_4x8].convert_p2s = PFX(filterPixelToShort_4x8_neon); > - p.pu[LUMA_4x16].convert_p2s = PFX(filterPixelToShort_4x16_neon); > - p.pu[LUMA_8x4].convert_p2s = PFX(filterPixelToShort_8x4_neon); > - p.pu[LUMA_8x8].convert_p2s = PFX(filterPixelToShort_8x8_neon); > - p.pu[LUMA_8x16].convert_p2s = PFX(filterPixelToShort_8x16_neon); > - p.pu[LUMA_8x32].convert_p2s = PFX(filterPixelToShort_8x32_neon); > - p.pu[LUMA_12x16].convert_p2s = PFX(filterPixelToShort_12x16_neon); > - p.pu[LUMA_16x4].convert_p2s = PFX(filterPixelToShort_16x4_neon); > - p.pu[LUMA_16x8].convert_p2s = PFX(filterPixelToShort_16x8_neon); > - p.pu[LUMA_16x12].convert_p2s = PFX(filterPixelToShort_16x12_neon); > - p.pu[LUMA_16x16].convert_p2s = PFX(filterPixelToShort_16x16_neon); > - p.pu[LUMA_16x32].convert_p2s = PFX(filterPixelToShort_16x32_neon); > - p.pu[LUMA_16x64].convert_p2s = PFX(filterPixelToShort_16x64_neon); > - p.pu[LUMA_24x32].convert_p2s = PFX(filterPixelToShort_24x32_neon); > - p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_neon); > - p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_neon); > - p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_neon); > - p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_neon); > - p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_neon); > - p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_neon); > - p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_neon); > - p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_neon); > - p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_neon); > - p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_neon); > + p.pu[LUMA_4x4].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_4x4_neon); > + p.pu[LUMA_4x8].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_4x8_neon); > + 
p.pu[LUMA_4x16].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_4x16_neon); > + p.pu[LUMA_8x4].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_8x4_neon); > + p.pu[LUMA_8x8].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_8x8_neon); > + p.pu[LUMA_8x16].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_8x16_neon); > + p.pu[LUMA_8x32].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_8x32_neon); > + p.pu[LUMA_12x16].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_12x16_neon); > + p.pu[LUMA_16x4].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_16x4_neon); > + p.pu[LUMA_16x8].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_16x8_neon); > + p.pu[LUMA_16x12].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_16x12_neon); > + p.pu[LUMA_16x16].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_16x16_neon); > + p.pu[LUMA_16x32].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_16x32_neon); > + p.pu[LUMA_16x64].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_16x64_neon); > + p.pu[LUMA_24x32].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_24x32_neon); > + p.pu[LUMA_32x8].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_32x8_neon); > + p.pu[LUMA_32x16].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_32x16_neon); > + p.pu[LUMA_32x24].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_32x24_neon); > + p.pu[LUMA_32x32].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_32x32_neon); > + p.pu[LUMA_32x64].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_32x64_neon); > + p.pu[LUMA_48x64].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_48x64_neon); > + p.pu[LUMA_64x16].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_64x16_neon); > + p.pu[LUMA_64x32].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_64x32_neon); > + p.pu[LUMA_64x48].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_64x48_neon); > + p.pu[LUMA_64x64].convert_p2s[NONALIGNED] = > PFX(filterPixelToShort_64x64_neon); > > // Block_fill > - p.cu[BLOCK_4x4].blockfill_s = PFX(blockfill_s_4x4_neon); > - p.cu[BLOCK_8x8].blockfill_s = PFX(blockfill_s_8x8_neon); > - p.cu[BLOCK_16x16].blockfill_s = PFX(blockfill_s_16x16_neon); > - p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_neon); > + p.cu[BLOCK_4x4].blockfill_s[NONALIGNED] = > PFX(blockfill_s_4x4_neon); > + p.cu[BLOCK_8x8].blockfill_s[NONALIGNED] = > PFX(blockfill_s_8x8_neon); > + p.cu[BLOCK_16x16].blockfill_s[NONALIGNED] = > PFX(blockfill_s_16x16_neon); > + p.cu[BLOCK_32x32].blockfill_s[NONALIGNED] = > PFX(blockfill_s_32x32_neon); > > // Blockcopy_ss > p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon); > @@ -495,21 +496,21 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, > int cpuMask) > p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = > PFX(blockcopy_sp_32x64_neon); > > // pixel_add_ps > - p.cu[BLOCK_4x4].add_ps = PFX(pixel_add_ps_4x4_neon); > - p.cu[BLOCK_8x8].add_ps = PFX(pixel_add_ps_8x8_neon); > - p.cu[BLOCK_16x16].add_ps = PFX(pixel_add_ps_16x16_neon); > - p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_neon); > - p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_neon); > + p.cu[BLOCK_4x4].add_ps[NONALIGNED] = > PFX(pixel_add_ps_4x4_neon); > + p.cu[BLOCK_8x8].add_ps[NONALIGNED] = > PFX(pixel_add_ps_8x8_neon); > + p.cu[BLOCK_16x16].add_ps[NONALIGNED] = > PFX(pixel_add_ps_16x16_neon); > + p.cu[BLOCK_32x32].add_ps[NONALIGNED] = > PFX(pixel_add_ps_32x32_neon); > + p.cu[BLOCK_64x64].add_ps[NONALIGNED] = > PFX(pixel_add_ps_64x64_neon); > > // chroma add_ps > - p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps = > PFX(pixel_add_ps_4x4_neon); > 
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps = > PFX(pixel_add_ps_8x8_neon); > - p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps = > PFX(pixel_add_ps_16x16_neon); > - p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = > PFX(pixel_add_ps_32x32_neon); > - p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps = > PFX(pixel_add_ps_4x8_neon); > - p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps = > PFX(pixel_add_ps_8x16_neon); > - p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps = > PFX(pixel_add_ps_16x32_neon); > - p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = > PFX(pixel_add_ps_32x64_neon); > + p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED] = > PFX(pixel_add_ps_4x4_neon); > + p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED] = > PFX(pixel_add_ps_8x8_neon); > + p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = > PFX(pixel_add_ps_16x16_neon); > + p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = > PFX(pixel_add_ps_32x32_neon); > + p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED] = > PFX(pixel_add_ps_4x8_neon); > + p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED] = > PFX(pixel_add_ps_8x16_neon); > + p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = > PFX(pixel_add_ps_16x32_neon); > + p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = > PFX(pixel_add_ps_32x64_neon); > > // cpy2Dto1D_shr > p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon); > @@ -518,10 +519,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, > int cpuMask) > p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_neon); > > // ssd_s > - p.cu[BLOCK_4x4].ssd_s = PFX(pixel_ssd_s_4x4_neon); > - p.cu[BLOCK_8x8].ssd_s = PFX(pixel_ssd_s_8x8_neon); > - p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16x16_neon); > - p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32x32_neon); > + p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_neon); > + p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon); > + p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = > PFX(pixel_ssd_s_16x16_neon); > + p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = > PFX(pixel_ssd_s_32x32_neon); > > // sse_ss > p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_neon); > @@ -548,10 +549,10 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, > int cpuMask) > p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = > PFX(pixel_sub_ps_32x64_neon); > > // calc_Residual > - p.cu[BLOCK_4x4].calcresidual = PFX(getResidual4_neon); > - p.cu[BLOCK_8x8].calcresidual = PFX(getResidual8_neon); > - p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_neon); > - p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_neon); > + p.cu[BLOCK_4x4].calcresidual[NONALIGNED] = > PFX(getResidual4_neon); > + p.cu[BLOCK_8x8].calcresidual[NONALIGNED] = > PFX(getResidual8_neon); > + p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = > PFX(getResidual16_neon); > + p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = > PFX(getResidual32_neon); > > // sse_pp > p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_neon); > @@ -722,31 +723,31 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, > int cpuMask) > p.pu[LUMA_64x64].sad_x4 = PFX(sad_x4_64x64_neon); > > // pixel_avg_pp > - p.pu[LUMA_4x4].pixelavg_pp = PFX(pixel_avg_pp_4x4_neon); > - p.pu[LUMA_4x8].pixelavg_pp = PFX(pixel_avg_pp_4x8_neon); > - p.pu[LUMA_4x16].pixelavg_pp = PFX(pixel_avg_pp_4x16_neon); > - p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_pp_8x4_neon); > - p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_pp_8x8_neon); > - p.pu[LUMA_8x16].pixelavg_pp = 
PFX(pixel_avg_pp_8x16_neon); > - p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_pp_8x32_neon); > - p.pu[LUMA_12x16].pixelavg_pp = PFX(pixel_avg_pp_12x16_neon); > - p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_pp_16x4_neon); > - p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_pp_16x8_neon); > - p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_pp_16x12_neon); > - p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_pp_16x16_neon); > - p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_pp_16x32_neon); > - p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_pp_16x64_neon); > - p.pu[LUMA_24x32].pixelavg_pp = PFX(pixel_avg_pp_24x32_neon); > - p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_pp_32x8_neon); > - p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_pp_32x16_neon); > - p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_pp_32x24_neon); > - p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_pp_32x32_neon); > - p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_pp_32x64_neon); > - p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_pp_48x64_neon); > - p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_pp_64x16_neon); > - p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_pp_64x32_neon); > - p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_pp_64x48_neon); > - p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_pp_64x64_neon); > + p.pu[LUMA_4x4].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_4x4_neon); > + p.pu[LUMA_4x8].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_4x8_neon); > + p.pu[LUMA_4x16].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_4x16_neon); > + p.pu[LUMA_8x4].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_8x4_neon); > + p.pu[LUMA_8x8].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_8x8_neon); > + p.pu[LUMA_8x16].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_8x16_neon); > + p.pu[LUMA_8x32].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_8x32_neon); > + p.pu[LUMA_12x16].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_12x16_neon); > + p.pu[LUMA_16x4].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_16x4_neon); > + p.pu[LUMA_16x8].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_16x8_neon); > + p.pu[LUMA_16x12].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_16x12_neon); > + p.pu[LUMA_16x16].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_16x16_neon); > + p.pu[LUMA_16x32].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_16x32_neon); > + p.pu[LUMA_16x64].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_16x64_neon); > + p.pu[LUMA_24x32].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_24x32_neon); > + p.pu[LUMA_32x8].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_32x8_neon); > + p.pu[LUMA_32x16].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_32x16_neon); > + p.pu[LUMA_32x24].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_32x24_neon); > + p.pu[LUMA_32x32].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_32x32_neon); > + p.pu[LUMA_32x64].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_32x64_neon); > + p.pu[LUMA_48x64].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_48x64_neon); > + p.pu[LUMA_64x16].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_64x16_neon); > + p.pu[LUMA_64x32].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_64x32_neon); > + p.pu[LUMA_64x48].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_64x48_neon); > + p.pu[LUMA_64x64].pixelavg_pp[NONALIGNED] = > PFX(pixel_avg_pp_64x64_neon); > > // planecopy > p.planecopy_cp = PFX(pixel_planecopy_cp_neon); > diff --git a/source/common/cpu.cpp b/source/common/cpu.cpp > index 26c82ea50..2eacfe4a9 100644 > --- a/source/common/cpu.cpp > +++ b/source/common/cpu.cpp > @@ -5,6 +5,8 @@ > * Laurent Aimar <fen...@via.ecp.fr> > * Fiona Glaser <fi...@x264.com> > * Steve Borho <st...@borho.org> > + 
* Hongbin Liu <liuhongb...@huawei.com>
> + * Yimeng Su <yimeng...@huawei.com>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -367,6 +369,8 @@ uint32_t cpu_detect(bool benableavx512)
> flags |= PFX(cpu_fast_neon_mrc_test)() ? X265_CPU_FAST_NEON_MRC : 0;
> #endif
> // TODO: write dual issue test? currently it's A8 (dual issue) vs. A9 (fast mrc)
> +#elif X265_ARCH_ARM64
> + flags |= X265_CPU_NEON;
> #endif // if HAVE_ARMV6
> return flags;
> }
> diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
> index 99b84449c..e4f890cd5 100644
> --- a/source/common/pixel.cpp
> +++ b/source/common/pixel.cpp
> @@ -5,6 +5,7 @@
> * Mandar Gurav <man...@multicorewareinc.com>
> * Mahesh Pittala <mah...@multicorewareinc.com>
> * Min Chen <min.c...@multicorewareinc.com>
> + * Hongbin Liu<liuhongb...@huawei.com>
> *
> * This program is free software; you can redistribute it and/or modify
> * it under the terms of the GNU General Public License as published by
> @@ -265,6 +266,10 @@ int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t s
> {
> int satd = 0;
>
> +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
> + pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
> +#endif
>
> Is there any specific reason why the above code is added? Is this a kind of temporary fix for an output mismatch between the C and asm code?
>
> No, the C and asm output match. Currently we have only completed some of the satd primitives. This is a workaround that improves all the satd primitives with asm code. Maybe the code style is not ideal.
> If I understand correctly, you are trying to use a combination of C and asm code for all the other kernel sizes whose asm implementation has not been completed yet?
>
>
> +
> for (int row = 0; row < h; row += 4)
> for (int col = 0; col < w; col += 4)
> satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
> @@ -279,6 +284,10 @@ int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t s
> {
> int satd = 0;
>
> +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64
> + pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
> +#endif
> +
>
> Same comment as above.
>
> Same response.
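
To make the workaround concrete, the resulting satd4 routine looks roughly like the sketch below, pieced together from the hunk above (the parameter list is truncated in the quoted hunk, so the trailing stride_pix2 argument and the template form are assumptions taken from the existing pixel.cpp code, not from this patch). The generic C double loop is unchanged; only the 4x4 kernel it calls is redirected to the NEON routine, so every larger block size built on top of 4x4 picks up the asm speedup without needing its own kernel:

    // source/common/pixel.cpp with the hunk above applied (sketch, not the verbatim patch)
    template<int w, int h>
    int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2)
    {
        int satd = 0;

    #if ENABLE_ASSEMBLY && X265_ARCH_ARM64
        // Shadow the file-scope C satd_4x4 with the NEON 4x4 kernel from this series.
        pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
    #endif

        // Unchanged generic loop: walk the w x h block in 4x4 tiles and accumulate.
        for (int row = 0; row < h; row += 4)
            for (int col = 0; col < w; col += 4)
                satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
                                 pix2 + row * stride_pix2 + col, stride_pix2);
        return satd;
    }

The same pattern applies to satd8 below, using x265_pixel_satd_8x4_neon as the inner kernel.
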
> > for (int row = 0; row < h; row += 4) > for (int col = 0; col < w; col += 8) > satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1, > diff --git a/source/common/primitives.h b/source/common/primitives.h > index 5c64952fb..0b52f84de 100644 > --- a/source/common/primitives.h > +++ b/source/common/primitives.h > @@ -8,6 +8,8 @@ > * Rajesh Paulraj <raj...@multicorewareinc.com> > * Praveen Kumar Tiwari <prav...@multicorewareinc.com> > * Min Chen <chenm...@163.com> > + * Hongbin Liu<liuhongb...@huawei.com> > + * Yimeng Su <yimeng...@huawei.com> > * > * This program is free software; you can redistribute it and/or modify > * it under the terms of the GNU General Public License as published by > @@ -467,6 +469,9 @@ void setupCPrimitives(EncoderPrimitives &p); > void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask); > void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask); > void setupAliasPrimitives(EncoderPrimitives &p); > +#if X265_ARCH_ARM64 > +void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives > &asmp, int cpuMask); > +#endif > #if HAVE_ALTIVEC > void setupPixelPrimitives_altivec(EncoderPrimitives &p); > void setupDCTPrimitives_altivec(EncoderPrimitives &p); > @@ -481,4 +486,10 @@ extern const char* PFX(version_str); > extern const char* PFX(build_info_str); > #endif > > +#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 > +extern "C" { > +#include "aarch64/pixel-util.h" > +} > +#endif > + > #endif // ifndef X265_PRIMITIVES_H > diff --git a/source/test/CMakeLists.txt b/source/test/CMakeLists.txt > index 260195f53..9abaf31ff 100644 > --- a/source/test/CMakeLists.txt > +++ b/source/test/CMakeLists.txt > @@ -23,13 +23,15 @@ endif(X86) > > # add ARM assembly files > if(ARM OR CROSS_COMPILE_ARM) > - enable_language(ASM) > - set(NASM_SRC checkasm-arm.S) > - add_custom_command( > - OUTPUT checkasm-arm.obj > - COMMAND ${CMAKE_CXX_COMPILER} > - ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S -o > checkasm-arm.obj > - DEPENDS checkasm-arm.S) > + if(NOT ARM64) > + enable_language(ASM) > + set(NASM_SRC checkasm-arm.S) > + add_custom_command( > + OUTPUT checkasm-arm.obj > + COMMAND ${CMAKE_CXX_COMPILER} > + ARGS ${NASM_FLAGS} ${CMAKE_CURRENT_SOURCE_DIR}/checkasm-arm.S > -o checkasm-arm.obj > + DEPENDS checkasm-arm.S) > + endif() > endif(ARM OR CROSS_COMPILE_ARM) > > # add PowerPC assembly files > diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp > index ac14f9710..8db8c0c25 100644 > --- a/source/test/testbench.cpp > +++ b/source/test/testbench.cpp > @@ -5,6 +5,7 @@ > * Mandar Gurav <man...@multicorewareinc.com> > * Mahesh Pittala <mah...@multicorewareinc.com> > * Min Chen <chenm...@163.com> > + * Yimeng Su <yimeng...@huawei.com> > * > * This program is free software; you can redistribute it and/or modify > * it under the terms of the GNU General Public License as published by > @@ -208,6 +209,14 @@ int main(int argc, char *argv[]) > EncoderPrimitives asmprim; > memset(&asmprim, 0, sizeof(asmprim)); > setupAssemblyPrimitives(asmprim, test_arch[i].flag); > + > +#if X265_ARCH_ARM64 > + /* Temporary workaround because luma_vsp assembly primitive has > not been completed > + * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly > primitive. > + * Otherwise, segment fault occurs. 
*/ > + setupAliasCPrimitives(cprim, asmprim, test_arch[i].flag); > +#endif > + > setupAliasPrimitives(asmprim); > memcpy(&primitives, &asmprim, sizeof(EncoderPrimitives)); > for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); > h++) > @@ -232,6 +241,13 @@ int main(int argc, char *argv[]) > #endif > setupAssemblyPrimitives(optprim, cpuid); > > +#if X265_ARCH_ARM64 > + /* Temporary workaround because luma_vsp assembly primitive has not > been completed > + * but interp_8tap_hv_pp_cpu uses mixed C primitive and assembly > primitive. > + * Otherwise, segment fault occurs. */ > + setupAliasCPrimitives(cprim, optprim, cpuid); > +#endif > + > /* Note that we do not setup aliases for performance tests, that > would be > * redundant. The testbench only verifies they are correctly aliased > */ > > diff --git a/source/test/testharness.h b/source/test/testharness.h > index 771551583..6e680953f 100644 > --- a/source/test/testharness.h > +++ b/source/test/testharness.h > @@ -3,6 +3,7 @@ > * > * Authors: Steve Borho <st...@borho.org> > * Min Chen <chenm...@163.com> > + * Yimeng Su <yimeng...@huawei.com> > * > * This program is free software; you can redistribute it and/or modify > * it under the terms of the GNU General Public License as published by > @@ -81,11 +82,15 @@ static inline uint32_t __rdtsc(void) > #if X265_ARCH_X86 > asm volatile("rdtsc" : "=a" (a) ::"edx"); > #elif X265_ARCH_ARM > +#if X265_ARCH_ARM64 > + asm volatile("mrs %0, cntvct_el0" : "=r"(a)); > +#else > // TOD-DO: verify following inline asm to get cpu Timestamp Counter > for ARM arch > // asm volatile("mrc p15, 0, %0, c9, c13, 0" : "=r"(a)); > > // TO-DO: replace clock() function with appropriate ARM cpu > instructions > a = clock(); > +#endif > #endif > return a; > } > -- > 2.21.0.windows.1 > > _______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > _______________________________________________ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel >
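
One more note on the setupAliasCPrimitives() workaround used in testbench.cpp above: its body is not shown in this part of the series, but going by the comment (the luma_vsp assembly primitive is not done yet, while interp_8tap_hv_pp mixes C and assembly primitives), a plausible shape would be something like the sketch below. Treat the exact set of primitives it copies as an assumption for illustration, not the actual patch code:

    // Illustrative only: copy the C fallbacks that the aarch64 asm still relies on,
    // so mixed C/asm paths such as interp_8tap_hv_pp never call a null pointer.
    void setupAliasCPrimitives(EncoderPrimitives& cprim, EncoderPrimitives& asmprim, int cpuMask)
    {
        if (cpuMask & X265_CPU_NEON)
        {
            for (int i = 0; i < NUM_PU_SIZES; i++)
            {
                // luma_vsp has no aarch64 assembly yet (hypothetical selection;
                // the real function may cover additional primitives).
                asmprim.pu[i].luma_vsp = cprim.pu[i].luma_vsp;
            }
        }
    }
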
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel