[x265] [PATCH ] Install symbol files
# HG changeset patch # User Mythreyi P # Date 1510294573 -19800 # Fri Nov 10 11:46:13 2017 +0530 # Node ID fa556484e9663a65aabc839f333e0e98fd6f80f2 # Parent bd438ce108435deb4f0063fca9a9e14a75e8de38 Install symbol files In debug and RelWithDebInfo configuration, .pdb files are installed. diff -r bd438ce10843 -r fa556484e966 source/CMakeLists.txt --- a/source/CMakeLists.txt Wed Nov 08 16:18:29 2017 +0530 +++ b/source/CMakeLists.txt Fri Nov 10 11:46:13 2017 +0530 @@ -546,6 +546,10 @@ ARCHIVE DESTINATION ${LIB_INSTALL_DIR}) endif() install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include) +install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug) +install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo) +install(FILES "${PROJECT_BINARY_DIR}/Debug/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug OPTIONAL NAMELINK_ONLY) +install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo OPTIONAL NAMELINK_ONLY) if(CMAKE_RC_COMPILER) # The resource compiler does not need CFLAGS or macro defines. It ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH ] Add support for uninstall in windows
# HG changeset patch # User Mythreyi P # Date 1510550268 -19800 # Mon Nov 13 10:47:48 2017 +0530 # Node ID 198fac8283efc3e3665842eadbf7e03af5987637 # Parent fa556484e9663a65aabc839f333e0e98fd6f80f2 Add support for uninstall in windows Files installed in the INSTALL target are uninstalled. diff -r fa556484e966 -r 198fac8283ef source/CMakeLists.txt --- a/source/CMakeLists.txt Fri Nov 10 11:46:13 2017 +0530 +++ b/source/CMakeLists.txt Mon Nov 13 10:47:48 2017 +0530 @@ -726,3 +726,13 @@ set(PLATFORM_LIBS ${PLATFORM_LIBS} PARENT_SCOPE) endif(PLATFORM_LIBS) endif(hasParent) + +# uninstall target +if(NOT TARGET UNINSTALL) +configure_file( +"${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in" +"${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" +IMMEDIATE @ONLY) +add_custom_target(UNINSTALL +COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) +endif() diff -r fa556484e966 -r 198fac8283ef source/cmake/cmake_uninstall.cmake.in --- a/source/cmake/cmake_uninstall.cmake.in Fri Nov 10 11:46:13 2017 +0530 +++ b/source/cmake/cmake_uninstall.cmake.in Mon Nov 13 10:47:48 2017 +0530 @@ -17,3 +17,7 @@ message(STATUS "File '$ENV{DESTDIR}${file}' does not exist.") endif() endforeach(file) + +if(EXISTS "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt") +file(REMOVE "${CMAKE_CURRENT_BINARY_DIR}/install_manifest.txt") +endif() ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH ] Fix Build fails for MacOS
# HG changeset patch # User Mythreyi P # Date 1510905431 -19800 # Fri Nov 17 13:27:11 2017 +0530 # Node ID e7254af562ee4cf29d559f85dbd2f3d17791b6d4 # Parent 06979c0423504a324ea05ca3de59769c6d0fba0d Fix Build fails for MacOS Duplicate uninstall targets were removed. diff -r 06979c042350 -r e7254af562ee source/CMakeLists.txt --- a/source/CMakeLists.txt Thu Nov 16 20:23:14 2017 +0530 +++ b/source/CMakeLists.txt Fri Nov 17 13:27:11 2017 +0530 @@ -650,13 +650,11 @@ DESTINATION "${LIB_INSTALL_DIR}/pkgconfig") endif() -if(NOT WIN32) configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in" "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake" IMMEDIATE @ONLY) add_custom_target(uninstall "${CMAKE_COMMAND}" -P "${CMAKE_CURRENT_BINARY_DIR}/cmake/cmake_uninstall.cmake") -endif() # Main CLI application set(ENABLE_CLI ON CACHE BOOL "Build standalone CLI application") @@ -725,13 +723,3 @@ set(PLATFORM_LIBS ${PLATFORM_LIBS} PARENT_SCOPE) endif(PLATFORM_LIBS) endif(hasParent) - -# uninstall target -if(NOT TARGET UNINSTALL) -configure_file( -"${CMAKE_CURRENT_SOURCE_DIR}/cmake/cmake_uninstall.cmake.in" -"${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" -IMMEDIATE @ONLY) -add_custom_target(UNINSTALL -COMMAND ${CMAKE_COMMAND} -P ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) -endif() ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH ] Fix install symbol files
# HG changeset patch # User Mythreyi P # Date 1510910772 -19800 # Fri Nov 17 14:56:12 2017 +0530 # Node ID 4ad31bed996f295f0ef1049f45a19bf326811afb # Parent e7254af562ee4cf29d559f85dbd2f3d17791b6d4 Fix install symbol files .pdb files are installed only in Windows. diff -r e7254af562ee -r 4ad31bed996f source/CMakeLists.txt --- a/source/CMakeLists.txt Fri Nov 17 13:27:11 2017 +0530 +++ b/source/CMakeLists.txt Fri Nov 17 14:56:12 2017 +0530 @@ -546,10 +546,14 @@ ARCHIVE DESTINATION ${LIB_INSTALL_DIR}) endif() install(FILES x265.h "${PROJECT_BINARY_DIR}/x265_config.h" DESTINATION include) -install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug) -install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo) -install(FILES "${PROJECT_BINARY_DIR}/Debug/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug OPTIONAL NAMELINK_ONLY) -install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo OPTIONAL NAMELINK_ONLY) + +if(WIN32) +install(FILES "${PROJECT_BINARY_DIR}/Debug/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug) +install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/x265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo) +install(FILES "${PROJECT_BINARY_DIR}/Debug/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS Debug OPTIONAL NAMELINK_ONLY) +install(FILES "${PROJECT_BINARY_DIR}/RelWithDebInfo/libx265.pdb" DESTINATION ${BIN_INSTALL_DIR} CONFIGURATIONS RelWithDebInfo OPTIONAL NAMELINK_ONLY) +endif() + if(CMAKE_RC_COMPILER) # The resource compiler does not need CFLAGS or macro defines. It # often breaks them ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 003 of 307] x86: AVX-512 pixel_satd for 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, and 16x16
# HG changeset patch # User Jayashri Murugan # Date 1498029186 -19800 # Wed Jun 21 12:43:06 2017 +0530 # Node ID 6ce366c4e4919a4f1641234824f6bf4f128df400 # Parent daee70fc99daabd85cfc1245cf257e8b77a158e8 x86: AVX-512 pixel_satd for 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, and 16x16. diff -r daee70fc99da -r 6ce366c4e491 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jun 21 15:37:19 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jun 21 12:43:06 2017 +0530 @@ -3720,6 +3720,35 @@ p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2); } +if (cpuMask & X265_CPU_AVX512) +{ +p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512); +p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512); +p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512); +p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_avx512); +p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx512); +p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_avx512); +p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_avx512); +p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_avx512); + +p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_avx512); 
+p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512); + +} #endif } #endif // if HIGH_BIT_DEPTH diff -r daee70fc99da -r 6ce366c4e491 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Wed Jun 21 15:37:19 2017 +0530 +++ b/source/common/x86/pixel-a.asm Wed Jun 21 12:43:06 2017 +0530 @@ -8145,6 +8145,211 @@ %endif ; ARCH_X86_64=1 %endif ; HIGH_BIT_DEPTH +%macro SATD_AVX512_LOAD4 2 ; size, opmask +vpbroadcast%1 m0, [r0] +vpbroadcast%1 m0 {%2}, [r0+2*r1] +vpbroadcast%1 m2, [r2] +vpbroadcast%1 m2 {%2}, [r2+2*r3] +add r0, r1 +add r2, r3 +vpbroadcast%1 m1, [r0] +vpbroadcast%1 m1 {%2}, [r0+2*r1] +vpbroadcast%1 m3, [r2] +vpbroadcast%1 m3 {%2}, [r2+2*r3] +%endmacro + +%macro SATD_AVX512_LOAD8 5 ; size, halfreg, opmask1, opmask2, opmask3 +vpbroadcast%1 %{2}0, [r0] +vpbroadcast%1 %{2}0 {%3}, [r0+2*r1] +vpbroadcast%1 %{2}2, [r2] +vpbroadcast%1 %{2}2 {%3}, [r2+2*r3] +vpbroadcast%1m0 {%4}, [r0+4*r1] +vpbroadcast%1m2 {%4}, [r2+4*r3] +vpbroadcast%1m0 {%5}, [r0+2*r4] +vpbroadcast%1m2 {%5}, [r2+2*r5] +vpbroadcast%1 %{2}1, [r0+r1] +vpbroadcast%1 %{2}1 {%3}, [r0+r4] +vpbroadcast%1 %{2}3, [r2+r3] +vpbroadcast%1 %{2}3 {%3}, [r2+r5] +lea r0, [r0+4*r1] +lea r2, [r2+4*r3] +vpbroadcast%1m1 {%4}, [r0+r1] +vpbroadcast%1m3 {%4}, [r2+r3] +vpbroadcast%1m1 {%5}, [r0+r4] +vpbroadcast%1m3 {%5}, [r2+r5] +%endmacro + +%macro SATD_AVX512_PACKED 0 +DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 +SUMSUB_BA w, 0, 1, 2 +SBUTTERFLY qdq, 0, 1, 2 +SUMSUB_BA w, 0, 1, 2 +HMAXABSW2 0, 1, 2, 3 +%endmacro + +%macro SATD_AVX512_END 0 +paddw m0 {k1}{z}, m1 ; zero-extend to dwords +%if ARCH_X86_64 +%if mmsize == 64 +vextracti32x8 ym1, m0, 1 +paddd ym0, ym1 +%endif +%if mmsize >= 32 +vextracti128 xm1, ym0, 1 +padddxmm0, xm0, xm1 +%endif +punpckhqdq xmm1, 
xmm0, xmm0 +padddxmm0, xmm1 +movq rax, xmm0 +rorx rdx, rax, 32 +add eax, edx +%else +HADDD m0, m1 +movd eax, xm0 +%endif +RET +%endmacro + +%macro HMAXABSW2 4 ; a, b, tmp1, tmp2 +pabsw m%1, m%1 +pabsw m%2, m%2 +psrldqm%3, m%1, 2 +psrld m%4, m%2, 16 +pmaxswm%1, m%3 +pmaxswm%2, m%4 +%endmacro + +INIT_ZMM avx512 +cglobal
[x265] [PATCH 005 of 307] x86: AVX-512 pixel_avg_weight_w16
# HG changeset patch # User Vignesh Vijayakumar # Date 1498474278 -19800 # Mon Jun 26 16:21:18 2017 +0530 # Node ID 5309fe76c442d720d2d3419eefab11f2a1f9731a # Parent 2e5128235d577806f16e5cf93266dcd7f4155a63 x86: AVX-512 pixel_avg_weight_w16 diff -r 2e5128235d57 -r 5309fe76c442 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jun 23 17:25:27 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jun 26 16:21:18 2017 +0530 @@ -3754,6 +3754,8 @@ p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512); p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); +p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512); +p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx512); } #endif } diff -r 2e5128235d57 -r 5309fe76c442 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmFri Jun 23 17:25:27 2017 +0530 +++ b/source/common/x86/mc-a.asmMon Jun 26 16:21:18 2017 +0530 @@ -3367,11 +3367,11 @@ %endmacro %endif -%macro AVG_END 0 +%macro AVG_END 0-1 2;rows +lea t2, [t2+t3*2*SIZEOF_PIXEL] lea t4, [t4+t5*2*SIZEOF_PIXEL] -lea t2, [t2+t3*2*SIZEOF_PIXEL] lea t0, [t0+t1*2*SIZEOF_PIXEL] -sub eax, 2 +sub eax, %1 jg .height_loop %ifidn movu,movq ; detect MMX EMMS @@ -3434,17 +3434,24 @@ %endmacro %macro BIWEIGHT_START_SSSE3 0 -movzx t6d, byte r6m ; FIXME x86_64 -movt7d, 64 -subt7d, t6d -shlt7d, 8 -addt6d, t7d -movam4, [pw_512] -movd xm3, t6d +movzx t6d, byte r6m ; FIXME x86_64 +%if mmsize > 16 +vbroadcasti128 m4, [pw_512] +%else +mova m4, [pw_512] +%endif +lea t7d, [t6+(64<<8)] +shl t6d, 8 +sub t7d, t6d +%if cpuflag(avx512) +vpbroadcastw m3, t7d +%else +movd xm3, t7d %if cpuflag(avx2) -vpbroadcastw m3, xm3 +vpbroadcastw m3, xm3 %else -SPLATW m3, m3 ; weight_dst,src +SPLATW m3, m3 ; weight_dst,src +%endif %endif %endmacro @@ -3586,6 +3593,34 @@ vextracti128 [t0+t1], m0, 1 AVG_END +INIT_ZMM avx512 + cglobal pixel_avg_weight_w16 +BIWEIGHT_START +AVG_START 5 +.height_loop: +movuxm0, [t2] +movuxm1, [t4] +vinserti128 ym0, [t2+t3], 1 +vinserti128 ym1, 
[t4+t5], 1 +lea t2, [t2+t3*2] +lea t4, [t4+t5*2] +vinserti32x4 m0, [t2], 2 +vinserti32x4 m1, [t4], 2 +vinserti32x4 m0, [t2+t3], 3 +vinserti32x4 m1, [t4+t5], 3 +SBUTTERFLY bw, 0, 1, 2 +pmaddubswm0, m3 +pmaddubswm1, m3 +pmulhrsw m0, m4 +pmulhrsw m1, m4 +packuswb m0, m1 +mova [t0], xm0 +vextracti128 [t0+t1], ym0, 1 +lea t0, [t0+t1*2] +vextracti32x4 [t0], m0, 2 +vextracti32x4 [t0+t1], m0, 3 +AVG_END 4 + cglobal pixel_avg_weight_w32 BIWEIGHT_START AVG_START 5 @@ -4345,6 +4380,10 @@ AVGH 16, 8 AVGH 16, 4 +INIT_XMM avx512 +AVGH 16, 16 +AVGH 16, 8 + %endif ;HIGH_BIT_DEPTH ;--- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 001 of 307] x86: AVX-512 support
# HG changeset patch # User Vignesh Vijayakumar # Date 1498107357 -19800 # Thu Jun 22 10:25:57 2017 +0530 # Node ID d7e105cac1d01fa74adc8f7f7431d33b7e261b4f # Parent e1ed4d609b52a361e758a66f45e8c070dd245211 x86: AVX-512 support diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/cpu.cpp --- a/source/common/cpu.cpp Tue Apr 03 13:49:25 2018 +0530 +++ b/source/common/cpu.cpp Thu Jun 22 10:25:57 2017 +0530 @@ -61,7 +61,7 @@ const cpu_name_t cpu_names[] = { #if X265_ARCH_X86 -#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 | X265_CPU_CMOV +#define MMX2 X265_CPU_MMX | X265_CPU_MMX2 { "MMX2",MMX2 }, { "MMXEXT", MMX2 }, { "SSE", MMX2 | X265_CPU_SSE }, @@ -84,13 +84,13 @@ { "BMI2",AVX | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 }, #define AVX2 AVX | X265_CPU_FMA3 | X265_CPU_LZCNT | X265_CPU_BMI1 | X265_CPU_BMI2 | X265_CPU_AVX2 { "AVX2", AVX2}, +{ "AVX512", AVX2 | X265_CPU_AVX512 }, #undef AVX2 #undef AVX #undef SSE2 #undef MMX2 { "Cache32", X265_CPU_CACHELINE_32 }, { "Cache64", X265_CPU_CACHELINE_64 }, -{ "SlowCTZ", X265_CPU_SLOW_CTZ }, { "SlowAtom",X265_CPU_SLOW_ATOM }, { "SlowPshufb", X265_CPU_SLOW_PSHUFB }, { "SlowPalignr", X265_CPU_SLOW_PALIGNR }, @@ -115,7 +115,7 @@ /* cpu-a.asm */ int PFX(cpu_cpuid_test)(void); void PFX(cpu_cpuid)(uint32_t op, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); -void PFX(cpu_xgetbv)(uint32_t op, uint32_t *eax, uint32_t *edx); +uint64_t PFX(cpu_xgetbv)(int xcr); } #if defined(_MSC_VER) @@ -129,14 +129,14 @@ uint32_t eax, ebx, ecx, edx; uint32_t vendor[4] = { 0 }; uint32_t max_extended_cap, max_basic_cap; +uint64_t xcr0 = 0; #if !X86_64 if (!PFX(cpu_cpuid_test)()) return 0; #endif -PFX(cpu_cpuid)(0, &eax, vendor + 0, vendor + 2, vendor + 1); -max_basic_cap = eax; +PFX(cpu_cpuid)(0, &max_basic_cap, vendor + 0, vendor + 2, vendor + 1); if (max_basic_cap == 0) return 0; @@ -147,27 +147,24 @@ return cpu; if (edx & 0x0200) cpu |= X265_CPU_MMX2 | X265_CPU_SSE; -if (edx & 0x8000) -cpu |= X265_CPU_CMOV; -else -return cpu; if (edx & 
0x0400) cpu |= X265_CPU_SSE2; if (ecx & 0x0001) cpu |= X265_CPU_SSE3; if (ecx & 0x0200) -cpu |= X265_CPU_SSSE3; +cpu |= X265_CPU_SSSE3 | X265_CPU_SSE2_IS_FAST; if (ecx & 0x0008) cpu |= X265_CPU_SSE4; if (ecx & 0x0010) cpu |= X265_CPU_SSE42; -/* Check OXSAVE and AVX bits */ -if ((ecx & 0x1800) == 0x1800) + +if (ecx & 0x0800) /* XGETBV supported and XSAVE enabled by OS */ { /* Check for OS support */ -PFX(cpu_xgetbv)(0, &eax, &edx); -if ((eax & 0x6) == 0x6) +xcr0 = PFX(cpu_xgetbv)(0); +if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */ { +if (ecx & 0x1000) cpu |= X265_CPU_AVX; if (ecx & 0x1000) cpu |= X265_CPU_FMA3; @@ -178,19 +175,24 @@ { PFX(cpu_cpuid)(7, &eax, &ebx, &ecx, &edx); /* AVX2 requires OS support, but BMI1/2 don't. */ -if ((cpu & X265_CPU_AVX) && (ebx & 0x0020)) -cpu |= X265_CPU_AVX2; if (ebx & 0x0008) +cpu |= X265_CPU_BMI1; +if (ebx & 0x0100) +cpu |= X265_CPU_BMI2; + +if ((xcr0 & 0x6) == 0x6) /* XMM/YMM state */ { -cpu |= X265_CPU_BMI1; -if (ebx & 0x0100) -cpu |= X265_CPU_BMI2; +if (ebx & 0x0020) +cpu |= X265_CPU_AVX2; + +if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */ +{ +if ((ebx & 0xD003) == 0xD003) +cpu |= X265_CPU_AVX512; +} } } -if (cpu & X265_CPU_SSSE3) -cpu |= X265_CPU_SSE2_IS_FAST; - PFX(cpu_cpuid)(0x8000, &eax, &ebx, &ecx, &edx); max_extended_cap = eax; @@ -230,8 +232,6 @@ { if (edx & 0x0040) cpu |= X265_CPU_MMX2; -if (!(cpu & X265_CPU_LZCNT)) -cpu |= X265_CPU_SLOW_CTZ; if ((cpu & X265_CPU_SSE2) && !(cpu & X265_CPU_SSE2_IS_FAST)) cpu |= X265_CPU_SSE2_IS_SLOW; /* AMD CPUs come in two types: terrible at SSE and great at it */ } @@ -256,7 +256,6 @@ else if (model == 28) { cpu |= X265_CPU_SLOW_ATOM; -cpu |= X265_CPU_SLOW_CTZ; cpu |= X265_CPU_SLOW_PSHUFB; } diff -r e1ed4d609b52 -r d7e105cac1d0 source/common/x86/cpu-a.asm --- a/source/common/x86/cpu-a.asm Tue Apr 03 13:49:25 2018 +0530 +++ b/source/common/x86/cpu-a.asm Thu Jun 22 10:25:57 2017 +0530 @@ -54,18 +54,16 @@ RET ;--
[x265] [PATCH 002 of 307] x86: Faster SSE2 pixel_sad_16x16 and 16x8
# HG changeset patch # User Vignesh Vijayakumar # Date 1498039639 -19800 # Wed Jun 21 15:37:19 2017 +0530 # Node ID daee70fc99daabd85cfc1245cf257e8b77a158e8 # Parent d7e105cac1d01fa74adc8f7f7431d33b7e261b4f x86: Faster SSE2 pixel_sad_16x16 and 16x8 diff -r d7e105cac1d0 -r daee70fc99da source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Thu Jun 22 10:25:57 2017 +0530 +++ b/source/common/x86/sad-a.asm Wed Jun 21 15:37:19 2017 +0530 @@ -378,111 +378,60 @@ lea r0, [r0 + r1] %endmacro -%macro SAD_W16 0 -;- -; int pixel_sad_16x16( uint8_t *, intptr_t, uint8_t *, intptr_t ) -;- -cglobal pixel_sad_16x16, 4,4,8 -movum0, [r2] -movum1, [r2+r3] -lea r2, [r2+2*r3] -movum2, [r2] -movum3, [r2+r3] -lea r2, [r2+2*r3] -psadbw m0, [r0] -psadbw m1, [r0+r1] -lea r0, [r0+2*r1] -movum4, [r2] -paddw m0, m1 -psadbw m2, [r0] -psadbw m3, [r0+r1] -lea r0, [r0+2*r1] -movum5, [r2+r3] -lea r2, [r2+2*r3] -paddw m2, m3 -movum6, [r2] -movum7, [r2+r3] -lea r2, [r2+2*r3] -paddw m0, m2 -psadbw m4, [r0] -psadbw m5, [r0+r1] -lea r0, [r0+2*r1] -movum1, [r2] -paddw m4, m5 -psadbw m6, [r0] -psadbw m7, [r0+r1] -lea r0, [r0+2*r1] -movum2, [r2+r3] -lea r2, [r2+2*r3] -paddw m6, m7 -movum3, [r2] -paddw m0, m4 -movum4, [r2+r3] -lea r2, [r2+2*r3] -paddw m0, m6 -psadbw m1, [r0] -psadbw m2, [r0+r1] -lea r0, [r0+2*r1] -movum5, [r2] -paddw m1, m2 -psadbw m3, [r0] -psadbw m4, [r0+r1] -lea r0, [r0+2*r1] -movum6, [r2+r3] -lea r2, [r2+2*r3] -paddw m3, m4 -movum7, [r2] -paddw m0, m1 -movum1, [r2+r3] -paddw m0, m3 -psadbw m5, [r0] -psadbw m6, [r0+r1] -lea r0, [r0+2*r1] -paddw m5, m6 -psadbw m7, [r0] -psadbw m1, [r0+r1] -paddw m7, m1 -paddw m0, m5 -paddw m0, m7 -SAD_END_SSE2 - -;- -; int pixel_sad_16x8( uint8_t *, intptr_t, uint8_t *, intptr_t ) -;- -cglobal pixel_sad_16x8, 4,4 -movum0, [r2] -movum2, [r2+r3] -lea r2, [r2+2*r3] -movum3, [r2] -movum4, [r2+r3] -psadbw m0, [r0] -psadbw m2, [r0+r1] -lea r0, [r0+2*r1] -psadbw m3, [r0] -psadbw m4, [r0+r1] -lea r0, [r0+2*r1] -lea r2, [r2+2*r3] -paddw m0, m2 -paddw m3, m4 
-paddw m0, m3 -movum1, [r2] -movum2, [r2+r3] -lea r2, [r2+2*r3] -movum3, [r2] -movum4, [r2+r3] -psadbw m1, [r0] -psadbw m2, [r0+r1] -lea r0, [r0+2*r1] -psadbw m3, [r0] -psadbw m4, [r0+r1] -lea r0, [r0+2*r1] -lea r2, [r2+2*r3] -paddw m1, m2 -paddw m3, m4 -paddw m0, m1 -paddw m0, m3 -SAD_END_SSE2 - +%macro SAD_W16 1 ; h +cglobal pixel_sad_16x%1, 4,4 +%assign %%i 0 +%if ARCH_X86_64 +lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile +lea r5, [3*r3] +%rep %1/4 +movu m1, [r2] +psadbw m1, [r0] +movu m3, [r2+r3] +psadbw m3, [r0+r1] +movu m2, [r2+2*r3] +psadbw m2, [r0+2*r1] +movu m4, [r2+r5] +psadbw m4, [r0+r6] +%if %%i != %1/4-1 +lea r2, [r2+4*r3] +lea r0, [r0+4*r1] +%endif +paddwm1, m3 +paddwm2, m4 +ACCUM paddw, 0, 1, %%i +paddwm0, m2 +%assign %%i %%i+1 +%endrep +%else ; The cost of having to save and restore registers on x86-32 +%rep %1/2 ; nullifies the benefit of having 3*stride in registers. +movu m1, [r2] +psadbw m1, [r0] +movu m2, [r2+r3] +psadbw m2, [r0+r1] +%if %%i != %1/2-1 +lea r2, [r2+2*r3] +lea r0, [r0+2*r1] +%endif +ACCUM paddw, 0, 1, %%i +paddwm0, m2 +%assign %%i %%i+1 +%endrep +%endif + SAD_END_SSE2 + %endmacro + +INIT_XMM sse2 +SAD_W16 8 +SAD_W16 16 +INIT_XMM sse3 +SAD_W16 8 +SAD_W16 16 +INIT_XMM sse2, aligned +SAD_W16 8 +SAD_W16 16 + +%macro SAD_Wx 0 ;- ; int pixel_sad_16x12( uint8_t *, intptr_t, uint8_t *, intptr_t ) ;- @@ -808,11 +757,11 @@ %endmacro INIT_XMM sse2 -SAD_W16 +SAD_Wx INIT_XMM sse3 -SAD_W16 +SAD_Wx INIT_XMM sse2, aligned -SAD_W16 +SAD_Wx %macro SAD_INC_4x8P_SSE 1 movqm1, [r0] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan
[x265] [PATCH 000 of 307] AVX-512 implementation in x265
This series of patches enables AVX-512 in x265. Use the CLI option --asm avx512 to enable the AVX-512 kernels. ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 007 of 307] x86: AVX-512 pixel_sa8d_8x8
# HG changeset patch # User Jayashri Murugan # Date 1498473664 -19800 # Mon Jun 26 16:11:04 2017 +0530 # Node ID 03a532a9ab714b0081aede28e1773022d2be20b6 # Parent c7b36dac20317b3819fb30cf437a029a2ce7ca99 x86: AVX-512 pixel_sa8d_8x8 diff -r c7b36dac2031 -r 03a532a9ab71 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jun 26 16:31:02 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jun 26 16:11:04 2017 +0530 @@ -3752,6 +3752,9 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512); +p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sa8d = PFX(pixel_sa8d_8x8_avx512); + p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512); p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512); @@ -3759,6 +3762,7 @@ p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx512); p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512); p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512); + } #endif } diff -r c7b36dac2031 -r 03a532a9ab71 source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Mon Jun 26 16:31:02 2017 +0530 +++ b/source/common/x86/pixel-a.asm Mon Jun 26 16:11:04 2017 +0530 @@ -8187,7 +8187,7 @@ HMAXABSW2 0, 1, 2, 3 %endmacro -%macro SATD_AVX512_END 0 +%macro SATD_AVX512_END 0-1 0 ; sa8d paddw m0 {k1}{z}, m1 ; zero-extend to dwords %if ARCH_X86_64 %if mmsize == 64 @@ -8202,10 +8202,19 @@ padddxmm0, xmm1 movq rax, xmm0 rorx rdx, rax, 32 +%if %1 +lea eax, [rax+rdx+1] +shr eax, 1 +%else add eax, edx +%endif %else HADDD m0, m1 movd eax, xm0 +%if %1 +inc eax +shr eax, 1 +%endif %endif RET %endmacro @@ -8350,6 +8359,29 @@ SWAP 0, 1 SATD_AVX512_END +INIT_ZMM avx512 +cglobal pixel_sa8d_8x8, 4,6 +vbroadcasti64x4 m4, [hmul_16p] +mov r4d, 0x +kmovdk1, r4d ; 01010101 +kshiftlb k2, k1, 5 ; 1010 +kshiftlb k3, k1, 4 ; 0101 +lea r4, 
[3*r1] +lea r5, [3*r3] +SATD_AVX512_LOAD8 q, ym, k1, k2, k3 ; 2 0 2 0 6 4 6 4 +DIFF_SUMSUB_SSSE3 0, 2, 1, 3, 4 ; 3 1 3 1 7 5 7 5 +SUMSUB_BA w, 0, 1, 2 +SBUTTERFLY qdq, 0, 1, 2 +SUMSUB_BA w, 0, 1, 2 +shufpsm2, m0, m1, q2020 +shufpsm1, m0, m1, q3131 +SUMSUB_BA w, 2, 1, 0 +vshufi32x4m0, m2, m1, q1010 +vshufi32x4m1, m2, m1, q3232 +SUMSUB_BA w, 0, 1, 2 +HMAXABSW2 0, 1, 2, 3 +SATD_AVX512_END 1 + ; Input 10bit, Output 8bit ; ;void planecopy_sc(uint16_t *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 004 of 307] x86: AVX-512 pixel_var_8x8 and 16x16
# HG changeset patch # User Vignesh Vijayakumar # Date 1498218927 -19800 # Fri Jun 23 17:25:27 2017 +0530 # Node ID 2e5128235d577806f16e5cf93266dcd7f4155a63 # Parent 6ce366c4e4919a4f1641234824f6bf4f128df400 x86: AVX-512 pixel_var_8x8 and 16x16 diff -r 6ce366c4e491 -r 2e5128235d57 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jun 21 12:43:06 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jun 23 17:25:27 2017 +0530 @@ -2188,6 +2188,10 @@ p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2); } } +if (cpuMask & X265_CPU_AVX512) +{ +p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); +} } #else // if HIGH_BIT_DEPTH @@ -3748,6 +3752,8 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512); +p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512); +p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); } #endif } diff -r 6ce366c4e491 -r 2e5128235d57 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Wed Jun 21 12:43:06 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Fri Jun 23 17:25:27 2017 +0530 @@ -28,6 +28,8 @@ SECTION_RODATA 32 +var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1 + db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1 %if BIT_DEPTH == 12 ssim_c1: times 4 dd 107321.76; .01*.01*4095*4095*64 ssim_c2: times 4 dd 60851437.92 ; .03*.03*4095*4095*64*63 @@ -5757,7 +5759,7 @@ %if HIGH_BIT_DEPTH == 0 %if %1 mova m7, [pw_00ff] -%elif mmsize < 32 +%elif mmsize == 16 pxor m7, m7; zero %endif %endif ; !HIGH_BIT_DEPTH @@ -6476,6 +6478,118 @@ RET %endif ; !HIGH_BIT_DEPTH +%macro VAR_AVX512_CORE 1 ; accum +%if %1 +paddwm0, m2 +pmaddwd m2, m2 +paddwm0, m3 +pmaddwd m3, m3 +padddm1, m2 +padddm1, m3 +%else +paddwm0, m2, m3 +pmaddwd m2, m2 +pmaddwd m3, m3 +padddm1, m2, m3 +%endif +%endmacro + +%macro VAR_AVX512_CORE_16x16 1 ; accum +%if HIGH_BIT_DEPTH +movaym2, [r0] +vinserti64x4 m2, [r0+r1], 1 
+movaym3, [r0+2*r1] +vinserti64x4 m3, [r0+r3], 1 +%else +vbroadcasti64x2 ym2, [r0] +vbroadcasti64x2 m2 {k1}, [r0+r1] +vbroadcasti64x2 ym3, [r0+2*r1] +vbroadcasti64x2 m3 {k1}, [r0+r3] +pshufb m2, m4 +pshufb m3, m4 +%endif +VAR_AVX512_CORE %1 +%endmacro + +%macro VAR_AVX512_CORE_8x8 1 ; accum +%if HIGH_BIT_DEPTH +movaxm2, [r0] +movaxm3, [r0+r1] +%else +movqxm2, [r0] +movqxm3, [r0+r1] +%endif +vinserti128 ym2, [r0+2*r1], 1 +vinserti128 ym3, [r0+r2], 1 +lea r0, [r0+4*r1] +vinserti32x4 m2, [r0], 2 +vinserti32x4 m3, [r0+r1], 2 +vinserti32x4 m2, [r0+2*r1], 3 +vinserti32x4 m3, [r0+r2], 3 +%if HIGH_BIT_DEPTH == 0 +punpcklbwm2, m4 +punpcklbwm3, m4 +%endif +VAR_AVX512_CORE %1 +%endmacro + +INIT_ZMM avx512 +cglobal pixel_var_16x16, 2,4 +FIX_STRIDES r1 +movr2d, 0xf0 +lea r3, [3*r1] +%if HIGH_BIT_DEPTH == 0 +vbroadcasti64x4 m4, [var_shuf_avx512] +kmovb k1, r2d +%endif +VAR_AVX512_CORE_16x16 0 +.loop: +lea r0, [r0+4*r1] +VAR_AVX512_CORE_16x16 1 +subr2d, 0x50 +jg .loop +%if ARCH_X86_64 == 0 +popr3d +%assign regs_used 3 +%endif +var_avx512_end: +vbroadcasti32x4 m2, [pw_1] +pmaddwd m0, m2 +SBUTTERFLY dq, 0, 1, 2 +paddd m0, m1 +vextracti32x8 ym1, m0, 1 +paddd ym0, ym1 +vextracti128 xm1, ym0, 1 +paddd xmm0, xm0, xm1 +punpckhqdqxmm1, xmm0, xmm0 +paddd xmm0, xmm1 +%if ARCH_X86_64 +movq rax, xmm0 +%else +movd eax, xmm0 +pextrd edx, xmm0, 1 + %endif + RET + +%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth +cglobal pixel_var_8x8, 2,3 +lea r2, [3*r1] +pxor xm4, xm4 +VAR_AVX512_CORE_8x8 0 +jmp var_avx512_end +%endif + +cglobal pixel_var_8x16, 2,3 +FIX_STRIDES r1 +lea r2, [3*r1] +%if HIGH_BIT_DEPTH == 0 +pxor xm4, xm4 +%endif +VAR_AVX512_CORE_8x8 0 +lea r0, [r0+4*r1] +VAR_AVX512_CORE_8x8 1 +jmp var_avx512_end + %macro VAR2_END 3 HADDW %2, xm1 movd r1d, %2 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 006 of 307] x86: AVX-512 pixel_avg_weight_w8
# HG changeset patch # User Vignesh Vijayakumar # Date 1498474862 -19800 # Mon Jun 26 16:31:02 2017 +0530 # Node ID c7b36dac20317b3819fb30cf437a029a2ce7ca99 # Parent 5309fe76c442d720d2d3419eefab11f2a1f9731a x86: AVX-512 pixel_avg_weight_w8 diff -r 5309fe76c442 -r c7b36dac2031 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jun 26 16:21:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jun 26 16:31:02 2017 +0530 @@ -3756,6 +3756,9 @@ p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512); p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx512); +p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx512); +p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512); +p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512); } #endif } diff -r 5309fe76c442 -r c7b36dac2031 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmMon Jun 26 16:21:18 2017 +0530 +++ b/source/common/x86/mc-a.asmMon Jun 26 16:31:02 2017 +0530 @@ -3574,6 +3574,38 @@ AVG_WEIGHT 24, 7 AVG_WEIGHT 48, 7 +INIT_YMM avx512 +cglobal pixel_avg_weight_w8 +BIWEIGHT_START +kxnorb k1, k1, k1 +kaddb k1, k1, k1 +AVG_START 5 +.height_loop: +movq xm0, [t2] +movq xm2, [t4] +movq xm1, [t2+t3] +movq xm5, [t4+t5] +leat2, [t2+t3*2] +leat4, [t4+t5*2] +vpbroadcastq m0 {k1}, [t2] +vpbroadcastq m2 {k1}, [t4] +vpbroadcastq m1 {k1}, [t2+t3] +vpbroadcastq m5 {k1}, [t4+t5] +punpcklbw m0, m2 +punpcklbw m1, m5 +pmaddubsw m0, m3 +pmaddubsw m1, m3 +pmulhrsw m0, m4 +pmulhrsw m1, m4 +packuswb m0, m1 +vextracti128 xmm1, m0, 1 +movq [t0], xm0 +movhps[t0+t1], xm0 +leat0, [t0+t1*2] +movq [t0], xmm1 +movhps[t0+t1], xmm1 +AVG_END 4 + INIT_YMM avx2 cglobal pixel_avg_weight_w16 BIWEIGHT_START @@ -4383,6 +4415,9 @@ INIT_XMM avx512 AVGH 16, 16 AVGH 16, 8 +AVGH 8, 16 +AVGH 8, 8 +AVGH 8, 4 %endif ;HIGH_BIT_DEPTH ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 011 of 307] x86: AVX512 blockcopy_sp_64x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499320932 -19800 # Thu Jul 06 11:32:12 2017 +0530 # Node ID 1321369efdf990d960db9a6fbe0181f086ba90f9 # Parent 328d10aa0ff4d3097ff4941e224d2cdf6774a7c8 x86: AVX512 blockcopy_sp_64x64 AVX2 performance over C code : 6.77x AVX512 performance over C code : 8.46x diff -r 328d10aa0ff4 -r 1321369efdf9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 06 17:03:15 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 06 11:32:12 2017 +0530 @@ -3777,6 +3777,8 @@ p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512); p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512); +p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512); + } #endif } diff -r 328d10aa0ff4 -r 1321369efdf9 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Thu Jul 06 17:03:15 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Thu Jul 06 11:32:12 2017 +0530 @@ -2121,6 +2121,53 @@ BLOCKCOPY_SP_W64_H4_avx2 64, 64 +%macro BLOCKCOPY_SP_W64_H4_avx512 2 +INIT_ZMM avx512 +cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride +movr4d, %2/4 +addr3, r3 +lear5, [3 * r3] +lear6, [3 * r1] + +.loop: +movu m0, [r2] +movu m1, [r2 + 64] +movu m2, [r2 + r3] +movu m3, [r2 + r3 + 64] + +packuswb m0, m1 +packuswb m2, m3 +vpermq m0, m0, 11011000b +vpermq m2, m2, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m2, m2, 11011000b +movu [r0], m0 +movu [r0 + r1], m2 + +movu m0, [r2 + 2 * r3] +movu m1, [r2 + 2 * r3 + 64] +movu m2, [r2 + r5] +movu m3, [r2 + r5 + 64] + +packuswb m0, m1 +packuswb m2, m3 +vpermq m0, m0, 11011000b +vpermq m2, m2, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m2, m2, 11011000b +movu [r0 + 2 * r1], m0 +movu [r0 + r6], m2 + +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] + +decr4d +jnz.loop +RET +%endmacro + +BLOCKCOPY_SP_W64_H4_avx512 64, 64 + ;- ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val) ;- diff -r 328d10aa0ff4 -r 1321369efdf9 
source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.hThu Jul 06 17:03:15 2017 +0530 +++ b/source/common/x86/blockcopy8.hThu Jul 06 11:32:12 2017 +0530 @@ -57,6 +57,7 @@ FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_sp, avx2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); +FUNCDEF_PU(void, blockcopy_sp, avx512, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 008 of 307] x86: AVX-512 pixel_sad for 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, and 16x16
# HG changeset patch # User Jayashri Murugan # Date 1498474912 -19800 # Mon Jun 26 16:31:52 2017 +0530 # Node ID 69b61721fa2ffdf1a0f6609a299c4e0104b48628 # Parent 03a532a9ab714b0081aede28e1773022d2be20b6 x86: AVX-512 pixel_sad for 4x4, 4x8, 4x16, 8x4, 8x8, 8x16, 16x8, and 16x16. diff -r 03a532a9ab71 -r 69b61721fa2f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jun 26 16:11:04 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jun 26 16:31:52 2017 +0530 @@ -3763,6 +3763,15 @@ p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512); p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512); +p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_avx512); +p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_avx512); +p.pu[LUMA_4x16].sad = PFX(pixel_sad_4x16_avx512); +p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_avx512); +p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_avx512); +p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_avx512); +p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512); +p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512); + } #endif } diff -r 03a532a9ab71 -r 69b61721fa2f source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Mon Jun 26 16:11:04 2017 +0530 +++ b/source/common/x86/sad-a.asm Mon Jun 26 16:31:52 2017 +0530 @@ -380,6 +380,9 @@ %macro SAD_W16 1 ; h cglobal pixel_sad_16x%1, 4,4 +%ifidn cpuname, sse2 +.skip_prologue: +%endif %assign %%i 0 %if ARCH_X86_64 lea r6, [3*r1] ; r6 results in fewer REX prefixes than r4 and both are volatile @@ -790,7 +793,132 @@ SAD_INC_4x8P_SSE 1 SAD_INC_4x8P_SSE 1 SAD_END_SSE2 + +%macro SAD_W48_AVX512 3 ; w, h, d/q +cglobal pixel_sad_%1x%2, 4,4 +kxnorbk1, k1, k1 +kaddb k1, k1, k1 +%assign %%i 0 +%if ARCH_X86_64 && %2 != 4 +lea r6, [3*r1] +lea r5, [3*r3] +%rep %2/4 +mov%3 m1, [r0] +vpbroadcast%3 m1 {k1}, [r0+r1] +mov%3 m3, [r2] +vpbroadcast%3 m3 {k1}, [r2+r3] +mov%3 m2, [r0+2*r1] +vpbroadcast%3 m2 {k1}, [r0+r6] +mov%3 m4, [r2+2*r3] +vpbroadcast%3 m4 {k1}, [r2+r5] +%if %%i != %2/4-1 +lea r0, [r0+4*r1] +lea r2, [r2+4*r3] 
+%endif +psadbwm1, m3 +psadbwm2, m4 +ACCUM paddd, 0, 1, %%i +paddd m0, m2 +%assign %%i %%i+1 +%endrep +%else +%rep %2/2 +mov%3 m1, [r0] +vpbroadcast%3 m1 {k1}, [r0+r1] +mov%3 m2, [r2] +vpbroadcast%3 m2 {k1}, [r2+r3] +%if %%i != %2/2-1 +lea r0, [r0+2*r1] +lea r2, [r2+2*r3] +%endif +psadbwm1, m2 +ACCUM paddd, 0, 1, %%i +%assign %%i %%i+1 +%endrep +%endif +%if %1 == 8 +punpckhqdqm1, m0, m0 +paddd m0, m1 +%endif +movd eax, m0 RET +%endmacro + +INIT_XMM avx512 +SAD_W48_AVX512 4, 4, d +SAD_W48_AVX512 4, 8, d +SAD_W48_AVX512 4, 16, d +SAD_W48_AVX512 8, 4, q +SAD_W48_AVX512 8, 8, q +SAD_W48_AVX512 8, 16, q + +%macro SAD_W16_AVX512_START 1 ; h +cmp r1d, 16 ; optimized for width = 16, which has the +jne pixel_sad_16x%1_sse2.skip_prologue ; rows laid out contiguously in memory +lea r1, [3*r3] +%endmacro + +%macro SAD_W16_AVX512_END 0 +paddd m0, m1 +paddd m0, m2 +paddd m0, m3 +%if mmsize == 64 +vextracti32x8 ym1, m0, 1 +paddd ym0, ym1 +%endif +vextracti128 xm1, ym0, 1 +padddxmm0, xm0, xm1 +punpckhqdq xmm1, xmm0, xmm0 +padddxmm0, xmm1 +movd eax, xmm0 +RET +%endmacro + +INIT_YMM avx512 +cglobal pixel_sad_16x8, 4,4 +SAD_W16_AVX512_START 8 +movu xm0, [r2] +vinserti128 m0, [r2+r3], 1 +psadbwm0, [r0+0*32] +movu xm1, [r2+2*r3] +vinserti128 m1, [r2+r1], 1 +lea r2, [r2+4*r3] +psadbwm1, [r0+1*32] +movu xm2, [r2] +vinserti128 m2, [r2+r3], 1 +psadbwm2, [r0+2*32] +movu xm3, [r2+2*r3] +vinserti128 m3, [r2+r1], 1 +psadbwm3, [r0+3*32] +SAD_W16_AVX512_END + +INIT_ZMM avx512 +cglobal pixel_sad_16x16, 4,4 +SAD_W16_AVX512_START 16 +movu xm0, [r2] +vinserti128 ym0, [r2+r3], 1 +movu xm1, [r2+4*r3] +vinserti32x4 m0, [r2+2*r3], 2 +vinserti32x4 m1, [r2+2*r1], 2 +vinserti32x4 m0, [r2+r1], 3 +lear2, [r2+4*r3] +vinserti32x4 m1, [r2+r3], 1 +psadbw m0, [r0+0*64] +vinserti32x4 m1, [r2+r1], 3 +lear2, [r2+4*r3] +psadbw m1, [r0+1*64] +movu xm2, [r2] +vinserti128 ym2, [r2+r3], 1 +movu xm3, [r2+4*r3] +vinserti32x4 m2, [r2+2*r3], 2 +vinserti32x4 m3, [r2+2*r1], 2 +vinserti32x4 m2, [r2+r1], 3 +lear2, [r2+4*r3] 
+vinserti32x4
[x265] [PATCH 009 of 307] x86: AVX512 pixel_avg_weight 8x32, 16x4, 16x12, 16x32, 16x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499231221 -19800 # Wed Jul 05 10:37:01 2017 +0530 # Node ID 84757e275a5427f0875fc4fd651bd1c48d534e8f # Parent 69b61721fa2ffdf1a0f6609a299c4e0104b48628 x86: AVX512 pixel_avg_weight 8x32, 16x4, 16x12, 16x32, 16x64 diff -r 69b61721fa2f -r 84757e275a54 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jun 26 16:31:52 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 05 10:37:01 2017 +0530 @@ -3757,8 +3757,13 @@ p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512); p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); +p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512); +p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512); p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512); +p.pu[LUMA_16x12].pixelavg_pp = PFX(pixel_avg_16x12_avx512); p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx512); +p.pu[LUMA_16x4].pixelavg_pp = PFX(pixel_avg_16x4_avx512); +p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx512); p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx512); p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512); p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512); diff -r 69b61721fa2f -r 84757e275a54 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmMon Jun 26 16:31:52 2017 +0530 +++ b/source/common/x86/mc-a.asmWed Jul 05 10:37:01 2017 +0530 @@ -4413,8 +4413,13 @@ AVGH 16, 4 INIT_XMM avx512 +AVGH 16, 64 +AVGH 16, 32 AVGH 16, 16 +AVGH 16, 12 AVGH 16, 8 +AVGH 16, 4 +AVGH 8, 32 AVGH 8, 16 AVGH 8, 8 AVGH 8, 4 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 010 of 307] Correct the asm type of pixel_avg_weight_w32 and pixel_avg_weight_w64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499340795 -19800 # Thu Jul 06 17:03:15 2017 +0530 # Node ID 328d10aa0ff4d3097ff4941e224d2cdf6774a7c8 # Parent 84757e275a5427f0875fc4fd651bd1c48d534e8f Correct the asm type of pixel_avg_weight_w32 and pixel_avg_weight_w64 diff -r 84757e275a54 -r 328d10aa0ff4 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmWed Jul 05 10:37:01 2017 +0530 +++ b/source/common/x86/mc-a.asmThu Jul 06 17:03:15 2017 +0530 @@ -3653,6 +3653,7 @@ vextracti32x4 [t0+t1], m0, 3 AVG_END 4 +INIT_YMM avx2 cglobal pixel_avg_weight_w32 BIWEIGHT_START AVG_START 5 @@ -3668,6 +3669,7 @@ mova[t0], m0 AVG_END +INIT_YMM avx2 cglobal pixel_avg_weight_w64 BIWEIGHT_START AVG_START 5 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 012 of 307] x86: AVX512 blockcopy_ps_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1499340573 -19800 # Thu Jul 06 16:59:33 2017 +0530 # Node ID f5c54a1c4a550e9c6df6a1ef0a4462fd23c4a530 # Parent 1321369efdf990d960db9a6fbe0181f086ba90f9 x86: AVX512 blockcopy_ps_32xN AVX2 performance over C code:2.39x AVX512 performance over C code : 3.62x diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 06 11:32:12 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 06 16:59:33 2017 +0530 @@ -3778,6 +3778,9 @@ p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512); p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512); +p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); +p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); +p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512); } #endif diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Thu Jul 06 11:32:12 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Thu Jul 06 16:59:33 2017 +0530 @@ -3124,6 +3124,36 @@ BLOCKCOPY_PS_W32_H4_avx2 32, 32 BLOCKCOPY_PS_W32_H4_avx2 32, 64 +%macro BLOCKCOPY_PS_W32_H4_avx512 2 +INIT_ZMM avx512 +cglobal blockcopy_ps_%1x%2, 4, 7, 4 +add r1, r1 +mov r4d, %2/8 +lea r5, [3 * r3] +lea r6, [3 * r1] +.loop: +%rep 2 +pmovzxbw m0, [r2] +pmovzxbw m1, [r2 + r3] +pmovzxbw m2, [r2 + r3 * 2] +pmovzxbw m3, [r2 + r5] + +movu [r0], m0 +movu [r0 + r1], m1 +movu [r0 + r1 * 2], m2 +movu [r0 + r6], m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +%endrep +dec r4d +jnz .loop +RET +%endmacro + +BLOCKCOPY_PS_W32_H4_avx512 32, 32 +BLOCKCOPY_PS_W32_H4_avx512 32, 64 + ;- ; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;- diff -r 1321369efdf9 -r f5c54a1c4a55 source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.hThu Jul 06 11:32:12 2017 +0530 +++ 
b/source/common/x86/blockcopy8.hThu Jul 06 16:59:33 2017 +0530 @@ -61,5 +61,6 @@ FUNCDEF_PU(void, blockcopy_ps, sse2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_ps, sse4, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_ps, avx2, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); +FUNCDEF_PU(void, blockcopy_ps, avx512, int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); #endif // ifndef X265_I386_PIXEL_H ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 013 of 307] x86: AVX512 blockcopy_ps_64x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499340609 -19800 # Thu Jul 06 17:00:09 2017 +0530 # Node ID e59a457cfe6c0e2cd4137bf3337a2a2d0a815850 # Parent f5c54a1c4a550e9c6df6a1ef0a4462fd23c4a530 x86: AVX512 blockcopy_ps_64x64 AVX2 performance over C code:1.82x AVX512 performance over C code : 3.51x diff -r f5c54a1c4a55 -r e59a457cfe6c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 06 16:59:33 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 06 17:00:09 2017 +0530 @@ -3781,6 +3781,7 @@ p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512); +p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx512); } #endif diff -r f5c54a1c4a55 -r e59a457cfe6c source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Thu Jul 06 16:59:33 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Thu Jul 06 17:00:09 2017 +0530 @@ -3340,6 +3340,42 @@ RET ;- +; void blockcopy_ps_%1x%2(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); +;- +INIT_ZMM avx512 +cglobal blockcopy_ps_64x64, 4, 7, 4 +add r1, r1 +mov r4d, 64/8 +lea r5, [3 * r3] +lea r6, [3 * r1] +.loop: +%rep 2 +pmovzxbw m0, [r2] +pmovzxbw m1, [r2 + 32] +pmovzxbw m2, [r2 + r3] +pmovzxbw m3, [r2 + r3 + 32] +movu [r0], m0 +movu [r0 + 64], m1 +movu [r0 + r1], m2 +movu [r0 + r1 + 64], m3 + +pmovzxbw m0, [r2 + r3 * 2] +pmovzxbw m1, [r2 + r3 * 2 + 32] +pmovzxbw m2, [r2 + r5] +pmovzxbw m3, [r2 + r5 + 32] +movu [r0 + r1 * 2], m0 +movu [r0 + r1 * 2 + 64], m1 +movu [r0 + r6], m2 +movu [r0 + r6 + 64], m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +%endrep +dec r4d +jnz .loop +RET + +;- ; void blockcopy_ss_2x4(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;- INIT_XMM sse2 ___ x265-devel mailing list x265-devel@videolan.org 
https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 016 of 307] x86: AVX512 addAvg_W64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499410957 -19800 # Fri Jul 07 12:32:37 2017 +0530 # Node ID abdbd144d5b4cdfc7f84b540d713147d9b5143fc # Parent cc3a93869b28b7d5b3478a2524d07e7e630a0eca x86: AVX512 addAvg_W64 Size| AVX2 performance | AVX512 performance -- 64x16| 14.46x | 22.25x 64x32| 13.93x | 23.96x 64x48| 13.90x | 24.27x 64x64| 14.74x | 24.31x diff -r cc3a93869b28 -r abdbd144d5b4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 07 12:32:37 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 07 12:32:37 2017 +0530 @@ -3790,6 +3790,11 @@ p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); +p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512); +p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512); +p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); +p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); + } #endif } diff -r cc3a93869b28 -r abdbd144d5b4 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmFri Jul 07 12:32:37 2017 +0530 +++ b/source/common/x86/mc-a.asmFri Jul 07 12:32:37 2017 +0530 @@ -2951,6 +2951,65 @@ ADDAVG_W64_H2_AVX512 48 ADDAVG_W64_H2_AVX512 64 +%macro ADDAVG_W64_H2_AVX512 1 +INIT_ZMM avx512 +cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride +vbroadcasti32x8 m4, [pw_256] +vbroadcasti32x8 m5, [pw_128] +add r3, r3 +add r4, r4 +mov r6d, %1/16 + +.loop: +%rep 8 +movum0, [r0] +movum1, [r1] +movum2, [r0 + 64] +movum3, [r1 + 64] +paddw m0, m1 +pmulhrswm0, m4 +paddw m0, m5 +paddw m2, m3 +pmulhrswm2, m4 +paddw m2, m5 + +packuswbm0, m2 +vpermq m0, m0, 11011000b +vshufi64x2 m0, m0, 11011000b +movu[r2], m0 + + +movum0, [r0 + r3] +movum1, [r1 + r4] +movum2, [r0 + r3 + 64] +movum3, [r1 + r4 + 64] +paddw m0, m1 +pmulhrswm0, m4 +paddw m0, m5 +paddw m2, m3 +pmulhrswm2, m4 +paddw m2, m5 + +packuswbm0, m2 +vpermq m0, m0, 11011000b +vshufi64x2 m0, m0, 11011000b +movu[r2 + r5], m0 + +lea r2, [r2 + 2 * r5] +lea 
r0, [r0 + 2 * r3] +lea r1, [r1 + 2 * r4] +%endrep + +dec r6d +jnz .loop +RET +%endmacro + +ADDAVG_W64_H2_AVX512 16 +ADDAVG_W64_H2_AVX512 32 +ADDAVG_W64_H2_AVX512 48 +ADDAVG_W64_H2_AVX512 64 + %macro ADDAVG_W48_H2_AVX2 1 INIT_YMM avx2 cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 018 of 307] x86: AVX512 pixel_sad_64xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1499689280 -19800 # Mon Jul 10 17:51:20 2017 +0530 # Node ID 7d0bff5d6f2e1d2fe8609a3e498b1ccc149a10e9 # Parent 6c409d2363c42f485748c5a9d3f4b209f58e6aa5 x86: AVX512 pixel_sad_64xN Size| AVX2 performance | AVX512 performance 64x16 | 53.37x | 87.20x 64x32 | 63.88x | 104.01x 64x48 | 71.80x | 111.25x 64x64 | 74.98x | 118.60x diff -r 6c409d2363c4 -r 7d0bff5d6f2e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jul 10 12:10:44 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jul 10 17:51:20 2017 +0530 @@ -3726,6 +3726,11 @@ } if (cpuMask & X265_CPU_AVX512) { +p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512); +p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512); +p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); +p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); + p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512); p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512); p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512); diff -r 6c409d2363c4 -r 7d0bff5d6f2e source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Mon Jul 10 12:10:44 2017 +0530 +++ b/source/common/x86/sad-a.asm Mon Jul 10 17:51:20 2017 +0530 @@ -6215,4 +6215,42 @@ movdeax, xm0 RET +;- +; int pixel_sad_64x%1( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;- +%macro PIXEL_SAD_W64_AVX512 1 +INIT_ZMM avx512 +cglobal pixel_sad_64x%1, 4,5,6 +xorps m0, m0 +xorps m5, m5 + +%rep %1/2 +movu m1, [r0] ; first 64 of row 0 of pix0 +movu m2, [r2] ; first 64 of row 0 of pix1 +movu m3, [r0 + r1] ; first 64 of row 1 of pix0 +movu m4, [r2 + r3] ; first 64 of row 1 of pix1 +psadbw m1, m2 +psadbw m3, m4 +paddd m0, m1 +paddd m5, m3 +lear2, [r2 + 2 * r3] +lear0, [r0 + 2 * r1] +%endrep + +paddd m0, m5 +vextracti32x8 ym1, m0, 1 +paddd ym0, ym1 +vextracti64x2 xm1, m0, 1 +paddd xm0, xm1 +pshufd xm1, xm0, 2 +paddd xm0, xm1 +movd eax, xm0 +RET +%endmacro + +PIXEL_SAD_W64_AVX512 16 +PIXEL_SAD_W64_AVX512 32 +PIXEL_SAD_W64_AVX512 48 
+PIXEL_SAD_W64_AVX512 64 + %endif ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 017 of 307] Backed out changeset: 5f3ebc0b5512
# HG changeset patch # User Praveen Tiwari # Date 1499668844 -19800 # Mon Jul 10 12:10:44 2017 +0530 # Node ID 6c409d2363c42f485748c5a9d3f4b209f58e6aa5 # Parent abdbd144d5b4cdfc7f84b540d713147d9b5143fc Backed out changeset: 5f3ebc0b5512 diff -r abdbd144d5b4 -r 6c409d2363c4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 07 12:32:37 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jul 10 12:10:44 2017 +0530 @@ -3790,11 +3790,6 @@ p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); -p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512); -p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512); -p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); -p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); - } #endif } diff -r abdbd144d5b4 -r 6c409d2363c4 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmFri Jul 07 12:32:37 2017 +0530 +++ b/source/common/x86/mc-a.asmMon Jul 10 12:10:44 2017 +0530 @@ -2951,65 +2951,6 @@ ADDAVG_W64_H2_AVX512 48 ADDAVG_W64_H2_AVX512 64 -%macro ADDAVG_W64_H2_AVX512 1 -INIT_ZMM avx512 -cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride -vbroadcasti32x8 m4, [pw_256] -vbroadcasti32x8 m5, [pw_128] -add r3, r3 -add r4, r4 -mov r6d, %1/16 - -.loop: -%rep 8 -movum0, [r0] -movum1, [r1] -movum2, [r0 + 64] -movum3, [r1 + 64] -paddw m0, m1 -pmulhrswm0, m4 -paddw m0, m5 -paddw m2, m3 -pmulhrswm2, m4 -paddw m2, m5 - -packuswbm0, m2 -vpermq m0, m0, 11011000b -vshufi64x2 m0, m0, 11011000b -movu[r2], m0 - - -movum0, [r0 + r3] -movum1, [r1 + r4] -movum2, [r0 + r3 + 64] -movum3, [r1 + r4 + 64] -paddw m0, m1 -pmulhrswm0, m4 -paddw m0, m5 -paddw m2, m3 -pmulhrswm2, m4 -paddw m2, m5 - -packuswbm0, m2 -vpermq m0, m0, 11011000b -vshufi64x2 m0, m0, 11011000b -movu[r2 + r5], m0 - -lea r2, [r2 + 2 * r5] -lea r0, [r0 + 2 * r3] -lea r1, [r1 + 2 * r4] -%endrep - -dec r6d -jnz .loop -RET -%endmacro - -ADDAVG_W64_H2_AVX512 16 
-ADDAVG_W64_H2_AVX512 32 -ADDAVG_W64_H2_AVX512 48 -ADDAVG_W64_H2_AVX512 64 - %macro ADDAVG_W48_H2_AVX2 1 INIT_YMM avx2 cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 019 of 307] x86: AVX512 pixel_sad_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1499747730 -19800 # Tue Jul 11 10:05:30 2017 +0530 # Node ID 40ab4480d070fca77c35c97c7c229b25d9a98a8a # Parent 7d0bff5d6f2e1d2fe8609a3e498b1ccc149a10e9 x86: AVX512 pixel_sad_32xN Size| AVX2 performance | AVX512 performance 32x8| 40.52x | 53.46x 32x16 | 58.49x | 52.20x 32x24 | 60.62x | 70.37x 32x32 | 52.25x | 58.86x 32x64 | 68.28x | 64.03x diff -r 7d0bff5d6f2e -r 40ab4480d070 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jul 10 17:51:20 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 11 10:05:30 2017 +0530 @@ -3726,6 +3726,11 @@ } if (cpuMask & X265_CPU_AVX512) { +p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512); +p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512); +p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); +p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); +p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512); p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512); p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); diff -r 7d0bff5d6f2e -r 40ab4480d070 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Mon Jul 10 17:51:20 2017 +0530 +++ b/source/common/x86/sad-a.asm Tue Jul 11 10:05:30 2017 +0530 @@ -6253,4 +6253,45 @@ PIXEL_SAD_W64_AVX512 48 PIXEL_SAD_W64_AVX512 64 +%macro PIXEL_SAD_W32_AVX512 1 +INIT_ZMM avx512 +cglobal pixel_sad_32x%1, 4,7,5 +xorps m0, m0 +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +%rep %1/4 +movu ym1, [r0] ; row 0 of pix0 +movu ym2, [r2] ; row 0 of pix1 +vinserti32x8m1, [r0 + r1], 1 ; row 1 of pix0 +vinserti32x8m2, [r2 + r3], 1 ; row 1 of pix1 +movu ym3, [r0 + 2 * r1] ; row 2 of pix0 +movu ym4, [r2 + 2 * r3] ; row 2 of pix1 +vinserti32x8m3, [r0 + r5], 1 ; row 3 of pix0 +vinserti32x8m4, [r2 + r6], 1 ; row 3 of pix1 + +psadbw m1, m2 +psadbw m3, m4 +paddd m0, m1 +paddd m0, m3 + +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +%endrep + +vextracti32x8 ym1, m0, 1 +paddd ym0, ym1 
+vextracti64x2 xm1, m0, 1 +paddd xm0, xm1 +pshufd xm1, xm0, 2 +paddd xm0, xm1 +movd eax, xm0 +RET +%endmacro + +PIXEL_SAD_W32_AVX512 8 +PIXEL_SAD_W32_AVX512 16 +PIXEL_SAD_W32_AVX512 24 +PIXEL_SAD_W32_AVX512 32 +PIXEL_SAD_W32_AVX512 64 %endif ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 014 of 307] x86: AVX512 scale1D_128to64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499342544 -19800 # Thu Jul 06 17:32:24 2017 +0530 # Node ID 7283818f2dd7191c8258030c7424fa6b4ed5330f # Parent e59a457cfe6c0e2cd4137bf3337a2a2d0a815850 x86: AVX512 scale1D_128to64 AVX2 performance over C code : 14.61x AVX512 performance over C code : 16.19x diff -r e59a457cfe6c -r 7283818f2dd7 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 06 17:00:09 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 06 17:32:24 2017 +0530 @@ -3783,6 +3783,8 @@ p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512); p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_avx512); +p.scale1D_128to64 = PFX(scale1D_128to64_avx512); + } #endif } diff -r e59a457cfe6c -r 7283818f2dd7 source/common/x86/pixel-util.h --- a/source/common/x86/pixel-util.hThu Jul 06 17:00:09 2017 +0530 +++ b/source/common/x86/pixel-util.hThu Jul 06 17:32:24 2017 +0530 @@ -44,6 +44,7 @@ DEFINE_UTILS(ssse3); DEFINE_UTILS(sse4); DEFINE_UTILS(avx2); +DEFINE_UTILS(avx512); #undef DEFINE_UTILS diff -r e59a457cfe6c -r 7283818f2dd7 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Thu Jul 06 17:00:09 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Thu Jul 06 17:32:24 2017 +0530 @@ -4062,6 +4062,39 @@ RET %endif +%if HIGH_BIT_DEPTH == 0 +INIT_ZMM avx512 +cglobal scale1D_128to64, 2, 2, 6 +pxorm4, m4 +vbroadcasti32x8 m5, [pb_1] + +;Top pixel +movum0, [r1] +movum1, [r1 + 64] +movum2, [r1 + 128] +movum3, [r1 + 192] + +pmaddubsw m0, m0, m5 +pavgw m0, m4 +pmaddubsw m1, m1, m5 +pavgw m1, m4 +packuswbm0, m1 +vpermq m0, m0, q3120 +vshufi64x2 m0, m0, q3120 +movu[r0], m0 + +;Left pixel +pmaddubsw m2, m2, m5 +pavgw m2, m4 +pmaddubsw m3, m3, m5 +pavgw m3, m4 +packuswbm2, m3 +vpermq m2, m2, q3120 +vshufi64x2 m2, m2, q3120 +movu[r0 + 64], m2 +RET +%endif + ;- ; void scale2D_64to32(pixel *dst, pixel *src, intptr_t stride) ;- ___ x265-devel mailing list x265-devel@videolan.org 
https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 021 of 307] x86: AVX512 pixel_add_ps_64x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499923196 -19800 # Thu Jul 13 10:49:56 2017 +0530 # Node ID 238c5ee9ad24dc6b283bb399eb013d937bc9ac1e # Parent a32718b2358bab3f19861d8402fe9adc8e312633 x86: AVX512 pixel_add_ps_64x64 AVX2 performance:13.99x AVX512 performance: 21.64x diff -r a32718b2358b -r 238c5ee9ad24 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 11 12:24:29 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 13 10:49:56 2017 +0530 @@ -3805,6 +3805,8 @@ p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); +p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512); + } #endif } diff -r a32718b2358b -r 238c5ee9ad24 source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Tue Jul 11 12:24:29 2017 +0530 +++ b/source/common/x86/pixeladd8.asm Thu Jul 13 10:49:56 2017 +0530 @@ -1145,3 +1145,147 @@ RET %endif + +;- +; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;- +%macro PROCESS_ADD_PS_64x8_AVX512 0 +pmovzxbwm0, [r2] +pmovzxbwm1, [r2 + 32] +movum2, [r3] +movum3, [r3 + 64] +pmovzxbwm4, [r2 + r4] +pmovzxbwm5, [r2 + r4 + 32] +movum6, [r3 + r5] +movum7, [r3 + r5 + 64] + +paddw m0, m2 +paddw m1, m3 +paddw m4, m6 +paddw m5, m7 +packuswbm0, m1 +packuswbm4, m5 +vpermq m0, m0, 11011000b +vpermq m4, m4, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m4, m4, 11011000b +movu[r0], m0 +movu[r0 + r1], m4 + +lea r2, [r2 + r4 * 2] +lea r3, [r3 + r5 * 2] +lea r0, [r0 + r1 * 2] + +pmovzxbwm0, [r2] +pmovzxbwm1, [r2 + 32] +movum2, [r3] +movum3, [r3 + 64] +pmovzxbwm4, [r2 + r4] +pmovzxbwm5, [r2 + r4 + 32] +movum6, [r3 + r5] +movum7, [r3 + r5 + 64] + +paddw m0, m2 +paddw m1, m3 +paddw m4, m6 +paddw m5, m7 +packuswbm0, m1 +packuswbm4, m5 +vpermq m0, m0, 11011000b +vpermq m4, m4, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m4, m4, 11011000b +movu[r0], m0 +movu[r0 + 
r1], m4 + +lea r2, [r2 + r4 * 2] +lea r3, [r3 + r5 * 2] +lea r0, [r0 + r1 * 2] + +pmovzxbwm0, [r2] +pmovzxbwm1, [r2 + 32] +movum2, [r3] +movum3, [r3 + 64] +pmovzxbwm4, [r2 + r4] +pmovzxbwm5, [r2 + r4 + 32] +movum6, [r3 + r5] +movum7, [r3 + r5 + 64] + +paddw m0, m2 +paddw m1, m3 +paddw m4, m6 +paddw m5, m7 +packuswbm0, m1 +packuswbm4, m5 +vpermq m0, m0, 11011000b +vpermq m4, m4, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m4, m4, 11011000b +movu[r0], m0 +movu[r0 + r1], m4 + +lea r2, [r2 + r4 * 2] +lea r3, [r3 + r5 * 2] +lea r0, [r0 + r1 * 2] + +pmovzxbwm0, [r2] +pmovzxbwm1, [r2 + 32] +movum2, [r3] +movum3, [r3 + 64] +pmovzxbwm4, [r2 + r4] +pmovzxbwm5, [r2 + r4 + 32] +movum6, [r3 + r5] +movum7, [r3 + r5 + 64] + +paddw m0, m2 +paddw m1, m3 +paddw m4, m6 +paddw m5, m7 +packuswbm0, m1 +packuswbm4, m5 +vpermq m0, m0, 11011000b +vpermq m4, m4, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m4, m4, 11011000b +movu[r0], m0 +movu[r0 + r1], m4 +%endmacro + +%if ARCH_X86_64 +%if HIGH_BIT_DEPTH==0 +INIT_ZMM avx512 +cglobal pixel_add_ps_64x64, 6, 7, 8 +add r5, r5 +PROCESS_ADD_PS_64x8_AVX512 +lea r2, [r2 + r4 * 2] +lea r3, [r3 + r5 * 2] +lea r0,
[x265] [PATCH 015 of 307] x86: AVX512 addAvg_W64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499410957 -19800 # Fri Jul 07 12:32:37 2017 +0530 # Node ID cc3a93869b28b7d5b3478a2524d07e7e630a0eca # Parent 7283818f2dd7191c8258030c7424fa6b4ed5330f x86: AVX512 addAvg_W64 Size| AVX2 performance | AVX512 performance -- 64x16| 14.46x | 22.25x 64x32| 13.93x | 23.96x 64x48| 13.90x | 24.27x 64x64| 14.74x | 24.31x diff -r 7283818f2dd7 -r cc3a93869b28 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 06 17:32:24 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 07 12:32:37 2017 +0530 @@ -3785,6 +3785,11 @@ p.scale1D_128to64 = PFX(scale1D_128to64_avx512); +p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512); +p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512); +p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); +p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); + } #endif } diff -r 7283818f2dd7 -r cc3a93869b28 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmThu Jul 06 17:32:24 2017 +0530 +++ b/source/common/x86/mc-a.asmFri Jul 07 12:32:37 2017 +0530 @@ -2892,6 +2892,65 @@ ADDAVG_W64_H2_AVX2 48 ADDAVG_W64_H2_AVX2 64 +%macro ADDAVG_W64_H2_AVX512 1 +INIT_ZMM avx512 +cglobal addAvg_64x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride +vbroadcasti32x8 m4, [pw_256] +vbroadcasti32x8 m5, [pw_128] +add r3, r3 +add r4, r4 +mov r6d, %1/16 + +.loop: +%rep 8 +movum0, [r0] +movum1, [r1] +movum2, [r0 + 64] +movum3, [r1 + 64] +paddw m0, m1 +pmulhrswm0, m4 +paddw m0, m5 +paddw m2, m3 +pmulhrswm2, m4 +paddw m2, m5 + +packuswbm0, m2 +vpermq m0, m0, 11011000b +vshufi64x2 m0, m0, 11011000b +movu[r2], m0 + + +movum0, [r0 + r3] +movum1, [r1 + r4] +movum2, [r0 + r3 + 64] +movum3, [r1 + r4 + 64] +paddw m0, m1 +pmulhrswm0, m4 +paddw m0, m5 +paddw m2, m3 +pmulhrswm2, m4 +paddw m2, m5 + +packuswbm0, m2 +vpermq m0, m0, 11011000b +vshufi64x2 m0, m0, 11011000b +movu[r2 + r5], m0 + +lea r2, [r2 + 2 * r5] +lea r0, [r0 + 2 * r3] +lea r1, [r1 + 2 * r4] +%endrep + +dec 
r6d +jnz .loop +RET +%endmacro + +ADDAVG_W64_H2_AVX512 16 +ADDAVG_W64_H2_AVX512 32 +ADDAVG_W64_H2_AVX512 48 +ADDAVG_W64_H2_AVX512 64 + %macro ADDAVG_W48_H2_AVX2 1 INIT_YMM avx2 cglobal addAvg_48x%1, 6,7,6, pSrc0, src0, src1, dst, src0Stride, src1tride, dstStride ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 023 of 307] x86: AVX512 pixel_sub_ps_64x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499853711 -19800 # Wed Jul 12 15:31:51 2017 +0530 # Node ID fda2f079d3358900506a7965569c6a9a39d15eb4 # Parent c1b7926fb590752578aa8cd17f4b86a7f743791b x86: AVX512 pixel_sub_ps_64x64 AVX2 performance : 2.41x AVX512 performance: 4.33x diff -r c1b7926fb590 -r fda2f079d335 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 13 11:09:49 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 12 15:31:51 2017 +0530 @@ -3810,6 +3810,8 @@ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2); +p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512); + } #endif } diff -r c1b7926fb590 -r fda2f079d335 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Thu Jul 13 11:09:49 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Wed Jul 12 15:31:51 2017 +0530 @@ -5782,6 +5782,132 @@ jnz .loop RET %endif + +;- +; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;- +%macro PROCESS_SUB_PS_64x8_AVX512 0 +pmovzxbwm0, [r2] +pmovzxbwm1, [r2 + 32] +pmovzxbwm2, [r3] +pmovzxbwm3, [r3 + 32] +pmovzxbwm4, [r2 + r4] +pmovzxbwm5, [r2 + r4 + 32] +pmovzxbwm6, [r3 + r5] +pmovzxbwm7, [r3 + r5 + 32] + +psubw m0, m2 +psubw m1, m3 +psubw m4, m6 +psubw m5, m7 +movu[r0], m0 +movu[r0 + 64], m1 +movu[r0 + 2 * r1], m4 +movu[r0 + 2 * r1 + 64], m5 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3, [r3 + 2 * r5] + +pmovzxbwm0, [r2] +pmovzxbwm1, [r2 + 32] +pmovzxbwm2, [r3] +pmovzxbwm3, [r3 + 32] +pmovzxbwm4, [r2 + r4] +pmovzxbwm5, [r2 + r4 + 32] +pmovzxbwm6, [r3 + r5] +pmovzxbwm7, [r3 + r5 + 32] + +psubw m0, m2 +psubw m1, m3 +psubw m4, m6 +psubw m5, m7 +movu[r0], m0 +movu[r0 + 64], m1 +movu[r0 + 2 * r1], m4 +movu[r0 + 2 * r1 + 64], m5 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3, [r3 + 2 * 
r5] + +pmovzxbwm0, [r2] +pmovzxbwm1, [r2 + 32] +pmovzxbwm2, [r3] +pmovzxbwm3, [r3 + 32] +pmovzxbwm4, [r2 + r4] +pmovzxbwm5, [r2 + r4 + 32] +pmovzxbwm6, [r3 + r5] +pmovzxbwm7, [r3 + r5 + 32] + +psubw m0, m2 +psubw m1, m3 +psubw m4, m6 +psubw m5, m7 +movu[r0], m0 +movu[r0 + 64], m1 +movu[r0 + 2 * r1], m4 +movu[r0 + 2 * r1 + 64], m5 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3, [r3 + 2 * r5] + +pmovzxbwm0, [r2] +pmovzxbwm1, [r2 + 32] +pmovzxbwm2, [r3] +pmovzxbwm3, [r3 + 32] +pmovzxbwm4, [r2 + r4] +pmovzxbwm5, [r2 + r4 + 32] +pmovzxbwm6, [r3 + r5] +pmovzxbwm7, [r3 + r5 + 32] + +psubw m0, m2 +psubw m1, m3 +psubw m4, m6 +psubw m5, m7 +movu[r0], m0 +movu[r0 + 64], m1 +movu[r0 + 2 * r1], m4 +movu[r0 + 2 * r1 + 64], m5 +%endmacro + +%if HIGH_BIT_DEPTH==0 +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_sub_ps_64x64, 6, 7, 8 +PROCESS_SUB_PS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3, [r3 + 2 * r5] +PROCESS_SUB_PS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3, [r3 + 2 * r5] +PROCESS_SUB_PS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3, [r3 + 2 * r5] +PROCESS_SUB_PS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3, [r3 + 2 * r5] +PROCESS_SUB_PS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3, [r3 + 2 * r5] +PROCESS_SUB_PS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 2 * r4] +lea r3,
[x265] [PATCH 022 of 307] x86: AVX512 pixel_add_ps_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1499924389 -19800 # Thu Jul 13 11:09:49 2017 +0530 # Node ID c1b7926fb590752578aa8cd17f4b86a7f743791b # Parent 238c5ee9ad24dc6b283bb399eb013d937bc9ac1e x86: AVX512 pixel_add_ps_32xN AVX2 performance:14.81x AVX512 performance: 18.01x diff -r 238c5ee9ad24 -r c1b7926fb590 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 13 10:49:56 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 13 11:09:49 2017 +0530 @@ -3806,6 +3806,9 @@ p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512); +p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2); } #endif diff -r 238c5ee9ad24 -r c1b7926fb590 source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Thu Jul 13 10:49:56 2017 +0530 +++ b/source/common/x86/pixeladd8.asm Thu Jul 13 11:09:49 2017 +0530 @@ -768,6 +768,131 @@ PIXEL_ADD_PS_W32_H4_avx2 32 PIXEL_ADD_PS_W32_H4_avx2 64 +;- +; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;- +%macro PROCESS_ADD_PS_32x8_AVX512 0 +pmovzxbwm0, [r2]; row 0 of src0 +movum1, [r3]; row 0 of src1 +pmovzxbwm2, [r2 + r4] ; row 1 of src0 +movum3, [r3 + r5] ; row 1 of src1 +pmovzxbwm4, [r2 + r4 * 2] ; row 2 of src0 +movum5, [r3 + r5 * 2] ; row 2 of src1 +pmovzxbwm6, [r2 + r7] ; row 3 of src0 +movum7, [r3 + r8] ; row 3 of src1 + +paddw m0, m1 +paddw m2, m3 +paddw m4, m5 +paddw m6, m7 +packuswbm0, m2 +packuswbm4, m6 +vpermq m0, m0, 11011000b +vpermq m4, m4, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m4, m4, 11011000b +movu[r0],ym0; row 0 of dst +movu[r0 + r1 * 2], ym4; row 2 of dst +vshufi64x2 m0, m0, 01001110b +vshufi64x2 m4, m4, 01001110b +movu[r0 + r1], ym0; row 1 of dst 
+movu[r0 + r9], ym4; row 3 of dst + +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] + +pmovzxbwm0, [r2]; row 4 of src0 +movum1, [r3]; row 4 of src1 +pmovzxbwm2, [r2 + r4] ; row 5 of src0 +movum3, [r3 + r5] ; row 5 of src1 +pmovzxbwm4, [r2 + r4 * 2] ; row 6 of src0 +movum5, [r3 + r5 * 2] ; row 6 of src1 +pmovzxbwm6, [r2 + r7] ; row 7 of src0 +movum7, [r3 + r8] ; row 7 of src1 + +paddw m0, m1 +paddw m2, m3 +paddw m4, m5 +paddw m6, m7 +packuswbm0, m2 +packuswbm4, m6 +vpermq m0, m0, 11011000b +vpermq m4, m4, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m4, m4, 11011000b +movu[r0],ym0; row 4 of dst +movu[r0 + r1 * 2], ym4; row 6 of dst +vshufi64x2 m0, m0, 01001110b +vshufi64x2 m4, m4, 01001110b +movu[r0 + r1], ym0; row 5 of dst +movu[r0 + r9], ym4; row 7 of dst +%endmacro + + +%if HIGH_BIT_DEPTH==0 +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_add_ps_32x32, 6, 10, 8 +add r5, r5 +lea r7, [r4 * 3] +lea r8, [r5 * 3] +lea r9, [r1 * 3] + +PROCESS_ADD_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_ADD_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_ADD_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_ADD_PS_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal pixel_add_ps_32x64, 6,
[x265] [PATCH 020 of 307] x86: AVX512 pixel_sad_x4_W64
# HG changeset patch # User Vignesh Vijayakumar # Date 1499756069 -19800 # Tue Jul 11 12:24:29 2017 +0530 # Node ID a32718b2358bab3f19861d8402fe9adc8e312633 # Parent 40ab4480d070fca77c35c97c7c229b25d9a98a8a x86: AVX512 pixel_sad_x4_W64 Size| AVX2 performance | AVX512 performance 64x16 | 67.53x | 87.52x 64x32 | 73.27x | 100.10x 64x48 | 76.21x | 100.98x 64x64 | 79.72x | 102.79x diff -r 40ab4480d070 -r a32718b2358b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 11 10:05:30 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 11 12:24:29 2017 +0530 @@ -3736,6 +3736,11 @@ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); +p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512); +p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); +p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); +p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512); + p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512); p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512); p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_avx512); diff -r 40ab4480d070 -r a32718b2358b source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Tue Jul 11 10:05:30 2017 +0530 +++ b/source/common/x86/sad-a.asm Tue Jul 11 12:24:29 2017 +0530 @@ -4128,6 +4128,315 @@ SAD_X4_48x8_AVX2 PIXEL_SAD_X4_END_AVX2 RET + +; +;sad_x4 avx512 code start +; +%macro SAD_X4_64x8_AVX512 0 +movum4, [r0] +movum5, [r1] +movum6, [r2] +movum7, [r3] +movum8, [r4] + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movum4, [r0 + FENC_STRIDE] +movum5, [r1 + r5] +movum6, [r2 + r5] +movum7, [r3 + r5] +movum8, [r4 + r5] + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movum4, [r0 + FENC_STRIDE * 2] +movum5, [r1 + r5 * 2] +movum6, [r2 + r5 * 2] +movum7, [r3 + r5 * 2] +movum8, [r4 + 
r5 * 2] + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movum4, [r0 + FENC_STRIDE * 3] +movum5, [r1 + r7] +movum6, [r2 + r7] +movum7, [r3 + r7] +movum8, [r4 + r7] + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r5 * 4] +lea r2, [r2 + r5 * 4] +lea r3, [r3 + r5 * 4] +lea r4, [r4 + r5 * 4] + +movum4, [r0] +movum5, [r1] +movum6, [r2] +movum7, [r3] +movum8, [r4] + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movum4, [r0 + FENC_STRIDE] +movum5, [r1 + r5] +movum6, [r2 + r5] +movum7, [r3 + r5] +movum8, [r4 + r5] + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movum4, [r0 + FENC_STRIDE * 2] +movum5, [r1 + r5 * 2] +movum6, [r2 + r5 * 2] +movum7, [r3 + r5 * 2] +movum8, [r4 + r5 * 2] + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movum4, [r0 + FENC_STRIDE * 3] +
[x265] [PATCH 024 of 307] x86: AVX512 pixel_sub_ps_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1499854366 -19800 # Wed Jul 12 15:42:46 2017 +0530 # Node ID 77b61125a20591cb5bad2a15a30cb9114a1d8d30 # Parent fda2f079d3358900506a7965569c6a9a39d15eb4 x86: AVX512 pixel_sub_ps_32xN AVX2 performance : 3.35x AVX512 performance: 6.07x diff -r fda2f079d335 -r 77b61125a205 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jul 12 15:31:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 12 15:42:46 2017 +0530 @@ -3811,6 +3811,9 @@ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2); p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512); +p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512); } #endif diff -r fda2f079d335 -r 77b61125a205 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Wed Jul 12 15:31:51 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Wed Jul 12 15:42:46 2017 +0530 @@ -5359,6 +5359,117 @@ %endif ;- +; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;- +%macro PROCESS_SUB_PS_32x8_AVX512 0 +pmovzxbwm0, [r2] +pmovzxbwm1, [r3] +pmovzxbwm2, [r2 + r4] +pmovzxbwm3, [r3 + r5] +pmovzxbwm4, [r2 + 2 * r4] +pmovzxbwm5, [r3 + 2 * r5] +pmovzxbwm6, [r2 + r7] +pmovzxbwm7, [r3 + r8] + +psubw m0, m1 +psubw m2, m3 +psubw m4, m5 +psubw m6, m7 + +movu[r0], m0 +movu[r0 + r1],m2 +movu[r0 + r1 * 2 ], m4 +movu[r0 + r9],m6 + +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] + +pmovzxbwm0, [r2] +pmovzxbwm1, [r3] +pmovzxbwm2, [r2 + r4] +pmovzxbwm3, [r3 + r5] +pmovzxbwm4, [r2 + 2 * r4] +pmovzxbwm5, [r3 + 2 * r5] +pmovzxbwm6, [r2 + r7] +pmovzxbwm7, [r3 + r8] + +psubw m0, m1 +psubw m2, m3 +psubw m4, m5 +psubw m6, m7 + +movu[r0], m0 +movu[r0 + r1],m2 
+movu[r0 + r1 * 2 ], m4 +movu[r0 + r9],m6 +%endmacro + +%if HIGH_BIT_DEPTH==0 +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_sub_ps_32x32, 6, 10, 8 +add r1, r1 +lea r7, [r4 * 3] +lea r8, [r5 * 3] +lea r9, [r1 * 3] + +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal pixel_sub_ps_32x64, 6, 10, 8 +add r1, r1 +lea r7, [r4 * 3] +lea r8, [r5 * 3] +lea r9, [r1 * 3] + +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +lea r0, [r0 + r1 * 4] +PROCESS_SUB_PS_32x8_AVX512 +RET +%endif +%endif + +;- ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcst
[x265] [PATCH 027 of 307] x86: AVX512 pixel_sad_x3_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1500208511 -19800 # Sun Jul 16 18:05:11 2017 +0530 # Node ID 5a2d94db6fcaabf532f00848a72fa337bb5e65ac # Parent 20ca79c2c6a803e2c6caf0c1dc87fb211ea9f708 x86: AVX512 pixel_sad_x3_32xN Size | AVX2 performance | AVX512 performance - 32x8 | 55.55x | 65.60x 32x16 | 54.95x | 67.83x 32x24 | 57.95x | 72.69x 32x32 | 64.35x | 76.33x 32x64 | 65.02x | 82.61x diff -r 20ca79c2c6a8 -r 5a2d94db6fca source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 14 11:49:50 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Sun Jul 16 18:05:11 2017 +0530 @@ -3736,6 +3736,11 @@ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); +p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512); +p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512); +p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); +p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); +p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512); p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); diff -r 20ca79c2c6a8 -r 5a2d94db6fca source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Fri Jul 14 11:49:50 2017 +0530 +++ b/source/common/x86/sad-a.asm Sun Jul 16 18:05:11 2017 +0530 @@ -6235,6 +6235,77 @@ paddd m2, m3 %endmacro +%macro SAD_X3_32x8_AVX512 0 +movuym3, [r0] +vinserti32x8m3, [r0 + FENC_STRIDE], 1 +movuym4, [r1] +vinserti32x8m4, [r1 + r4], 1 +movuym5, [r2] +vinserti32x8m5, [r2 + r4], 1 +movuym6, [r3] +vinserti32x8m6, [r3 + r4], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movuym3, [r0 + FENC_STRIDE * 2] +vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1 +movuym4, [r1 + r4 * 2] +vinserti32x8 m4, [r1 + r6], 1 +movuym5, [r2 + r4 * 2] +vinserti32x8 m5, [r2 + r6], 1 +movuym6, [r3 + r4 * 2] 
+vinserti32x8 m6, [r3 + r6], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +movuym3, [r0] +vinserti32x8m3, [r0 + FENC_STRIDE], 1 +movuym4, [r1] +vinserti32x8m4, [r1 + r4], 1 +movuym5, [r2] +vinserti32x8m5, [r2 + r4], 1 +movuym6, [r3] +vinserti32x8m6, [r3 + r4], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movuym3, [r0 + FENC_STRIDE * 2] +vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1 +movuym4, [r1 + r4 * 2] +vinserti32x8 m4, [r1 + r6], 1 +movuym5, [r2 + r4 * 2] +vinserti32x8 m5, [r2 + r6], 1 +movuym6, [r3 + r4 * 2] +vinserti32x8 m6, [r3 + r6], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 +%endmacro + %macro PIXEL_SAD_X3_END_AVX512 0 vextracti32x8 ym3, m0, 1 vextracti32x8 ym4, m1, 1 @@ -6382,6 +6453,126 @@ SAD_X3_64x8_AVX512 PIXEL_SAD_X3_END_AVX512 RET + +INIT_ZMM avx512 +cglobal pixel_sad_x3_32x8, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +lea r6, [r4 * 3] + +SAD_X3_32x8_AVX512 +PIXEL_SAD_X3_END_AVX512 +RET + +INIT_ZMM avx512 +cglobal pixel_sad_x3_32x16, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +lea r6, [r4 * 3] + +SAD_X3_32x8_AVX512 +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] +SAD_X3_32x8_AVX512 +PIXEL_SAD_X3_END_AVX512 +RET + +INIT_ZMM avx512 +cglobal pixel_sad_x3_32x24, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +lea r6, [r4 * 3] + +SAD_X3_32x8_AVX512 +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r4 * 4] +le
[x265] [PATCH 028 of 307] x86: AVX512 pixel_sad_x3_48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500260234 -19800 # Mon Jul 17 08:27:14 2017 +0530 # Node ID 229c13a0d7e4a1dafad7b0a2e9eef041ecccdb77 # Parent 5a2d94db6fcaabf532f00848a72fa337bb5e65ac x86: AVX512 pixel_sad_x3_48x64 AVX2 performance : 59.91x AVX512 performance: 61.95x diff -r 5a2d94db6fca -r 229c13a0d7e4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sun Jul 16 18:05:11 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jul 17 08:27:14 2017 +0530 @@ -3745,6 +3745,7 @@ p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512); +p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512); p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); diff -r 5a2d94db6fca -r 229c13a0d7e4 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Sun Jul 16 18:05:11 2017 +0530 +++ b/source/common/x86/sad-a.asm Mon Jul 17 08:27:14 2017 +0530 @@ -6306,6 +6306,125 @@ paddd m2, m3 %endmacro +%macro SAD_X3_48x8_AVX512 0 +movuym3, [r0] +vinserti32x8m3, [r0 + FENC_STRIDE], 1 +movuym4, [r1] +vinserti32x8m4, [r1 + r4], 1 +movuym5, [r2] +vinserti32x8m5, [r2 + r4], 1 +movuym6, [r3] +vinserti32x8m6, [r3 + r4], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movuym3, [r0 + FENC_STRIDE * 2] +vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1 +movuym4, [r1 + r4 * 2] +vinserti32x8 m4, [r1 + r6], 1 +movuym5, [r2 + r4 * 2] +vinserti32x8 m5, [r2 + r6], 1 +movuym6, [r3 + r4 * 2] +vinserti32x8 m6, [r3 + r6], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movu xm3, [r0 + 32] +vinserti32x4m3, [r0 + FENC_STRIDE + 32], 1 +vinserti32x4m3, [r0 + 2 * FENC_STRIDE + 32], 2 +vinserti32x4m3, [r0 + 3 * FENC_STRIDE + 32], 3 +movu xm4, [r1 + 32] +vinserti32x4m4, [r1 + r4 + 32], 
1 +vinserti32x4m4, [r1 + 2 * r4 + 32], 2 +vinserti32x4m4, [r1 + r6 + 32], 3 +movu xm5, [r2 + 32] +vinserti32x4m5, [r2 + r4 + 32], 1 +vinserti32x4m5, [r2 + 2 * r4 + 32], 2 +vinserti32x4m5, [r2 + r6 + 32], 3 +movu xm6, [r3 + 32] +vinserti32x4m6, [r3 + r4 + 32], 1 +vinserti32x4m6, [r3 + 2 * r4 + 32], 2 +vinserti32x4m6, [r3 + r6 + 32], 3 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +movuym3, [r0] +vinserti32x8m3, [r0 + FENC_STRIDE], 1 +movuym4, [r1] +vinserti32x8m4, [r1 + r4], 1 +movuym5, [r2] +vinserti32x8m5, [r2 + r4], 1 +movuym6, [r3] +vinserti32x8m6, [r3 + r4], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movuym3, [r0 + FENC_STRIDE * 2] +vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1 +movuym4, [r1 + r4 * 2] +vinserti32x8 m4, [r1 + r6], 1 +movuym5, [r2 + r4 * 2] +vinserti32x8 m5, [r2 + r6], 1 +movuym6, [r3 + r4 * 2] +vinserti32x8 m6, [r3 + r6], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movu xm3, [r0 + 32] +vinserti32x4m3, [r0 + FENC_STRIDE + 32], 1 +vinserti32x4m3, [r0 + 2 * FENC_STRIDE + 32], 2 +vinserti32x4m3, [r0 + 3 * FENC_STRIDE + 32], 3 +movu xm4, [r1 + 32] +vinserti32x4m4, [r1 + r4 + 32], 1 +vinserti32x4m4, [r1 + 2 * r4 + 32], 2 +vinserti32x4m4, [r1 + r6 + 32], 3 +movu xm5, [r2 + 32] +vinserti32x4m5, [r2 + r4 + 32], 1 +vinserti32x4m5, [r2 + 2 * r4 + 32], 2 +vinserti32x4m5, [r2 + r6 + 32], 3 +movu xm6, [r3 + 32] +vinserti32x4m6, [r3 + r4 + 32], 1 +vinserti32x4m6, [r3 + 2 * r4 + 32], 2 +vinserti32x4m6, [r3 + r6 + 32], 3 + +psadbw m7, m
[x265] [PATCH 025 of 307] x86: AVX512 pixel_sad_x4_W32
# HG changeset patch # User Vignesh Vijayakumar # Date 1500011514 -19800 # Fri Jul 14 11:21:54 2017 +0530 # Node ID 3183189cf8a0f1b95c31ecc39dd07b220ec53cea # Parent 77b61125a20591cb5bad2a15a30cb9114a1d8d30 x86: AVX512 pixel_sad_x4_W32 Size | AVX2 performance | AVX512 performance - 32x8 | 46.21x | 57.48x 32x16 | 50.19x | 72.69x 32x24 | 53.83x | 77.17x 32x32 | 56.39x | 82.85x 32x64 | 58.53x | 88.15x diff -r 77b61125a205 -r 3183189cf8a0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jul 12 15:42:46 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 14 11:21:54 2017 +0530 @@ -3736,6 +3736,11 @@ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); +p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); +p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); +p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); +p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); +p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512); p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); diff -r 77b61125a205 -r 3183189cf8a0 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Wed Jul 12 15:42:46 2017 +0530 +++ b/source/common/x86/sad-a.asm Fri Jul 14 11:21:54 2017 +0530 @@ -4260,6 +4260,94 @@ paddd m3, m4 %endmacro +%macro SAD_X4_32x8_AVX512 0 +movuym4, [r0] +vinserti32x8m4, [r0 + FENC_STRIDE], 1 +movuym5, [r1] +vinserti32x8m5, [r1 + r5], 1 +movuym6, [r2] +vinserti32x8m6, [r2 + r5], 1 +movuym7, [r3] +vinserti32x8m7, [r3 + r5], 1 +movuym8, [r4] +vinserti32x8m8, [r4 + r5], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movuym4, [r0 + FENC_STRIDE * 2] +vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1 +movuym5, [r1 + r5 * 2] +vinserti32x8 m5, [r1 + r7], 1 +movuym6, [r2 
+ r5 * 2] +vinserti32x8 m6, [r2 + r7], 1 +movuym7, [r3 + r5 * 2] +vinserti32x8 m7, [r3 + r7], 1 +movuym8, [r4 + r5 * 2] +vinserti32x8 m8, [r4 + r7], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r5 * 4] +lea r2, [r2 + r5 * 4] +lea r3, [r3 + r5 * 4] +lea r4, [r4 + r5 * 4] + +movuym4, [r0] +vinserti32x8m4, [r0 + FENC_STRIDE], 1 +movuym5, [r1] +vinserti32x8m5, [r1 + r5], 1 +movuym6, [r2] +vinserti32x8m6, [r2 + r5], 1 +movuym7, [r3] +vinserti32x8m7, [r3 + r5], 1 +movuym8, [r4] +vinserti32x8m8, [r4 + r5], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movuym4, [r0 + FENC_STRIDE * 2] +vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1 +movuym5, [r1 + r5 * 2] +vinserti32x8 m5, [r1 + r7], 1 +movuym6, [r2 + r5 * 2] +vinserti32x8 m6, [r2 + r7], 1 +movuym7, [r3 + r5 * 2] +vinserti32x8 m7, [r3 + r7], 1 +movuym8, [r4 + r5 * 2] +vinserti32x8 m8, [r4 + r7], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 +%endmacro + %macro PIXEL_SAD_X4_END_AVX512 0 vextracti32x8 ym4, m0, 1 vextracti32x8 ym5, m1, 1 @@ -4434,6 +4522,144 @@ SAD_X4_64x8_AVX512 PIXEL_SAD_X4_END_AVX512 RET + +INIT_ZMM avx512 +cglobal pixel_sad_x4_32x8, 7,8,10 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +pxorm3, m3 +lea r7, [r5 * 3] + +SAD_X4_32x8_AVX512 +PIXEL_SAD_X4_END_AVX512 +RET + +INIT_ZMM avx512 +cglobal pixel_sad_x4_32x16, 7,8,10 +pxorm0, m0 +pxor
[x265] [PATCH 029 of 307] x86: AVX512 pixel_sad_x4_48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500263597 -19800 # Mon Jul 17 09:23:17 2017 +0530 # Node ID 576a93cba7d189fddba3466a21188f0ece3ed278 # Parent 229c13a0d7e4a1dafad7b0a2e9eef041ecccdb77 x86: AVX512 pixel_sad_x4_48x64 AVX2 performance : 59.49x AVX512 performance: 62.29x diff -r 229c13a0d7e4 -r 576a93cba7d1 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jul 17 08:27:14 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jul 17 09:23:17 2017 +0530 @@ -3756,6 +3756,7 @@ p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512); +p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512); p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512); p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512); diff -r 229c13a0d7e4 -r 576a93cba7d1 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Mon Jul 17 08:27:14 2017 +0530 +++ b/source/common/x86/sad-a.asm Mon Jul 17 09:23:17 2017 +0530 @@ -4348,6 +4348,154 @@ paddd m3, m4 %endmacro +%macro SAD_X4_48x8_AVX512 0 +movuym4, [r0] +vinserti32x8m4, [r0 + FENC_STRIDE], 1 +movuym5, [r1] +vinserti32x8m5, [r1 + r5], 1 +movuym6, [r2] +vinserti32x8m6, [r2 + r5], 1 +movuym7, [r3] +vinserti32x8m7, [r3 + r5], 1 +movuym8, [r4] +vinserti32x8m8, [r4 + r5], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movuym4, [r0 + FENC_STRIDE * 2] +vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1 +movuym5, [r1 + r5 * 2] +vinserti32x8 m5, [r1 + r7], 1 +movuym6, [r2 + r5 * 2] +vinserti32x8 m6, [r2 + r7], 1 +movuym7, [r3 + r5 * 2] +vinserti32x8 m7, [r3 + r7], 1 +movuym8, [r4 + r5 * 2] +vinserti32x8 m8, [r4 + r7], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movu xm4, [r0 + 32] +vinserti32x4m4, [r0 + FENC_STRIDE + 32], 1 
+vinserti32x4m4, [r0 + FENC_STRIDE * 2 + 32], 2 +vinserti32x4m4, [r0 + FENC_STRIDE * 3 + 32], 3 +movu xm5, [r1 + 32] +vinserti32x4m5, [r1 + r5 + 32], 1 +vinserti32x4m5, [r1 + r5 * 2 + 32], 2 +vinserti32x4m5, [r1 + r7 + 32], 3 +movu xm6, [r2 + 32] +vinserti32x4m6, [r2 + r5 + 32], 1 +vinserti32x4m6, [r2 + r5 * 2 + 32], 2 +vinserti32x4m6, [r2 + r7 + 32], 3 +movu xm7, [r3 + 32] +vinserti32x4m7, [r3 + r5 + 32], 1 +vinserti32x4m7, [r3 + r5 * 2 + 32], 2 +vinserti32x4m7, [r3 + r7 + 32], 3 +movu xm8, [r4 + 32] +vinserti32x4m8, [r4 + r5 + 32], 1 +vinserti32x4m8, [r4 + r5 * 2 + 32], 2 +vinserti32x4m8, [r4 + r7 + 32], 3 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r5 * 4] +lea r2, [r2 + r5 * 4] +lea r3, [r3 + r5 * 4] +lea r4, [r4 + r5 * 4] + +movuym4, [r0] +vinserti32x8m4, [r0 + FENC_STRIDE], 1 +movuym5, [r1] +vinserti32x8m5, [r1 + r5], 1 +movuym6, [r2] +vinserti32x8m6, [r2 + r5], 1 +movuym7, [r3] +vinserti32x8m7, [r3 + r5], 1 +movuym8, [r4] +vinserti32x8m8, [r4 + r5], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movuym4, [r0 + FENC_STRIDE * 2] +vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1 +movuym5, [r1 + r5 * 2] +vinserti32x8 m5, [r1 + r7], 1 +movuym6, [r2 + r5 * 2] +vinserti32x8 m6, [r2 + r7], 1 +movuym7, [r3 + r5 * 2] +vinserti32x8 m7, [r3 + r7], 1 +movuym8, [r4 + r5 * 2] +vinserti32x8 m8, [r4 + r7], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +
[x265] [PATCH 026 of 307] x86: AVX512 pixel_sad_x3_W64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500013190 -19800 # Fri Jul 14 11:49:50 2017 +0530 # Node ID 20ca79c2c6a803e2c6caf0c1dc87fb211ea9f708 # Parent 3183189cf8a0f1b95c31ecc39dd07b220ec53cea x86: AVX512 pixel_sad_x3_W64 Size | AVX2 performance | AVX512 performance - 64x16 | 64,76x | 95.17x 64x32 | 71.08x | 106.10x 64x48 | 71.45x | 108.12x 64x64 | 75.57x | 110.06x diff -r 3183189cf8a0 -r 20ca79c2c6a8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 14 11:21:54 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 14 11:49:50 2017 +0530 @@ -3736,6 +3736,11 @@ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); +p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512); +p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); +p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); +p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512); + p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); diff -r 3183189cf8a0 -r 20ca79c2c6a8 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Fri Jul 14 11:21:54 2017 +0530 +++ b/source/common/x86/sad-a.asm Fri Jul 14 11:49:50 2017 +0530 @@ -6129,6 +6129,263 @@ RET %endif +; +;sad_x3 avx512 code start +; +%macro SAD_X3_64x8_AVX512 0 +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + 
FENC_STRIDE * 3] +movum4, [r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 3] +movum4, [r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 +%endmacro + +%macro PIXEL_SAD_X3_END_AVX512 0 +vextracti32x8 ym3, m0, 1 +vextracti32x8 ym4, m1, 1 +vextracti32x8 ym5, m2, 1 +paddd ym0, ym3 +paddd ym1, ym4 +paddd ym2, ym5 +vextracti64x2 xm3, m0, 1 +vextracti64x2 xm4, m1, 1 +vextracti64x2 xm5, m2, 1 +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 +pshufd xm3, xm0, 2 +
[x265] [PATCH 030 of 307] x86: AVX512 convert_p2s 64xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1499858302 -19800 # Wed Jul 12 16:48:22 2017 +0530 # Node ID a77082ebfa67b40f3dbb8cd45b54c17e710a104c # Parent 576a93cba7d189fddba3466a21188f0ece3ed278 x86: AVX512 convert_p2s 64xN Size| AVX2 performance | AVX512 performance 64x16 | 2.05x | 3.77x 64x32 | 2.16x | 3.88x 64x48 | 2.13x | 3.91x 64x64 | 2.16x | 4.00x diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jul 17 09:23:17 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 12 16:48:22 2017 +0530 @@ -3832,6 +3832,11 @@ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512); +p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx512); +p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512); +p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512); +p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512); + } #endif } diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Jul 17 09:23:17 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Jul 12 16:48:22 2017 +0530 @@ -2269,6 +2269,186 @@ P2S_H_64xN_avx2 48 ;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- +%macro PROCESS_P2S_64x8_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + mmsize/2] +pmovzxbwm2, [r0 + r1] +pmovzxbwm3, [r0 + r1 + mmsize/2] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2], m0 +movu[r2 + mmsize], m1 +movu[r2 + r3], m2 +movu[r2 + r3 + mmsize], m3 + +pmovzxbwm0, [r0 + r1 * 2] +pmovzxbwm1, [r0 + r1 * 2 + mmsize/2] +pmovzxbwm2, [r0 + r5] +pmovzxbwm3, [r0 + r5 + mmsize/2] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2 
+ r3 * 2], m0 +movu[r2 + r3 * 2 + mmsize], m1 +movu[r2 + r6], m2 +movu[r2 + r6 + mmsize], m3 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] + +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + mmsize/2] +pmovzxbwm2, [r0 + r1] +pmovzxbwm3, [r0 + r1 + mmsize/2] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2], m0 +movu[r2 + mmsize], m1 +movu[r2 + r3], m2 +movu[r2 + r3 + mmsize], m3 + +pmovzxbwm0, [r0 + r1 * 2] +pmovzxbwm1, [r0 + r1 * 2 + mmsize/2] +pmovzxbwm2, [r0 + r5] +pmovzxbwm3, [r0 + r5 + mmsize/2] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2 + r3 * 2], m0 +movu[r2 + r3 * 2 + mmsize], m1 +movu[r2 + r6], m2 +movu[r2 + r6 + mmsize], m3 +%endmacro + +INIT_ZMM avx512 +cglobal filterPixelToShort_64x64, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m4, [pw_2000] + +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_64x48, 3, 7, 9 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd
[x265] [PATCH 031 of 307] x86: AVX512 convert_p2s_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1500445753 -19800 # Wed Jul 19 11:59:13 2017 +0530 # Node ID 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1 # Parent a77082ebfa67b40f3dbb8cd45b54c17e710a104c x86: AVX512 convert_p2s_32xN Size| AVX2 performance | AVX512 performance 32x8| 1.51x | 1.54x 32x16 | 2.18x | 3.62x 32x24 | 2.26x | 3.58x 32x32 | 2.28x | 3.94x 32x64 | 2.20x | 4.06x diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jul 12 16:48:22 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 19 11:59:13 2017 +0530 @@ -3836,6 +3836,19 @@ p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512); p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512); p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512); +p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2); +p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512); +p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512); +p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512); +p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); } #endif diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/ipfilter8.asm --- 
a/source/common/x86/ipfilter8.asm Wed Jul 12 16:48:22 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Jul 19 11:59:13 2017 +0530 @@ -1956,6 +1956,184 @@ ;- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;- +%macro PROCESS_P2S_32x8_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + r1 * 2] +pmovzxbwm3, [r0 + r5] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 + +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r6], m3 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] + +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + r1 * 2] +pmovzxbwm3, [r0 + r5] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 + +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r6], m3 +%endmacro + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x8, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m4, [pw_2000] + +PROCESS_P2S_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x16, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m4, [pw_2000] + +PROCESS_P2S_32x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x24, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m4, [pw_2000] + +PROCESS_P2S_32x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_32x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x32, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1
[x265] [PATCH 033 of 307] x86: AVX512 fix convert_p2s_64xN, 48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500536572 -19800 # Thu Jul 20 13:12:52 2017 +0530 # Node ID bf9a9cd255216300408506d10d4ff8bc87a15845 # Parent 97d5ab44b6da2db69584875c2dde97aef5533d9b x86: AVX512 fix convert_p2s_64xN,48x64 diff -r 97d5ab44b6da -r bf9a9cd25521 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed Jul 19 12:25:43 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Jul 20 13:12:52 2017 +0530 @@ -1953,9 +1953,6 @@ P2S_H_32xN_avx2 64 P2S_H_32xN_avx2 48 -;- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;- %macro PROCESS_P2S_32x8_AVX512 0 pmovzxbwm0, [r0] pmovzxbwm1, [r0 + r1] @@ -1999,6 +1996,9 @@ movu[r2 + r6], m3 %endmacro +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- INIT_ZMM avx512 cglobal filterPixelToShort_32x8, 3, 7, 5 mov r3d, r3m @@ -2446,9 +2446,6 @@ P2S_H_64xN_avx2 32 P2S_H_64xN_avx2 48 -;- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;- %macro PROCESS_P2S_64x8_AVX512 0 pmovzxbwm0, [r0] pmovzxbwm1, [r0 + mmsize/2] @@ -2526,6 +2523,9 @@ movu[r2 + r6 + mmsize], m3 %endmacro +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- INIT_ZMM avx512 cglobal filterPixelToShort_64x64, 3, 7, 5 mov r3d, r3m @@ -2561,14 +2561,14 @@ RET INIT_ZMM avx512 -cglobal filterPixelToShort_64x48, 3, 7, 9 +cglobal filterPixelToShort_64x48, 3, 7, 5 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load constant -vpbroadcastd m8, [pw_2000] +vpbroadcastd m4, [pw_2000] PROCESS_P2S_64x8_AVX512 lea r0, [r0 + r1 * 4] @@ -2589,14 +2589,14 @@ RET INIT_ZMM avx512 -cglobal filterPixelToShort_64x32, 3, 7, 9 +cglobal filterPixelToShort_64x32, 3, 7, 5 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load constant -vpbroadcastd m8, [pw_2000] +vpbroadcastd m4, [pw_2000] PROCESS_P2S_64x8_AVX512 lea r0, [r0 + r1 * 4] @@ 
-2611,14 +2611,14 @@ RET INIT_ZMM avx512 -cglobal filterPixelToShort_64x16, 3, 7, 9 +cglobal filterPixelToShort_64x16, 3, 7, 5 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load constant -vpbroadcastd m8, [pw_2000] +vpbroadcastd m4, [pw_2000] PROCESS_P2S_64x8_AVX512 lea r0, [r0 + r1 * 4] @@ -3047,9 +3047,6 @@ jnz.loop RET -;- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;- %macro PROCESS_P2S_48x8_AVX512 0 pmovzxbwm0, [r0] pmovzxbwm1, [r0 + r1] @@ -3123,6 +3120,9 @@ movu[r2 + r6 + 64], ym3 %endmacro +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- INIT_ZMM avx512 cglobal filterPixelToShort_48x64, 3,7,5 mov r3d, r3m @@ -3131,7 +3131,7 @@ lea r6, [r3 * 3] ; load constant -vpbroadcastd m8, [pw_2000] +vpbroadcastd m4, [pw_2000] PROCESS_P2S_48x8_AVX512 lea r0, [r0 + r1 * 4] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 034 of 307] x86: AVX512 ssd_ss_64x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500528397 -19800 # Thu Jul 20 10:56:37 2017 +0530 # Node ID 0320e60b3323546eb6767508f1c39cd088e9f03e # Parent bf9a9cd255216300408506d10d4ff8bc87a15845 x86: AVX512 ssd_ss_64x64 AVX2 performance : 14.85x AVX512 performance : 21.35x diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 20 13:12:52 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 20 10:56:37 2017 +0530 @@ -3851,6 +3851,8 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); + } #endif } diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Thu Jul 20 13:12:52 2017 +0530 +++ b/source/common/x86/ssd-a.asm Thu Jul 20 10:56:37 2017 +0530 @@ -1377,7 +1377,124 @@ HADDD m2, m0 movdeax, xm2 RET +;- +; ssd_ss avx512 code start +;- +%macro PROCESS_SSD_SS_64x8_AVX512 0 +movum0, [r0] +movum1, [r0 + mmsize] +movum2, [r0 + r1] +movum3, [r0 + r1 + mmsize] +psubw m0, [r2] +psubw m1, [r2 + mmsize] +psubw m2, [r2 + r3] +psubw m3, [r2 + r3 + mmsize] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 + +movum0, [r0 + 2 * r1] +movum1, [r0 + 2 * r1 + mmsize] +movum2, [r0 + r5] +movum3, [r0 + r5 + mmsize] + +psubw m0, [r2 + 2 * r3] +psubw m1, [r2 + 2 * r3 + mmsize] +psubw m2, [r2 + r6] +psubw m3, [r2 + r6 + mmsize] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movum0, [r0] +movum1, [r0 + mmsize] +movum2, [r0 + r1] +movum3, [r0 + r1 + mmsize] + +psubw m0, [r2] +psubw m1, [r2 + mmsize] +psubw m2, [r2 + r3] +psubw m3, [r2 + r3 + mmsize] +pmaddwd m0, m0 
+pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 + +movum0, [r0 + 2 * r1] +movum1, [r0 + 2 * r1 + mmsize] +movum2, [r0 + r5] +movum3, [r0 + r5 + mmsize] + +psubw m0, [r2 + 2 * r3] +psubw m1, [r2 + 2 * r3 + mmsize] +psubw m2, [r2 + r6] +psubw m3, [r2 + r6 + mmsize] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 +%endmacro + +INIT_ZMM avx512 +cglobal pixel_ssd_ss_64x64, 4,7,6 +add r1d, r1d +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] +pxorm4, m4 +pxorm5, m5 + +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +paddd m4, m5 +HADDD m4, m0 +movdeax, xm4 +RET +;- +; ssd_ss avx512 code end +;- %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 032 of 307] x86: AVX512 convert_p2s 48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500447343 -19800 # Wed Jul 19 12:25:43 2017 +0530 # Node ID 97d5ab44b6da2db69584875c2dde97aef5533d9b # Parent 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1 x86: AVX512 convert_p2s 48x64 AVX2 performance : 2.22x AVX512 performance: 3.01x diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jul 19 11:59:13 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 19 12:25:43 2017 +0530 @@ -3841,6 +3841,7 @@ p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512); p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512); p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512); +p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed Jul 19 11:59:13 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Jul 19 12:25:43 2017 +0530 @@ -3047,6 +3047,115 @@ jnz.loop RET +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- +%macro PROCESS_P2S_48x8_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + r1 * 2] +pmovzxbwm3, [r0 + r5] +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r6], m3 + +pmovzxbwym0, [r0 + 32] +pmovzxbwym1, [r0 + r1 + 32] +pmovzxbwym2, [r0 + r1 * 2 + 32] +pmovzxbwym3, [r0 + r5 + 32] +psllw ym0, 6 +psllw ym1, 6 +psllw ym2, 6 +psllw ym3, 6 +psubw ym0, ym4 +psubw ym1, ym4 +psubw ym2, ym4 +psubw ym3, ym4 +movu[r2 + 64], ym0 +movu[r2 + r3 + 
64], ym1 +movu[r2 + r3 * 2 + 64], ym2 +movu[r2 + r6 + 64], ym3 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] + +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + r1 * 2] +pmovzxbwm3, [r0 + r5] +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r6], m3 + +pmovzxbwym0, [r0 + 32] +pmovzxbwym1, [r0 + r1 + 32] +pmovzxbwym2, [r0 + r1 * 2 + 32] +pmovzxbwym3, [r0 + r5 + 32] +psllw ym0, 6 +psllw ym1, 6 +psllw ym2, 6 +psllw ym3, 6 +psubw ym0, ym4 +psubw ym1, ym4 +psubw ym2, ym4 +psubw ym3, ym4 +movu[r2 + 64], ym0 +movu[r2 + r3 + 64], ym1 +movu[r2 + r3 * 2 + 64], ym2 +movu[r2 + r6 + 64], ym3 +%endmacro + +INIT_ZMM avx512 +cglobal filterPixelToShort_48x64, 3,7,5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m8, [pw_2000] + +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +RET %macro PROCESS_LUMA_W4_4R 0 movdm0, [r0] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 036 of 307] x86: AVX512 blockcopy_ss_64x64
# HG changeset patch # User Jayashri Murugan # Date 1499162011 -19800 # Tue Jul 04 15:23:31 2017 +0530 # Node ID 3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392 # Parent 2eda6628c75302a10d59918a58740d6e27434293 x86: AVX512 blockcopy_ss_64x64 AVX2 performance over C code : 1.32x AVX512 performance over C code : 3.00x diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 20 16:59:52 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 04 15:23:31 2017 +0530 @@ -3854,6 +3854,8 @@ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); +p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); + } #endif } diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Thu Jul 20 16:59:52 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Jul 04 15:23:31 2017 +0530 @@ -4462,6 +4462,154 @@ BLOCKCOPY_SS_W64_H4_avx 64, 48 BLOCKCOPY_SS_W64_H4_avx 64, 64 +%macro PROCESS_BLOCKCOPY_SS_W64_H8_avx512 0 +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r2 + r3] +movum3, [r2 + r3 + mmsize] + +movu[r0], m0 +movu[r0 + mmsize], m1 +movu[r0 + r1], m2 +movu[r0 + r1 + mmsize], m3 + +movum0, [r2 + 2 * r3] +movum1, [r2 + 2 * r3 + mmsize] +movum2, [r2 + r6] +movum3, [r2 + r6 + mmsize] +lea r2, [r2 + 4 * r3] + +movu[r0 + 2 * r1], m0 +movu[r0 + 2 * r1 + mmsize], m1 +movu[r0 + r5], m2 +movu[r0 + r5 + mmsize], m3 +lea r0, [r0 + 4 * r1] + +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r2 + r3] +movum3, [r2 + r3 + mmsize] + +movu[r0], m0 +movu[r0 + mmsize], m1 +movu[r0 + r1], m2 +movu[r0 + r1 + mmsize], m3 + +movum0, [r2 + 2 * r3] +movum1, [r2 + 2 * r3 + mmsize] +movum2, [r2 + r6] +movum3, [r2 + r6 + mmsize] +lea r2, [r2 + 4 * r3] + +movu[r0 + 2 * r1], m0 +movu[r0 + 2 * r1 + mmsize], m1 +movu[r0 + r5], m2 +movu[r0 + r5 + mmsize], m3 +lea r0, [r0 + 4 * r1] +%endmacro + +%macro 
PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 0 +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r2 + r3] +movum3, [r2 + r3 + mmsize] + +movu[r0], m0 +movu[r0 + mmsize], m1 +movu[r0 + r1], m2 +movu[r0 + r1 + mmsize], m3 + +movum0, [r2 + 2 * r3] +movum1, [r2 + 2 * r3 + mmsize] +movum2, [r2 + r6] +movum3, [r2 + r6 + mmsize] +lea r2, [r2 + 4 * r3] + +movu[r0 + 2 * r1], m0 +movu[r0 + 2 * r1 + mmsize], m1 +movu[r0 + r5], m2 +movu[r0 + r5 + mmsize], m3 +lea r0, [r0 + 4 * r1] + +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r2 + r3] +movum3, [r2 + r3 + mmsize] + +movu[r0], m0 +movu[r0 + mmsize], m1 +movu[r0 + r1], m2 +movu[r0 + r1 + mmsize], m3 + +movum0, [r2 + 2 * r3] +movum1, [r2 + 2 * r3 + mmsize] +movum2, [r2 + r6] +movum3, [r2 + r6 + mmsize] + +movu[r0 + 2 * r1], m0 +movu[r0 + 2 * r1 + mmsize], m1 +movu[r0 + r5], m2 +movu[r0 + r5 + mmsize], m3 +%endmacro + +;- +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) +;- +INIT_ZMM avx512 +cglobal blockcopy_ss_64x16, 4, 7, 4 +add r1, r1 +add r3, r3 +lea r5, [3 * r1] +lea r6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_64x32, 4, 7, 4 +add r1, r1 +add r3, r3 +lea r5, [3 * r1] +lea r6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_64x48, 4, 7, 4 +add r1, r1 +add r3, r3 +lea r5, [3 * r1] +lea r6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_64x64, 4, 7, 4 +add r1, r1 +
[x265] [PATCH 037 of 307] x86: AVX512 blockcopy_ss_32xN
# HG changeset patch # User Jayashri Murugan # Date 1499171579 -19800 # Tue Jul 04 18:02:59 2017 +0530 # Node ID ef8989f43083cd5195ff3ba360959fe3900399e5 # Parent 3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392 x86: AVX512 blockcopy_ss_32xN AVX2 performance over C code : 1.82x AVX512 performance over C code : 4.56x diff -r 3e3a44c6d77c -r ef8989f43083 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 04 15:23:31 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 04 18:02:59 2017 +0530 @@ -3854,6 +3854,9 @@ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); +p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512); p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); } diff -r 3e3a44c6d77c -r ef8989f43083 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Jul 04 15:23:31 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Jul 04 18:02:59 2017 +0530 @@ -4164,6 +4164,143 @@ BLOCKCOPY_SS_W32_H4_avx 32, 48 BLOCKCOPY_SS_W32_H4_avx 32, 64 +%macro PROCESS_BLOCKCOPY_SS_W32_H8_avx512 0 +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r6] +lea r2, [r2 + 4 * r3] + +movu[r0], m0 +movu[r0 + r1], m1 +movu[r0 + 2 * r1], m2 +movu[r0 + r5], m3 +lea r0, [r0 + 4 * r1] + +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r6] +lea r2, [r2 + 4 * r3] + +movu[r0], m0 +movu[r0 + r1], m1 +movu[r0 + 2 * r1], m2 +movu[r0 + r5], m3 +lea r0, [r0 + 4 * r1] +%endmacro + +%macro PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 0 +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r6] +lea r2, [r2 + 4 * r3] + +movu[r0], m0 +movu[r0 + r1], m1 +movu[r0 + 2 * r1], m2 +movu[r0 + r5], m3 +lea r0, [r0 + 4 * 
r1] + +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r6] + +movu[r0], m0 +movu[r0 + r1], m1 +movu[r0 + 2 * r1], m2 +movu[r0 + r5], m3 +%endmacro + +;- +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) +;- +INIT_ZMM avx512 +cglobal blockcopy_ss_32x8, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x16, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x24, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x32, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x48, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x64, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + ;- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t sr
[x265] [PATCH 035 of 307] x86: AVX512 ssd_ss_32x32
# HG changeset patch # User Vignesh Vijayakumar # Date 1500550192 -19800 # Thu Jul 20 16:59:52 2017 +0530 # Node ID 2eda6628c75302a10d59918a58740d6e27434293 # Parent 0320e60b3323546eb6767508f1c39cd088e9f03e x86: AVX512 ssd_ss_32x32 AVX2 performance : 12.73x AVX512 performance : 19.72x diff -r 0320e60b3323 -r 2eda6628c753 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 20 10:56:37 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 20 16:59:52 2017 +0530 @@ -3852,6 +3852,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); +p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); } #endif diff -r 0320e60b3323 -r 2eda6628c753 source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Thu Jul 20 10:56:37 2017 +0530 +++ b/source/common/x86/ssd-a.asm Thu Jul 20 16:59:52 2017 +0530 @@ -1457,6 +1457,47 @@ paddd m5, m3 %endmacro +%macro PROCESS_SSD_SS_32x8_AVX512 0 +movum0, [r0] +movum1, [r0 + r1] +movum2, [r0 + 2 * r1] +movum3, [r0 + r5] + +psubw m0, [r2] +psubw m1, [r2 + r3] +psubw m2, [r2 + 2 * r3] +psubw m3, [r2 + r6] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movum0, [r0] +movum1, [r0 + r1] +movum2, [r0 + 2 * r1] +movum3, [r0 + r5] + +psubw m0, [r2] +psubw m1, [r2 + r3] +psubw m2, [r2 + 2 * r3] +psubw m3, [r2 + r6] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 +%endmacro + INIT_ZMM avx512 cglobal pixel_ssd_ss_64x64, 4,7,6 add r1d, r1d @@ -1492,6 +1533,30 @@ HADDD m4, m0 movdeax, xm4 RET + +INIT_ZMM avx512 +cglobal pixel_ssd_ss_32x32, 4,7,6 +add r1d, r1d +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] +pxorm4, m4 +pxorm5, m5 + +PROCESS_SSD_SS_32x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, 
[r2 + 4 * r3] +PROCESS_SSD_SS_32x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_32x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_32x8_AVX512 +paddd m4, m5 +HADDD m4, m0 +movdeax, xm4 +RET ;- ; ssd_ss avx512 code end ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 038 of 307] x86: AVX512 getResidual32
# HG changeset patch # User Jayashri Murugan # Date 1500627732 -19800 # Fri Jul 21 14:32:12 2017 +0530 # Node ID 49123506b563fd44378e856e6833c77812d0349e # Parent ef8989f43083cd5195ff3ba360959fe3900399e5 x86: AVX512 getResidual32 BIT_DEPTH = 8 AVX2 performance over C code : 2.99x AVX512 performance over C code : 5.46x HIGH_BIT_DEPTH AVX2 performance over C code : 3.10x AVX512 performance over C code : 5.60x diff -r ef8989f43083 -r 49123506b563 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 04 18:02:59 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 21 14:32:12 2017 +0530 @@ -3723,6 +3723,7 @@ p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2); p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2); +p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); } if (cpuMask & X265_CPU_AVX512) { @@ -3859,6 +3860,8 @@ p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512); p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); +p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); + } #endif } diff -r ef8989f43083 -r 49123506b563 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Tue Jul 04 18:02:59 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Fri Jul 21 14:32:12 2017 +0530 @@ -554,6 +554,135 @@ %endrep RET %endif + +%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0 +movum0, [r0] +movum1, [r0 + r3] +movum2, [r0 + r3 * 2] +movum3, [r0 + r4] +lea r0, [r0 + r3 * 4] + +movum4, [r1] +movum5, [r1 + r3] +movum6, [r1 + r3 * 2] +movum7, [r1 + r4] +lea r1, [r1 + r3 * 4] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 + +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r4], m3 +lea r2, [r2 + r3 * 4] +%endmacro + +%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0 +movum0, [r0] +movum1, [r0 + r3] +movum2, [r0 + r3 * 2] +movum3, [r0 + r4] + +movum4, [r1] +movum5, [r1 + r3] +movum6, [r1 + r3 * 2] +movum7, [r1 + r4] + +psubw m0, m4 +psubw 
m1, m5 +psubw m2, m6 +psubw m3, m7 + +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r4], m3 +%endmacro + +%macro PROCESS_GETRESIDUAL32_W4_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r3] +pmovzxbwm2, [r0 + r3 * 2] +pmovzxbwm3, [r0 + r4] +lea r0, [r0 + r3 * 4] + +pmovzxbwm4, [r1] +pmovzxbwm5, [r1 + r3] +pmovzxbwm6, [r1 + r3 * 2] +pmovzxbwm7, [r1 + r4] +lea r1, [r1 + r3 * 4] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 + +movu[r2], m0 +movu[r2 + r3 * 2], m1 +lea r2, [r2 + r3 * 4] +movu[r2], m2 +movu[r2 + r3 * 2], m3 +lea r2, [r2 + r3 * 4] +%endmacro + +%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r3] +pmovzxbwm2, [r0 + r3 * 2] +pmovzxbwm3, [r0 + r4] + +pmovzxbwm4, [r1] +pmovzxbwm5, [r1 + r3] +pmovzxbwm6, [r1 + r3 * 2] +pmovzxbwm7, [r1 + r4] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 + +movu[r2], m0 +movu[r2 + r3 * 2], m1 +lea r2, [r2 + r3 * 4] +movu[r2], m2 +movu[r2 + r3 * 2], m3 +%endmacro + + +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +cglobal getResidual32, 4,5,8 +add r3, r3 +lea r4, [r3 * 3] + +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END +RET +%else +INIT_ZMM avx512 +cglobal getResidual32, 4,5,8 +lea r4, [r3 * 3] + +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512_END +RET +%endif + ;- ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff); ;---
[x265] [PATCH 041 of 307] x86: AVX512 sub_ps_32xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1500888338 -19800 # Mon Jul 24 14:55:38 2017 +0530 # Node ID 156acfb1bbb3cee56ed7b3337850a1fc9e4429ee # Parent 9a4caf163d0fbdbc51c9f681ed898a39a5602bcf x86: AVX512 sub_ps_32xN for high bit depth AVX2 performance : 23.62x AVX512 performance : 35.86x diff -r 9a4caf163d0f -r 156acfb1bbb3 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jul 24 14:35:52 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jul 24 14:55:38 2017 +0530 @@ -2192,6 +2192,9 @@ { p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512); +p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512); } } #else // if HIGH_BIT_DEPTH diff -r 9a4caf163d0f -r 156acfb1bbb3 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Mon Jul 24 14:35:52 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Mon Jul 24 14:55:38 2017 +0530 @@ -5487,9 +5487,6 @@ PIXELSUB_PS_W32_H8_avx2 32, 64 %endif -;- -; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); -;- %macro PROCESS_SUB_PS_32x8_AVX512 0 pmovzxbwm0, [r2] pmovzxbwm1, [r3] @@ -5534,7 +5531,119 @@ movu[r0 + r9],m6 %endmacro -%if HIGH_BIT_DEPTH==0 +%macro PROCESS_SUB_PS_32x8_HBD_AVX512 0 +movum0, [r2] +movum1, [r3] +movum2, [r2 + r4] +movum3, [r3 + r5] +psubw m0, m1 +psubw m2, m3 + +movu[r0], m0 +movu[r0 + r1],m2 + +movum0, [r2 + r4 * 2] +movum1, [r3 + r5 * 2] +movum2, [r2 + r7] +movum3, [r3 + r8] +psubw m0, m1 +psubw m2, m3 + +movu[r0 + r1 * 2],m0 +movu[r0 + r6],m2 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] + +movum0, [r2] +movum1, [r3] +movum2, [r2 + r4] +movum3, [r3 + r5] +psubw m0, m1 +psubw m2, m3 + +movu[r0], m0 +movu[r0 + 
r1],m2 + +movum0, [r2 + r4 * 2] +movum1, [r3 + r5 * 2] +movum2, [r2 + r7] +movum3, [r3 + r8] +psubw m0, m1 +psubw m2, m3 + +movu[r0 + r1 * 2],m0 +movu[r0 + r6],m2 +%endmacro + +;- +; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;- +%if HIGH_BIT_DEPTH +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_sub_ps_32x32, 6, 9, 4 +add r1d,r1d +add r4d,r4d +add r5d,r5d +lea r6, [r1 * 3] +lea r7, [r4 * 3] +lea r8, [r5 * 3] +PROCESS_SUB_PS_32x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_PS_32x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_PS_32x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_PS_32x8_HBD_AVX512 +RET + +cglobal pixel_sub_ps_32x64, 6, 9, 4 +add r1d,r1d +add r4d,r4d +add r5d,r5d +lea r6, [r1 * 3] +lea r7, [r4 * 3] +lea r8, [r5 * 3] +PROCESS_SUB_PS_32x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_PS_32x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_PS_32x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_PS_32x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_
[x265] [PATCH 039 of 307] x86: AVX512 sad_w32 and sad_w64 cleanup
# HG changeset patch # User Vignesh Vijayakumar # Date 1500621502 -19800 # Fri Jul 21 12:48:22 2017 +0530 # Node ID be860e68659a37dae543956a65a4eb167f8b5504 # Parent 49123506b563fd44378e856e6833c77812d0349e x86: AVX512 sad_w32 and sad_w64 cleanup diff -r 49123506b563 -r be860e68659a source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Fri Jul 21 14:32:12 2017 +0530 +++ b/source/common/x86/sad-a.asm Fri Jul 21 12:48:22 2017 +0530 @@ -7565,29 +7565,79 @@ movdeax, xm0 RET -;- -; int pixel_sad_64x%1( uint8_t *, intptr_t, uint8_t *, intptr_t ) -;- -%macro PIXEL_SAD_W64_AVX512 1 -INIT_ZMM avx512 -cglobal pixel_sad_64x%1, 4,5,6 -xorps m0, m0 -xorps m5, m5 - -%rep %1/2 -movu m1, [r0] ; first 64 of row 0 of pix0 -movu m2, [r2] ; first 64 of row 0 of pix1 -movu m3, [r0 + r1] ; first 64 of row 1 of pix0 -movu m4, [r2 + r3] ; first 64 of row 1 of pix1 +%macro PROCESS_SAD_64x8_AVX512 0 +movu m1, [r0] +movu m2, [r2] +movu m3, [r0 + r1] +movu m4, [r2 + r3] psadbw m1, m2 psadbw m3, m4 paddd m0, m1 paddd m5, m3 -lear2, [r2 + 2 * r3] -lear0, [r0 + 2 * r1] -%endrep - -paddd m0, m5 +movu m1, [r0 + 2 * r1] +movu m2, [r2 + 2 * r3] +movu m3, [r0 + r5] +movu m4, [r2 + r6] +psadbw m1, m2 +psadbw m3, m4 +paddd m0, m1 +paddd m5, m3 + +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] + +movu m1, [r0] +movu m2, [r2] +movu m3, [r0 + r1] +movu m4, [r2 + r3] +psadbw m1, m2 +psadbw m3, m4 +paddd m0, m1 +paddd m5, m3 +movu m1, [r0 + 2 * r1] +movu m2, [r2 + 2 * r3] +movu m3, [r0 + r5] +movu m4, [r2 + r6] +psadbw m1, m2 +psadbw m3, m4 +paddd m0, m1 +paddd m5, m3 +%endmacro + +%macro PROCESS_SAD_32x8_AVX512 0 +movu ym1, [r0] +movu ym2, [r2] +vinserti32x8m1, [r0 + r1], 1 +vinserti32x8m2, [r2 + r3], 1 +movu ym3, [r0 + 2 * r1] +movu ym4, [r2 + 2 * r3] +vinserti32x8m3, [r0 + r5], 1 +vinserti32x8m4, [r2 + r6], 1 + +psadbw m1, m2 +psadbw m3, m4 +paddd m0, m1 +paddd m0, m3 + +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] + +movu ym1, [r0] +movu ym2, [r2] +vinserti32x8m1, [r0 + r1], 1 +vinserti32x8m2, [r2 + 
r3], 1 +movu ym3, [r0 + 2 * r1] +movu ym4, [r2 + 2 * r3] +vinserti32x8m3, [r0 + r5], 1 +vinserti32x8m4, [r2 + r6], 1 + +psadbw m1, m2 +psadbw m3, m4 +paddd m0, m1 +paddd m0, m3 +%endmacro + +%macro PROCESS_SAD_AVX512_END 0 vextracti32x8 ym1, m0, 1 paddd ym0, ym1 vextracti64x2 xm1, m0, 1 @@ -7595,53 +7645,195 @@ pshufd xm1, xm0, 2 paddd xm0, xm1 movd eax, xm0 +%endmacro +;- +; int pixel_sad_64x%1( uint8_t *, intptr_t, uint8_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_sad_64x16, 4,5,6 +xorps m0, m0 +xorps m5, m5 +lea r5, [3 * r1] +lea r6, [3 * r3] + +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +paddd m0, m5 +PROCESS_SAD_AVX512_END RET -%endmacro - -PIXEL_SAD_W64_AVX512 16 -PIXEL_SAD_W64_AVX512 32 -PIXEL_SAD_W64_AVX512 48 -PIXEL_SAD_W64_AVX512 64 - -%macro PIXEL_SAD_W32_AVX512 1 + INIT_ZMM avx512 -cglobal pixel_sad_32x%1, 4,7,5 +cglobal pixel_sad_64x32, 4,5,6 +xorps m0, m0 +xorps m5, m5 +lea r5, [3 * r1] +lea r6, [3 * r3] + +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +paddd m0, m5 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_64x48, 4,5,
[x265] [PATCH 042 of 307] x86:AVX512 ssd_s_32
# HG changeset patch # User Vignesh Vijayakumar # Date 1500629149 -19800 # Fri Jul 21 14:55:49 2017 +0530 # Node ID 6b3b8ef0f37e0f7860f4f43c99e581674b19f9e3 # Parent 156acfb1bbb3cee56ed7b3337850a1fc9e4429ee x86:AVX512 ssd_s_32 AVX2 performance : 7.37x AVX512 performance : 13.06x diff -r 156acfb1bbb3 -r 6b3b8ef0f37e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jul 24 14:55:38 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 21 14:55:49 2017 +0530 @@ -3858,6 +3858,7 @@ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); +p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); diff -r 156acfb1bbb3 -r 6b3b8ef0f37e source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Mon Jul 24 14:55:38 2017 +0530 +++ b/source/common/x86/ssd-a.asm Fri Jul 21 14:55:49 2017 +0530 @@ -3389,3 +3389,65 @@ movdeax, xm0 %endif RET + +;- +; ssd_s avx512 code start +;- +%macro PROCESS_SSD_S_32x8_AVX512 0 +movum1, [r0] +movum2, [r0 + r1] +movum3, [r0 + 2 * r1] +movum4, [r0 + r3] + +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +pmaddwd m4, m4 +paddd m1, m2 +paddd m3, m4 +paddd m1, m3 +paddd m0, m1 + +lea r0, [r0 + 4 * r1] + +movum1, [r0] +movum2, [r0 + r1] +movum3, [r0 + 2 * r1] +movum4, [r0 + r3] + +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +pmaddwd m4, m4 +paddd m1, m2 +paddd m3, m4 +paddd m1, m3 +paddd m0, m1 +%endmacro + +;- +; int pixel_ssd_s( int16_t *ref, intptr_t i_stride ) +;- +%if HIGH_BIT_DEPTH==0 +INIT_ZMM avx512 +cglobal pixel_ssd_s_32, 2,4,5 +add r1, r1 +lea r3, [r1 * 3] +pxorm0, m0 + +PROCESS_SSD_S_32x8_AVX512 +lea r0, [r0 + 4 * r1] +PROCESS_SSD_S_32x8_AVX512 +lea r0, [r0 + 4 * r1] +PROCESS_SSD_S_32x8_AVX512 +lea r0, [r0 + 4 * r1] +PROCESS_SSD_S_32x8_AVX512 + +; calculate sum 
and return +HADDD m0, m1 +movdeax, xm0 +RET +%endif +;- +; ssd_s avx512 code end +;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 044 of 307] x86: AVX512 blockcopy_ss_32x32, blockcopy_pp_32xN, blockcopy_ps_32x32 and
# HG changeset patch # User Jayashri Murugan # Date 1500964406 -19800 # Tue Jul 25 12:03:26 2017 +0530 # Node ID 4978d583e2e82aec1f09d94ecdf52191eac7ceb5 # Parent 2ad06d32a8465ce20e673c819b917a7524ecf8e9 x86: AVX512 blockcopy_ss_32x32, blockcopy_pp_32xN, blockcopy_ps_32x32 and blockcopy_sp_32x32 for HIGH_BIT_DEPTH HIGH_BIT_DEPTH: Primitive | AVX2 performance | AVX512 performance --- copy_ss[32x32] |1.83x |3.91x copy_pp[32x64] |1.98x |3.22x copy_pp[32x32] |1.91x |4.11x copy_pp[32x24] |2.06x |4.20x copy_pp[32x16] |1.80x |4.60x copy_ps[32x32] |1.90x |4.60x copy_sp[32x32] |2.02x |4.78x diff -r 2ad06d32a846 -r 4978d583e2e8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 25 12:02:13 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 12:03:26 2017 +0530 @@ -2205,6 +2205,31 @@ p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx512); p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_64x64_avx512); p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_64x64_avx512); + +// 32 X N +p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); +p.pu[LUMA_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx512); +p.pu[LUMA_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512); +p.pu[LUMA_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx512); +p.pu[LUMA_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512); +p.pu[LUMA_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x16_avx512); 
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_32x64_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512); +p.cu[BLOCK_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_32x64_avx512); +p.cu[BLOCK_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 040 of 307] x86: AVX512 sub_ps_64x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1500887152 -19800 # Mon Jul 24 14:35:52 2017 +0530 # Node ID 9a4caf163d0fbdbc51c9f681ed898a39a5602bcf # Parent be860e68659a37dae543956a65a4eb167f8b5504 x86: AVX512 sub_ps_64x64 for high bit depth AVX2 performance : 21.24x AVX512 performance : 36.95x diff -r be860e68659a -r 9a4caf163d0f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 21 12:48:22 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jul 24 14:35:52 2017 +0530 @@ -2191,6 +2191,7 @@ if (cpuMask & X265_CPU_AVX512) { p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); +p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512); } } #else // if HIGH_BIT_DEPTH diff -r be860e68659a -r 9a4caf163d0f source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Fri Jul 21 12:48:22 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Mon Jul 24 14:35:52 2017 +0530 @@ -6023,9 +6023,6 @@ RET %endif -;- -; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); -;- %macro PROCESS_SUB_PS_64x8_AVX512 0 pmovzxbwm0, [r2] pmovzxbwm1, [r2 + 32] @@ -6112,7 +6109,129 @@ movu[r0 + 2 * r1 + 64], m5 %endmacro -%if HIGH_BIT_DEPTH==0 +%macro PROCESS_SUB_PS_64x8_HBD_AVX512 0 +movum0, [r2] +movum1, [r2 + 64] +movum4, [r3] +movum5, [r3 + 64] +psubw m0, m4 +psubw m1, m5 +movum2, [r2 + r4] +movum3, [r2 + r4 + 64] +movum6, [r3 + r5] +movum7, [r3 + r5 + 64] +psubw m2, m6 +psubw m3, m7 + +movu[r0], m0 +movu[r0 + 64],m1 +movu[r0 + r1],m2 +movu[r0 + r1 + 64], m3 + +movum0, [r2 + r4 * 2] +movum1, [r2 + r4 * 2 + 64] +movum4, [r3 + r5 * 2] +movum5, [r3 + r5 * 2 + 64] +psubw m0, m4 +psubw m1, m5 +movum2, [r2 + r7] +movum3, [r2 + r7 + 64] +movum6, [r3 + r8] +movum7, [r3 + r8 + 64] +psubw m2, m6 +psubw m3, m7 + +movu[r0 + r1 * 2],m0 +movu[r0 + r1 * 2 + 64], m1 +movu[r0 + r6],m2 +movu[r0 + r6 + 64], m3 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 
* 4] + +movum0, [r2] +movum1, [r2 + 64] +movum4, [r3] +movum5, [r3 + 64] +psubw m0, m4 +psubw m1, m5 +movum2, [r2 + r4] +movum3, [r2 + r4 + 64] +movum6, [r3 + r5] +movum7, [r3 + r5 + 64] +psubw m2, m6 +psubw m3, m7 + +movu[r0], m0 +movu[r0 + 64],m1 +movu[r0 + r1],m2 +movu[r0 + r1 + 64], m3 + +movum0, [r2 + r4 * 2] +movum1, [r2 + r4 * 2 + 64] +movum4, [r3 + r5 * 2] +movum5, [r3 + r5 * 2 + 64] +psubw m0, m4 +psubw m1, m5 +movum2, [r2 + r7] +movum3, [r2 + r7 + 64] +movum6, [r3 + r8] +movum7, [r3 + r8 + 64] +psubw m2, m6 +psubw m3, m7 + +movu[r0 + r1 * 2],m0 +movu[r0 + r1 * 2 + 64], m1 +movu[r0 + r6],m2 +movu[r0 + r6 + 64], m3 +%endmacro +;- +; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1); +;- +%if HIGH_BIT_DEPTH +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_sub_ps_64x64, 6, 9, 8 +add r1d,r1d +add r4d,r4d +add r5d,r5d +lea r6, [r1 * 3] +lea r7, [r4 * 3] +lea r8, [r5 * 3] + +PROCESS_SUB_PS_64x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_PS_64x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] +PROCESS_SUB_PS_64x8_HBD_AVX512 +lea r0, [r0 + r1 * 4] +lea r2,
[x265] [PATCH 045 of 307] x86:AVX-512 blockcopy_pp_64xN
# HG changeset patch # User Kalyan Goswami # Date 1500979633 -19800 # Tue Jul 25 16:17:13 2017 +0530 # Node ID 723c72ffe3eacba3db73eb46332f7cf5c97efa8a # Parent 4978d583e2e82aec1f09d94ecdf52191eac7ceb5 x86:AVX-512 blockcopy_pp_64xN Size| AVX2 performance | AVX512 performance 64x64 | 1.54x | 3.22x 64x48 | 1.74x | 3.29x 64x32 | 1.65x | 3.96x 64x16 | 1.69x | 3.79x diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 25 12:03:26 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 16:17:13 2017 +0530 @@ -3848,6 +3848,11 @@ p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512); p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512); +p.pu[LUMA_64x64].copy_pp = PFX(blockcopy_pp_64x64_avx512); +p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512); +p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512); +p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512); + p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512); p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Jul 25 12:03:26 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Jul 25 16:17:13 2017 +0530 @@ -1103,6 +1103,47 @@ BLOCKCOPY_PP_W64_H4_avx 64, 48 BLOCKCOPY_PP_W64_H4_avx 64, 64 +;-- +; Macro to calculate blockcopy_pp_64x4_avx512 +;-- +%macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0 +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r4] + +movu[r0] , m0 +movu[r0 + r1] , m1 +movu[r0 + 2 * r1] , m2 +movu[r0 + r5] , m3 +%endmacro + +;-- +; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) +;-- +%macro BLOCKCOPY_PP_W64_H4_avx512 1 +INIT_ZMM avx512 +cglobal blockcopy_pp_64x%1, 4, 4, 6 +lear4, [3 * r3] +lear5, [3 * r1] + +%rep %1/4 - 1 
+PROCESS_BLOCKCOPY_PP_64X4_avx512 +lea r2, [r2 + 4 * r3] +lea r0, [r0 + 4 * r1] +%endrep + +PROCESS_BLOCKCOPY_PP_64X4_avx512 +RET +%endmacro + +BLOCKCOPY_PP_W64_H4_avx512 16 +BLOCKCOPY_PP_W64_H4_avx512 32 +BLOCKCOPY_PP_W64_H4_avx512 48 +BLOCKCOPY_PP_W64_H4_avx512 64 + + + ;- ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ;- diff -r 4978d583e2e8 -r 723c72ffe3ea source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.hTue Jul 25 12:03:26 2017 +0530 +++ b/source/common/x86/blockcopy8.hTue Jul 25 16:17:13 2017 +0530 @@ -54,6 +54,7 @@ FUNCDEF_CHROMA_PU(void, blockcopy_pp, sse2, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); +FUNCDEF_CHROMA_PU(void, blockcopy_pp, avx512, pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_sp, sse2, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); FUNCDEF_PU(void, blockcopy_sp, sse4, pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 043 of 307] x86: AVX512 blockcopy_ss_64x64, blockcopy_pp_64xN, blockcopy_ps_64x64 and
# HG changeset patch # User Jayashri Murugan # Date 1500964333 -19800 # Tue Jul 25 12:02:13 2017 +0530 # Node ID 2ad06d32a8465ce20e673c819b917a7524ecf8e9 # Parent 6b3b8ef0f37e0f7860f4f43c99e581674b19f9e3 x86: AVX512 blockcopy_ss_64x64, blockcopy_pp_64xN, blockcopy_ps_64x64 and blockcopy_sp_64x64 for HIGH_BIT_DEPTH HIGH_BIT_DEPTH: Primitive | AVX2 performance | AVX512 performance --- copy_ss[64x64] |1.38x |2.85x copy_pp[64x64] |1.91x |3.03x copy_pp[64x48] |1.90x |3.21x copy_pp[64x32] |1.99x |3.26x copy_pp[64x16] |2.01x |3.56x copy_ps[64x64] |1.78x |3.46x copy_sp[64x64] |1.80x |3.25x diff -r 6b3b8ef0f37e -r 2ad06d32a846 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 21 14:55:49 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 12:02:13 2017 +0530 @@ -2191,10 +2191,20 @@ if (cpuMask & X265_CPU_AVX512) { p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); +p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512); p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512); + +// 64 X N +p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); +p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512); +p.pu[LUMA_64x48].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x48_avx512); +p.pu[LUMA_64x32].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x32_avx512); +p.pu[LUMA_64x16].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x16_avx512); +p.cu[BLOCK_64x64].copy_ps = (copy_ps_t)PFX(blockcopy_ss_64x64_avx512); +p.cu[BLOCK_64x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_64x64_avx512); } } #else // if HIGH_BIT_DEPTH @@ -3727,7 +3737,6 @@ p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2); p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2); -p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); } 
if (cpuMask & X265_CPU_AVX512) { ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 046 of 307] x86:AVX-512 blockfill_s_32x32
# HG changeset patch # User Kalyan Goswami # Date 1500980022 -19800 # Tue Jul 25 16:23:42 2017 +0530 # Node ID 9e1401dcdfc3c9fb633d81b7b39321ac5969a245 # Parent 723c72ffe3eacba3db73eb46332f7cf5c97efa8a x86:AVX-512 blockfill_s_32x32 Size| AVX2 performance | AVX512 performance 32x32 | 4.58x | 9.73x diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 25 16:17:13 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 16:23:42 2017 +0530 @@ -3866,6 +3866,8 @@ p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); +p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512); + p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512); p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2); diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Jul 25 16:17:13 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Jul 25 16:23:42 2017 +0530 @@ -2484,6 +2484,25 @@ movu [r0 + r3 + 32], m0 RET +; +; void blockfill_s_32x32(int16_t* dst, intptr_t dstride, int16_t val) +; +INIT_ZMM avx512 +cglobal blockfill_s_32x32, 3, 4, 1 +add r1, r1 +lea r3, [3 * r1] +movd xm0, r2d +vpbroadcastw m0, xm0 + +%rep 8 +movu [r0], m0 +movu [r0 + r1], m0 +movu [r0 + 2 * r1], m0 +movu [r0 + r3], m0 +lear0, [r0 + 4 * r1] +%endrep +RET + ;- ; void blockcopy_ps_2x4(int16_t* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride); ;- diff -r 723c72ffe3ea -r 9e1401dcdfc3 source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.hTue Jul 25 16:17:13 2017 +0530 +++ b/source/common/x86/blockcopy8.hTue Jul 25 16:23:42 2017 +0530 @@ -47,6 +47,7 @@ FUNCDEF_TU(void, blockfill_s, sse2, int16_t* dst, intptr_t dstride, int16_t val); FUNCDEF_TU(void, blockfill_s, avx2, int16_t* dst, intptr_t dstride, int16_t val); 
+FUNCDEF_TU(void, blockfill_s, avx512, int16_t* dst, intptr_t dstride, int16_t val); FUNCDEF_CHROMA_PU(void, blockcopy_ss, sse2, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); FUNCDEF_CHROMA_PU(void, blockcopy_ss, avx, int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 057 of 307] [x265-avx512]x86: AVX512 pixel_sad_64xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501653512 -19800 # Wed Aug 02 11:28:32 2017 +0530 # Node ID b355ac2912dd111b96dbb5893b34405863e7382f # Parent 784aff4e987c17e2ece9bd3484b256f97f3640f5 [x265-avx512]x86: AVX512 pixel_sad_64xN for high bit depth Size| AVX2 performance | AVX512 performance 64x16 | 27.47x | 43.37x 64x32 | 28.41x | 46.45x 64x48 | 26.51x | 48.47x 64x64 | 28.74x | 48.76x diff -r 784aff4e987c -r b355ac2912dd source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 02 11:28:32 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 02 11:28:32 2017 +0530 @@ -2261,6 +2261,10 @@ p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); +p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512); +p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512); +p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); +p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512); p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512); diff -r 784aff4e987c -r b355ac2912dd source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Wed Aug 02 11:28:32 2017 +0530 +++ b/source/common/x86/sad16-a.asm Wed Aug 02 11:28:32 2017 +0530 @@ -1234,6 +1234,86 @@ paddd m0, m1 %endmacro + +%macro PROCESS_SAD_64x8_AVX512 0 +movum1, [r2] +movum2, [r2 + mmsize] +movum3, [r2 + r3] +movum4, [r2 + r3 + mmsize] +psubw m1, [r0] +psubw m2, [r0 + mmsize] +psubw m3, [r0 + r1] +psubw m4, [r0 + r1 + mmsize] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m5, m1, m3 + +movum1, [r2 + 2 * r3] +movum2, [r2 + 2 * r3 + mmsize] +movum3, [r2 + r5] +movum4, [r2 + r5 + mmsize] +psubw m1, [r0 + 2 * r1] +psubw m2, [r0 + 2 * r1 + mmsize] +psubw m3, [r0 + r4] +psubw m4, [r0 + r4 + mmsize] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m1, m3 
+ +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +pmaddwd m5, m6 +paddd m0, m5 +pmaddwd m1, m6 +paddd m0, m1 + +movum1, [r2] +movum2, [r2 + mmsize] +movum3, [r2 + r3] +movum4, [r2 + r3 + mmsize] +psubw m1, [r0] +psubw m2, [r0 + mmsize] +psubw m3, [r0 + r1] +psubw m4, [r0 + r1 + mmsize] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m5, m1, m3 + +movum1, [r2 + 2 * r3] +movum2, [r2 + 2 * r3 + mmsize] +movum3, [r2 + r5] +movum4, [r2 + r5 + mmsize] +psubw m1, [r0 + 2 * r1] +psubw m2, [r0 + 2 * r1 + mmsize] +psubw m3, [r0 + r4] +psubw m4, [r0 + r4 + mmsize] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m1, m3 + +pmaddwd m5, m6 +paddd m0, m5 +pmaddwd m1, m6 +paddd m0, m1 +%endmacro + %macro PROCESS_SAD_32x8_AVX512 0 movum1, [r2] movum2, [r2 + r3] @@ -1572,6 +1652,116 @@ PROCESS_SAD_AVX512_END RET +;- +; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_sad_64x16, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_64x32, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_64x48, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r
[x265] [PATCH 055 of 307] [x265-avx512]x86: AVX512 pixel_sad_32xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501239623 -19800 # Fri Jul 28 16:30:23 2017 +0530 # Node ID 215976d65b80985998b2597b8ba4c697f1465a1d # Parent e65ac86010af8f7ab1e5b43591330eeb6c818473 [x265-avx512]x86: AVX512 pixel_sad_32xN for high bit depth Size| AVX2 performance | AVX512 performance 32x8| 27.57x | 35.17x 32x16 | 27.96x | 40.74x 32x24 | 31.21x | 45.19x 32x32 | 32.12x | 47.23x 32x64 | 28.79x | 53.35x diff -r e65ac86010af -r 215976d65b80 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 28 16:30:23 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 28 16:30:23 2017 +0530 @@ -2262,6 +2262,12 @@ p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); +p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512); +p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512); +p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); +p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); +p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r e65ac86010af -r 215976d65b80 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Fri Jul 28 16:30:23 2017 +0530 +++ b/source/common/x86/sad16-a.asm Fri Jul 28 16:30:23 2017 +0530 @@ -1208,6 +1208,179 @@ movd eax, xm0 %endmacro +%macro PROCESS_SAD_32x8_AVX512 0 +movum1, [r2] +movum2, [r2 + r3] +movum3, [r2 + 2 * r3] +movum4, [r2 + r5] +psubw m1, [r0] +psubw m2, [r0 + r1] +psubw m3, [r0 + 2 * r1] +psubw m4, [r0 + r4] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m5, m1, m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movum1, [r2] +movum2, [r2 + r3] +movum3, [r2 + 2 * r3] +movum4, [r2 + r5] +psubw m1, [r0] +psubw m2, [r0 + r1] +psubw m3, [r0 + 2 * r1] +psubw m4, [r0 + r4] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m1, m3 + +pmaddwd m5, m6 +paddd m0, m5 +pmaddwd m1, m6 +paddd m0, m1 
+%endmacro + +%macro PROCESS_SAD_AVX512_END 0 +vextracti32x8 ym1, m0, 1 +paddd ym0, ym1 +vextracti64x2 xm1, m0, 1 +paddd xm0, xm1 +pshufd xm1, xm0, 1110b +paddd xm0, xm1 +pshufd xm1, xm0, 0001b +paddd xm0, xm1 +movd eax, xm0 +%endmacro + + + +;- +; int pixel_sad_32x%1( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_sad_32x8, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +PROCESS_SAD_AVX512_END +RET + + +INIT_ZMM avx512 +cglobal pixel_sad_32x16, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_32x24, 4,6,7 + pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_32x32, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_32x64, 4,6,7 + pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x
[x265] [PATCH 053 of 307] x86: AVX512 pixel_add_ps_32xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501571354 -19800 # Tue Aug 01 12:39:14 2017 +0530 # Node ID f8687bef93f25b343606e42f4fd252d5f0897d1a # Parent 05972a61eb1aeac474ecc0d0150671e879177112 x86: AVX512 pixel_add_ps_32xN for high bit depth AVX2 performance : 12.77x AVX512 performance : 21.54x This patch also cleanup low bit depth code diff -r 05972a61eb1a -r f8687bef93f2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 10:56:55 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 12:39:14 2017 +0530 @@ -2198,6 +2198,9 @@ p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512); p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512); +p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512); // 64 X N p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); @@ -3893,8 +3896,8 @@ p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512); p.cu[BLOCK_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); -p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx2); -p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx2); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps = PFX(pixel_add_ps_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps = PFX(pixel_add_ps_32x64_avx512); p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_avx512); p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); diff -r 05972a61eb1a -r f8687bef93f2 source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Tue Aug 01 10:56:55 2017 +0530 +++ b/source/common/x86/pixeladd8.asm Tue Aug 01 12:39:14 2017 +0530 @@ -769,132 +769,6 @@ PIXEL_ADD_PS_W32_H4_avx2 64 ;- -; void pixel_add_ps_32x32(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t 
srcStride0, intptr_t srcStride1) -;- -%macro PROCESS_ADD_PS_32x8_AVX512 0 -pmovzxbwm0, [r2]; row 0 of src0 -movum1, [r3]; row 0 of src1 -pmovzxbwm2, [r2 + r4] ; row 1 of src0 -movum3, [r3 + r5] ; row 1 of src1 -pmovzxbwm4, [r2 + r4 * 2] ; row 2 of src0 -movum5, [r3 + r5 * 2] ; row 2 of src1 -pmovzxbwm6, [r2 + r7] ; row 3 of src0 -movum7, [r3 + r8] ; row 3 of src1 - -paddw m0, m1 -paddw m2, m3 -paddw m4, m5 -paddw m6, m7 -packuswbm0, m2 -packuswbm4, m6 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m4, m4, 11011000b -movu[r0],ym0; row 0 of dst -movu[r0 + r1 * 2], ym4; row 2 of dst -vshufi64x2 m0, m0, 01001110b -vshufi64x2 m4, m4, 01001110b -movu[r0 + r1], ym0; row 1 of dst -movu[r0 + r9], ym4; row 3 of dst - -lea r2, [r2 + r4 * 4] -lea r3, [r3 + r5 * 4] -lea r0, [r0 + r1 * 4] - -pmovzxbwm0, [r2]; row 4 of src0 -movum1, [r3]; row 4 of src1 -pmovzxbwm2, [r2 + r4] ; row 5 of src0 -movum3, [r3 + r5] ; row 5 of src1 -pmovzxbwm4, [r2 + r4 * 2] ; row 6 of src0 -movum5, [r3 + r5 * 2] ; row 6 of src1 -pmovzxbwm6, [r2 + r7] ; row 7 of src0 -movum7, [r3 + r8] ; row 7 of src1 - -paddw m0, m1 -paddw m2, m3 -paddw m4, m5 -paddw m6, m7 -packuswbm0, m2 -packuswbm4, m6 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m4, m4, 11011000b -movu[r0],ym0; row 4 of dst -movu[r0 + r1 * 2], ym4; row 6 of dst -vshufi64x2 m0, m0, 01001110b -vshufi64x2 m4, m4, 01001110b -movu[r0 + r1], ym0; row 5 of dst -movu[r0 + r9], ym4; row 7 of dst -%endmacro - -
[x265] [PATCH 047 of 307] x86: AVX512 convert_p2s_64xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1500987523 -19800 # Tue Jul 25 18:28:43 2017 +0530 # Node ID d05b920865e7c9e8cc9441e77df888b48acb50d1 # Parent 9e1401dcdfc3c9fb633d81b7b39321ac5969a245 x86: AVX512 convert_p2s_64xN for high bit depth Size | AVX2 performance | AVX512 performance -- 64x16 | 10.53x| 18.40x 64x32 | 11.10x| 19.51x 64x48 | 11.14x| 19.07x 64x64 | 11.26x| 20.25x diff -r 9e1401dcdfc3 -r d05b920865e7 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 25 16:23:42 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 18:28:43 2017 +0530 @@ -2230,6 +2230,10 @@ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x32_avx512); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = (copy_sp_t)PFX(blockcopy_ss_32x64_avx512); +p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx512); +p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512); +p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512); +p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512); } } #else // if HIGH_BIT_DEPTH diff -r 9e1401dcdfc3 -r d05b920865e7 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Jul 25 16:23:42 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Jul 25 18:28:43 2017 +0530 @@ -301,6 +301,183 @@ FILTER_VER_LUMA_sse2 ps, 64, 16 FILTER_VER_LUMA_sse2 ps, 16, 64 +;- +;p2s avx512 code start +;- +%macro P2S_64x8_AVX512 0 +movu m0, [r0] +movu m1, [r0 + r1] +movu m2, [r0 + r1 * 2] +movu m3, [r0 + r5] +psllw m0, (14 - BIT_DEPTH) +psllw m1, (14 - BIT_DEPTH) +psllw m2, (14 - BIT_DEPTH) +psllw m3, (14 - BIT_DEPTH) +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu [r2], m0 +movu [r2 + r3], m1 +movu [r2 + r3 * 2], m2 +movu [r2 + r4], m3 + +movu m0, [r0 + mmsize] +movu m1, [r0 + r1 + mmsize] +movu m2, [r0 + r1 * 2 + mmsize] +movu m3, [r0 + r5 + mmsize] +psllw m0, (14 - BIT_DEPTH) +psllw m1, (14 - BIT_DEPTH) 
+psllw m2, (14 - BIT_DEPTH) +psllw m3, (14 - BIT_DEPTH) +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu [r2 + mmsize], m0 +movu [r2 + r3 + mmsize], m1 +movu [r2 + r3 * 2 + mmsize], m2 +movu [r2 + r4 + mmsize], m3 + +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] + +movu m0, [r0] +movu m1, [r0 + r1] +movu m2, [r0 + r1 * 2] +movu m3, [r0 + r5] +psllw m0, (14 - BIT_DEPTH) +psllw m1, (14 - BIT_DEPTH) +psllw m2, (14 - BIT_DEPTH) +psllw m3, (14 - BIT_DEPTH) +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu [r2], m0 +movu [r2 + r3], m1 +movu [r2 + r3 * 2], m2 +movu [r2 + r4], m3 + +movu m0, [r0 + mmsize] +movu m1, [r0 + r1 + mmsize] +movu m2, [r0 + r1 * 2 + mmsize] +movu m3, [r0 + r5 + mmsize] +psllw m0, (14 - BIT_DEPTH) +psllw m1, (14 - BIT_DEPTH) +psllw m2, (14 - BIT_DEPTH) +psllw m3, (14 - BIT_DEPTH) +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu [r2 + mmsize], m0 +movu [r2 + r3 + mmsize], m1 +movu [r2 + r3 * 2 + mmsize], m2 +movu [r2 + r4 + mmsize], m3 +%endmacro + +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) +;- +INIT_ZMM avx512 +cglobal filterPixelToShort_64x16, 4, 6, 5 +addr1d, r1d +addr3d, r3d +lear4, [r3 * 3] +lear5, [r1 * 3] + +; load constant +vbroadcasti32x8m4, [pw_2000] +P2S_64x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_64x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_64x32, 4, 6, 5 +addr1d, r1d +addr3d, r3d +lear4, [r3 * 3] +lear5, [r1 * 3] + +; load constant +vbroadcasti32x8m4, [pw_2000] +P2S_64x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_64x8_AVX512 +lear0, [r0 + r1 * 4] +lear2,
[x265] [PATCH 052 of 307] x86: AVX512 pixel_add_ps_64x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501565215 -19800 # Tue Aug 01 10:56:55 2017 +0530 # Node ID 05972a61eb1aeac474ecc0d0150671e879177112 # Parent 984cad60283b474ed756238cf904b08df290e103 x86: AVX512 pixel_add_ps_64x64 for high bit depth AVX2 performance: 14.14x AVX512 performance: 20.40x diff -r 984cad60283b -r 05972a61eb1a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 25 16:37:38 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 10:56:55 2017 +0530 @@ -2197,6 +2197,8 @@ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512); +p.cu[BLOCK_64x64].add_ps = PFX(pixel_add_ps_64x64_avx512); + // 64 X N p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); p.pu[LUMA_64x64].copy_pp = (copy_pp_t)PFX(blockcopy_ss_64x64_avx512); diff -r 984cad60283b -r 05972a61eb1a source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Tue Jul 25 16:37:38 2017 +0530 +++ b/source/common/x86/pixeladd8.asm Tue Aug 01 10:56:55 2017 +0530 @@ -1272,7 +1272,7 @@ %endif ;- -; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +; pixel_add_ps_64x64 avx512 code start ;- %macro PROCESS_ADD_PS_64x8_AVX512 0 pmovzxbwm0, [r2] @@ -1376,8 +1376,148 @@ movu[r0 + r1], m4 %endmacro +%macro PROCESS_ADD_PS_64x8_HBD_AVX512 0 +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r3] +movum3, [r3 + mmsize] +paddw m0, m2 +paddw m1, m3 + +CLIPW2 m0, m1, m4, m5 +movu[r0],m0 +movu[r0 + mmsize], m1 + +movum0, [r2 + r4] +movum1, [r2 + r4 + mmsize] +movum2, [r3 + r5] +movum3, [r3 + r5 + mmsize] +paddw m0, m2 +paddw m1, m3 + +CLIPW2 m0, m1, m4, m5 +movu[r0 + r1], m0 +movu[r0 + r1 + mmsize], m1 + +movum0, [r2 + r4 * 2] +movum1, [r2 + r4 * 2 + mmsize] +movum2, [r3 + r5 * 2] +movum3, [r3 + r5 * 2 + mmsize] +paddw m0, m2 +paddw m1, m3 + +CLIPW2 m0, m1, 
m4, m5 +movu[r0 + r1 * 2], m0 +movu[r0 + r1 * 2 + mmsize], m1 + +movum0, [r2 + r6] +movum1, [r2 + r6 + mmsize] +movum2, [r3 + r7] +movum3, [r3 + r7 + mmsize] +paddw m0, m2 +paddw m1, m3 + +CLIPW2 m0, m1, m4, m5 +movu[r0 + r8], m0 +movu[r0 + r8 + mmsize], m1 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r5 * 4] + +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r3] +movum3, [r3 + mmsize] +paddw m0, m2 +paddw m1, m3 + +CLIPW2 m0, m1, m4, m5 +movu[r0],m0 +movu[r0 + mmsize], m1 + +movum0, [r2 + r4] +movum1, [r2 + r4 + mmsize] +movum2, [r3 + r5] +movum3, [r3 + r5 + mmsize] +paddw m0, m2 +paddw m1, m3 + +CLIPW2 m0, m1, m4, m5 +movu[r0 + r1], m0 +movu[r0 + r1 + mmsize], m1 + +movum0, [r2 + r4 * 2] +movum1, [r2 + r4 * 2 + mmsize] +movum2, [r3 + r5 * 2] +movum3, [r3 + r5 * 2 + mmsize] +paddw m0, m2 +paddw m1, m3 + +CLIPW2 m0, m1, m4, m5 +movu[r0 + r1 * 2], m0 +movu[r0 + r1 * 2 + mmsize], m1 + +movum0, [r2 + r6] +movum1, [r2 + r6 + mmsize] +movum2, [r3 + r7] +movum3, [r3 + r7 + mmsize] +paddw m0, m2 +paddw m1, m3 + +CLIPW2 m0, m1, m4, m5 +movu[r0 + r8], m0 +movu[r0 + r8 + mmsize], m1 +%endmacro +;- +; void pixel_add_ps_64x64(pixel *dest, intptr_t destride, pixel *src0, int16_t *scr1, intptr_t srcStride0, intptr_t srcStride1) +;- +%if HIGH_BIT_DEPTH %if ARCH_X86_64 -%if HIGH_BIT_DEPTH==0 +INIT_ZMM avx512 +cglobal pixel_add_ps_64x64, 6, 9, 6 +vbroadcasti32x8 m5, [pw_pixel_max] +pxor m4, m4 +add r4d, r4d +add r5d, r5d +add r1d, r1d +lea r6, [r4 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] + +PROCESS_ADD_PS_64x8_HBD_AVX
[x265] [PATCH 056 of 307] [x265-avx512]x86: AVX512 pixel_sad_64xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501653512 -19800 # Wed Aug 02 11:28:32 2017 +0530 # Node ID 784aff4e987c17e2ece9bd3484b256f97f3640f5 # Parent 215976d65b80985998b2597b8ba4c697f1465a1d [x265-avx512]x86: AVX512 pixel_sad_64xN for high bit depth Size| AVX2 performance | AVX512 performance 64x16 | 27.47x | 43.37x 64x32 | 28.41x | 46.45x 64x48 | 26.51x | 48.47x 64x64 | 28.74x | 48.76x diff -r 215976d65b80 -r 784aff4e987c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 28 16:30:23 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 02 11:28:32 2017 +0530 @@ -2267,6 +2267,10 @@ p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); +p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512); +p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512); +p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); +p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); } } diff -r 215976d65b80 -r 784aff4e987c source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Fri Jul 28 16:30:23 2017 +0530 +++ b/source/common/x86/sad16-a.asm Wed Aug 02 11:28:32 2017 +0530 @@ -1154,6 +1154,86 @@ INIT_XMM sse2 SAD_12 12, 16 + +%macro PROCESS_SAD_64x8_AVX512 0 +movum1, [r2] +movum2, [r2 + mmsize] +movum3, [r2 + r3] +movum4, [r2 + r3 + mmsize] +psubw m1, [r0] +psubw m2, [r0 + mmsize] +psubw m3, [r0 + r1] +psubw m4, [r0 + r1 + mmsize] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m5, m1, m3 + +movum1, [r2 + 2 * r3] +movum2, [r2 + 2 * r3 + mmsize] +movum3, [r2 + r5] +movum4, [r2 + r5 + mmsize] +psubw m1, [r0 + 2 * r1] +psubw m2, [r0 + 2 * r1 + mmsize] +psubw m3, [r0 + r4] +psubw m4, [r0 + r4 + mmsize] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m1, m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +pmaddwd m5, m6 +paddd m0, m5 +pmaddwd m1, 
m6 +paddd m0, m1 + +movum1, [r2] +movum2, [r2 + mmsize] +movum3, [r2 + r3] +movum4, [r2 + r3 + mmsize] +psubw m1, [r0] +psubw m2, [r0 + mmsize] +psubw m3, [r0 + r1] +psubw m4, [r0 + r1 + mmsize] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m5, m1, m3 + +movum1, [r2 + 2 * r3] +movum2, [r2 + 2 * r3 + mmsize] +movum3, [r2 + r5] +movum4, [r2 + r5 + mmsize] +psubw m1, [r0 + 2 * r1] +psubw m2, [r0 + 2 * r1 + mmsize] +psubw m3, [r0 + r4] +psubw m4, [r0 + r4 + mmsize] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m1, m3 + +pmaddwd m5, m6 +paddd m0, m5 +pmaddwd m1, m6 +paddd m0, m1 +%endmacro + %macro PROCESS_SAD_32x8_AVX512 0 movum1, [r2] movum2, [r2 + r3] @@ -1263,6 +1343,116 @@ %endmacro +;- +; int pixel_sad_64x%1( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_sad_64x16, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_64x32, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_64x48, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_64x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_64x
[x265] [PATCH 049 of 307] x86: AVX512 convert_p2s_48x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501043664 -19800 # Wed Jul 26 10:04:24 2017 +0530 # Node ID a75dd880817adddafac5e1105e512ea79c7a089b # Parent b4c2149e9bb1119857363094492b50e85593fb74 x86: AVX512 convert_p2s_48x64 for high bit depth AVX2 performance : 9.77x AVX512 performance : 14.64x diff -r b4c2149e9bb1 -r a75dd880817a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 25 18:50:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 26 10:04:24 2017 +0530 @@ -2239,6 +2239,7 @@ p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512); p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512); p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512); +p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); diff -r b4c2149e9bb1 -r a75dd880817a source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Jul 25 18:50:51 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Wed Jul 26 10:04:24 2017 +0530 @@ -416,6 +416,79 @@ movu [r2 + r4], m3 %endmacro +%macro P2S_48x8_AVX512 0 +movu m0, [r0] +movu m1, [r0 + r1] +movu m2, [r0 + r1 * 2] +movu m3, [r0 + r5] +psllw m0, (14 - BIT_DEPTH) +psllw m1, (14 - BIT_DEPTH) +psllw m2, (14 - BIT_DEPTH) +psllw m3, (14 - BIT_DEPTH) +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu [r2], m0 +movu [r2 + r3], m1 +movu [r2 + r3 * 2], m2 +movu [r2 + r4], m3 + +movu ym0, [r0 + mmsize] +movu ym1, [r0 + r1 + mmsize] +movu ym2, [r0 + r1 * 2 + mmsize] +movu ym3, [r0 + r5 + mmsize] +psllw ym0, (14 - BIT_DEPTH) +psllw ym1, (14 - BIT_DEPTH) +psllw ym2, (14 - BIT_DEPTH) +psllw ym3, (14 - BIT_DEPTH) +psubw ym0, ym4 +psubw ym1, ym4 +psubw ym2, ym4 +psubw 
ym3, ym4 +movu [r2 + mmsize], ym0 +movu [r2 + r3 + mmsize], ym1 +movu [r2 + r3 * 2 + mmsize], ym2 +movu [r2 + r4 + mmsize], ym3 + +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] + +movu m0, [r0] +movu m1, [r0 + r1] +movu m2, [r0 + r1 * 2] +movu m3, [r0 + r5] +psllw m0, (14 - BIT_DEPTH) +psllw m1, (14 - BIT_DEPTH) +psllw m2, (14 - BIT_DEPTH) +psllw m3, (14 - BIT_DEPTH) +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu [r2], m0 +movu [r2 + r3], m1 +movu [r2 + r3 * 2], m2 +movu [r2 + r4], m3 + +movu ym0, [r0 + mmsize] +movu ym1, [r0 + r1 + mmsize] +movu ym2, [r0 + r1 * 2 + mmsize] +movu ym3, [r0 + r5 + mmsize] +psllw ym0, (14 - BIT_DEPTH) +psllw ym1, (14 - BIT_DEPTH) +psllw ym2, (14 - BIT_DEPTH) +psllw ym3, (14 - BIT_DEPTH) +psubw ym0, ym4 +psubw ym1, ym4 +psubw ym2, ym4 +psubw ym3, ym4 +movu [r2 + mmsize], ym0 +movu [r2 + r3 + mmsize], ym1 +movu [r2 + r3 * 2 + mmsize], ym2 +movu [r2 + r4 + mmsize], ym3 +%endmacro + ;- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) ;- @@ -640,6 +713,39 @@ lear2, [r2 + r3 * 4] P2S_32x8_AVX512 RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_48x64, 4, 6, 5 +addr1d, r1d +addr3d, r3d +lear4, [r3 * 3] +lear5, [r1 * 3] + +; load constant +vbroadcasti32x8m4, [pw_2000] +P2S_48x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_48x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_48x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_48x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_48x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_48x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_48x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_48x8_AVX512 +RET ;--
[x265] [PATCH 050 of 307] x86: AVX512 ssd_s_16
# HG changeset patch # User Vignesh Vijayakumar # Date 1500967696 -19800 # Tue Jul 25 12:58:16 2017 +0530 # Node ID 09159f73f47b7eda15c8d0294774fe6eafdadea7 # Parent a75dd880817adddafac5e1105e512ea79c7a089b x86: AVX512 ssd_s_16 This patch also reworks ssd_s_32 to support high bit depth ssd_s_16 AVX2 performance : 14.11x AVX512 performance : 16.14x ssd_s_32 for high bit depth AVX2 performance : 14.78x AVX512 performance : 20.54x diff -r a75dd880817a -r 09159f73f47b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jul 26 10:04:24 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 12:58:16 2017 +0530 @@ -2249,6 +2249,8 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); + } } #else // if HIGH_BIT_DEPTH @@ -3919,6 +3921,7 @@ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); +p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512); p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); diff -r a75dd880817a -r 09159f73f47b source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Wed Jul 26 10:04:24 2017 +0530 +++ b/source/common/x86/ssd-a.asm Tue Jul 25 12:58:16 2017 +0530 @@ -3425,10 +3425,28 @@ paddd m0, m1 %endmacro +%macro PROCESS_SSD_S_16x8_AVX512 0 +movu ym1, [r0] +vinserti32x8 m1,[r0 + r1], 1 +movu ym2, [r0 + 2 * r1] +vinserti32x8 m2,[r0 + r3], 1 +lea r0,[r0 + 4 * r1] +movu ym3, [r0] +vinserti32x8 m3,[r0 + r1], 1 +movu ym4, [r0 + 2 * r1] +vinserti32x8 m4,[r0 + r3], 1 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +pmaddwd m4, m4 +paddd m1, m2 +paddd m3, m4 +paddd m1, m3 +paddd m0, m1 
+%endmacro ;- ; int pixel_ssd_s( int16_t *ref, intptr_t i_stride ) ;- -%if HIGH_BIT_DEPTH==0 INIT_ZMM avx512 cglobal pixel_ssd_s_32, 2,4,5 add r1, r1 @@ -3444,10 +3462,39 @@ PROCESS_SSD_S_32x8_AVX512 ; calculate sum and return +%if BIT_DEPTH >= 10 +movum1, m0 +pxorm2, m2 +punpckldq m0, m2 +punpckhdq m1, m2 +paddq m0, m1 +vextracti32x8 ym2, m0, 1 +paddq ym0, ym2 +vextracti32x4 xm2, m0, 1 +paddq xm2, xm0 +movhlps xm1, xm2 +paddq xm2, xm1 +movqrax, xm2 +%else +HADDD m0, m1 +movdeax, xm0 +%endif +RET + +INIT_ZMM avx512 +cglobal pixel_ssd_s_16, 2,4,5 +add r1, r1 +lea r3, [r1 * 3] +pxorm0, m0 + +PROCESS_SSD_S_16x8_AVX512 +lea r0, [r0 + 4 * r1] +PROCESS_SSD_S_16x8_AVX512 + +; calculate sum and return HADDD m0, m1 movdeax, xm0 RET -%endif ;- ; ssd_s avx512 code end ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 054 of 307] [x265-avx512]x86: AVX512 pixel_sad_32xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501239623 -19800 # Fri Jul 28 16:30:23 2017 +0530 # Node ID e65ac86010af8f7ab1e5b43591330eeb6c818473 # Parent f8687bef93f25b343606e42f4fd252d5f0897d1a [x265-avx512]x86: AVX512 pixel_sad_32xN for high bit depth Size| AVX2 performance | AVX512 performance 32x8| 27.57x | 35.17x 32x16 | 27.96x | 40.74x 32x24 | 31.21x | 45.19x 32x32 | 32.12x | 47.23x 32x64 | 28.79x | 53.35x diff -r f8687bef93f2 -r e65ac86010af source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 12:39:14 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 28 16:30:23 2017 +0530 @@ -2256,6 +2256,12 @@ p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); +p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512); +p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512); +p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); +p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); +p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r f8687bef93f2 -r e65ac86010af source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Tue Aug 01 12:39:14 2017 +0530 +++ b/source/common/x86/sad16-a.asm Fri Jul 28 16:30:23 2017 +0530 @@ -1154,6 +1154,179 @@ INIT_XMM sse2 SAD_12 12, 16 +%macro PROCESS_SAD_32x8_AVX512 0 +movum1, [r2] +movum2, [r2 + r3] +movum3, [r2 + 2 * r3] +movum4, [r2 + r5] +psubw m1, [r0] +psubw m2, [r0 + r1] +psubw m3, [r0 + 2 * r1] +psubw m4, [r0 + r4] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m5, m1, m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movum1, [r2] +movum2, [r2 + r3] +movum3, [r2 + 2 * r3] +movum4, [r2 + r5] +psubw m1, [r0] +psubw m2, [r0 + r1] +psubw m3, [r0 + 2 * r1] +psubw m4, [r0 + r4] +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +pabsw m4, m4 +paddw m1, m2 +paddw m3, m4 +paddw m1, m3 + +pmaddwd m5, m6 +paddd m0, m5 +pmaddwd m1, m6 +paddd m0, m1 +%endmacro + +%macro PROCESS_SAD_AVX512_END 0 
+vextracti32x8 ym1, m0, 1 +paddd ym0, ym1 +vextracti64x2 xm1, m0, 1 +paddd xm0, xm1 +pshufd xm1, xm0, 1110b +paddd xm0, xm1 +pshufd xm1, xm0, 0001b +paddd xm0, xm1 +movd eax, xm0 +%endmacro + + + +;- +; int pixel_sad_32x%1( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_sad_32x8, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +PROCESS_SAD_AVX512_END +RET + + +INIT_ZMM avx512 +cglobal pixel_sad_32x16, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_32x24, 4,6,7 + pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_32x32, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +PROCESS_SAD_32x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_32x64, 4,6,7 + pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +PROCESS_SAD_32x8_AVX512 +lear2, [r2 + 4 * r3] +lea
[x265] [PATCH 059 of 307] [x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501674408 -19800 # Wed Aug 02 17:16:48 2017 +0530 # Node ID 55ed1898de6bd2b8688aa8f1f7b29ae35f674ab4 # Parent 585b35cf6baad20d1cd5fb760d88ad2fbd99e63f [x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth AVX2 performance : 29.55x AVX512 performance : 40.07x diff -r 585b35cf6baa -r 55ed1898de6b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 02 17:16:48 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 02 17:16:48 2017 +0530 @@ -2272,6 +2272,7 @@ p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); +p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512); p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512); p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512); p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); diff -r 585b35cf6baa -r 55ed1898de6b source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Wed Aug 02 17:16:48 2017 +0530 +++ b/source/common/x86/sad16-a.asm Wed Aug 02 17:16:48 2017 +0530 @@ -1986,6 +1986,111 @@ PROCESS_SAD_AVX512_END RET +;- +; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_sad_48x64, 4, 7, 9 +pxorm0, m0 +mov r6d, 64/8 + +vbroadcasti32x8 m8, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] +.loop: +movum1, [r2] +movum2, [r2 + r3] +movu ym3, [r2 + mmsize] +vinserti32x8m3, [r2 + r3 + mmsize], 1 +movum4, [r0] +movum5, [r0 + r1] +movu ym6, [r0 + mmsize] +vinserti32x8m6, [r0 + r1 + mmsize], 1 + +psubw m1, m4 +psubw m2, m5 +psubw m3, m6 +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +paddw m1, m2 +paddw m7, m3, m1 + +movum1, [r2 + 2 * r3] +movum2, [r2 + r5] +movu ym3, [r2 + 2 * r3 + mmsize] +vinserti32x8m3, [r2 + r5 + mmsize], 1 +movum4, [r0 + 2 * r1] +movum5, [r0 + r4] +movu ym6, [r0 + 2 * r1 + mmsize] +vinserti32x8m6, [r0 + r4 + mmsize], 1 +psubw 
m1, m4 +psubw m2, m5 +psubw m3, m6 +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +paddw m1, m2 +paddw m1, m3 + +pmaddwd m7, m8 +paddd m0, m7 +pmaddwd m1, m8 +paddd m0, m1 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movum1, [r2] +movum2, [r2 + r3] +movu ym3, [r2 + mmsize] +vinserti32x8m3, [r2 + r3 + mmsize], 1 +movum4, [r0] +movum5, [r0 + r1] +movu ym6, [r0 + mmsize] +vinserti32x8m6, [r0 + r1 + mmsize], 1 + +psubw m1, m4 +psubw m2, m5 +psubw m3, m6 +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +paddw m1, m2 +paddw m7, m3, m1 + +movum1, [r2 + 2 * r3] +movum2, [r2 + r5] +movu ym3, [r2 + 2 * r3 + mmsize] +vinserti32x8m3, [r2 + r5 + mmsize], 1 +movum4, [r0 + 2 * r1] +movum5, [r0 + r4] +movu ym6, [r0 + 2 * r1 + mmsize] +vinserti32x8m6, [r0 + r4 + mmsize], 1 +psubw m1, m4 +psubw m2, m5 +psubw m3, m6 +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +paddw m1, m2 +paddw m1, m3 + +pmaddwd m7, m8 +paddd m0, m7 +pmaddwd m1, m8 +paddd m0, m1 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +dec r6d +jg .loop + +PROCESS_SAD_AVX512_END +RET + ;= ; SAD x3/x4 ;= ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 051 of 307] x86: AVX512 cleanup addAvg, copy_ps and copy_sp
# HG changeset patch # User Vignesh Vijayakumar # Date 1500980858 -19800 # Tue Jul 25 16:37:38 2017 +0530 # Node ID 984cad60283b474ed756238cf904b08df290e103 # Parent 09159f73f47b7eda15c8d0294774fe6eafdadea7 x86: AVX512 cleanup addAvg, copy_ps and copy_sp diff -r 09159f73f47b -r 984cad60283b source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Jul 25 12:58:16 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Jul 25 16:37:38 2017 +0530 @@ -2162,15 +2162,7 @@ BLOCKCOPY_SP_W64_H4_avx2 64, 64 -%macro BLOCKCOPY_SP_W64_H4_avx512 2 -INIT_ZMM avx512 -cglobal blockcopy_sp_%1x%2, 4, 7, 4, dst, dstStride, src, srcStride -movr4d, %2/4 -addr3, r3 -lear5, [3 * r3] -lear6, [3 * r1] - -.loop: +%macro PROCESS_BLOCKCOPY_SP_64x8_AVX512 0 movu m0, [r2] movu m1, [r2 + 64] movu m2, [r2 + r3] @@ -2187,8 +2179,8 @@ movu m0, [r2 + 2 * r3] movu m1, [r2 + 2 * r3 + 64] -movu m2, [r2 + r5] -movu m3, [r2 + r5 + 64] +movu m2, [r2 + r4] +movu m3, [r2 + r4 + 64] packuswb m0, m1 packuswb m2, m3 @@ -2197,17 +2189,69 @@ vshufi64x2 m0, m0, 11011000b vshufi64x2 m2, m2, 11011000b movu [r0 + 2 * r1], m0 -movu [r0 + r6], m2 +movu [r0 + r5], m2 lear0, [r0 + 4 * r1] lear2, [r2 + 4 * r3] -decr4d -jnz.loop +movu m0, [r2] +movu m1, [r2 + 64] +movu m2, [r2 + r3] +movu m3, [r2 + r3 + 64] + +packuswb m0, m1 +packuswb m2, m3 +vpermq m0, m0, 11011000b +vpermq m2, m2, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m2, m2, 11011000b +movu [r0], m0 +movu [r0 + r1], m2 + +movu m0, [r2 + 2 * r3] +movu m1, [r2 + 2 * r3 + 64] +movu m2, [r2 + r4] +movu m3, [r2 + r4 + 64] + +packuswb m0, m1 +packuswb m2, m3 +vpermq m0, m0, 11011000b +vpermq m2, m2, 11011000b +vshufi64x2 m0, m0, 11011000b +vshufi64x2 m2, m2, 11011000b +movu [r0 + 2 * r1], m0 +movu [r0 + r5], m2 +%endmacro + +INIT_ZMM avx512 +cglobal blockcopy_sp_64x64, 4, 6, 4 +addr3, r3 +lear4, [3 * r3] +lear5, [3 * r1] + +PROCESS_BLOCKCOPY_SP_64x8_AVX512 +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] +PROCESS_BLOCKCOPY_SP_64x8_AVX512 +lear0, [r0 
+ 4 * r1] +lear2, [r2 + 4 * r3] +PROCESS_BLOCKCOPY_SP_64x8_AVX512 +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] +PROCESS_BLOCKCOPY_SP_64x8_AVX512 +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] +PROCESS_BLOCKCOPY_SP_64x8_AVX512 +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] +PROCESS_BLOCKCOPY_SP_64x8_AVX512 +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] +PROCESS_BLOCKCOPY_SP_64x8_AVX512 +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] +PROCESS_BLOCKCOPY_SP_64x8_AVX512 RET -%endmacro - -BLOCKCOPY_SP_W64_H4_avx512 64, 64 ;- ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val) @@ -3184,35 +3228,78 @@ BLOCKCOPY_PS_W32_H4_avx2 32, 32 BLOCKCOPY_PS_W32_H4_avx2 32, 64 -%macro BLOCKCOPY_PS_W32_H4_avx512 2 -INIT_ZMM avx512 -cglobal blockcopy_ps_%1x%2, 4, 7, 4 -add r1, r1 -mov r4d, %2/8 -lea r5, [3 * r3] -lea r6, [3 * r1] -.loop: -%rep 2 +%macro PROCESS_BLOCKCOPY_PS_32x8_AVX512 0 pmovzxbw m0, [r2] pmovzxbw m1, [r2 + r3] pmovzxbw m2, [r2 + r3 * 2] -pmovzxbw m3, [r2 + r5] +pmovzxbw m3, [r2 + r4] movu [r0], m0 movu [r0 + r1], m1 movu [r0 + r1 * 2], m2 -movu [r0 + r6], m3 +movu [r0 + r5], m3 lea r0, [r0 + 4 * r1] lea r2, [r2 + 4 * r3] -%endrep -dec r4d -jnz .loop + +pmovzxbw m0
[x265] [PATCH 048 of 307] x86: AVX512 convert_p2s_32xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1500988851 -19800 # Tue Jul 25 18:50:51 2017 +0530 # Node ID b4c2149e9bb1119857363094492b50e85593fb74 # Parent d05b920865e7c9e8cc9441e77df888b48acb50d1 x86: AVX512 convert_p2s_32xN for high bit depth Size | AVX2 performance | AVX512 performance -- 32x8 | 7.85x| 7.95x 32x16 | 9.54x| 15.32x 32x24 | 10.02x| 17.01x 32x32 | 10.97x| 18.22x 32x64 | 9.82x| 19.59x diff -r d05b920865e7 -r b4c2149e9bb1 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 25 18:28:43 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 25 18:50:51 2017 +0530 @@ -2234,6 +2234,20 @@ p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512); p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512); p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512); +p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2); +p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512); +p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512); +p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512); +p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r d05b920865e7 -r b4c2149e9bb1 
source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Jul 25 18:28:43 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Jul 25 18:50:51 2017 +0530 @@ -377,6 +377,45 @@ movu [r2 + r4 + mmsize], m3 %endmacro +%macro P2S_32x8_AVX512 0 +movu m0, [r0] +movu m1, [r0 + r1] +movu m2, [r0 + r1 * 2] +movu m3, [r0 + r5] +psllw m0, (14 - BIT_DEPTH) +psllw m1, (14 - BIT_DEPTH) +psllw m2, (14 - BIT_DEPTH) +psllw m3, (14 - BIT_DEPTH) +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu [r2], m0 +movu [r2 + r3], m1 +movu [r2 + r3 * 2], m2 +movu [r2 + r4], m3 + +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] + +movu m0, [r0] +movu m1, [r0 + r1] +movu m2, [r0 + r1 * 2] +movu m3, [r0 + r5] +psllw m0, (14 - BIT_DEPTH) +psllw m1, (14 - BIT_DEPTH) +psllw m2, (14 - BIT_DEPTH) +psllw m3, (14 - BIT_DEPTH) +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu [r2], m0 +movu [r2 + r3], m1 +movu [r2 + r3 * 2], m2 +movu [r2 + r4], m3 +%endmacro + ;- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride) ;- @@ -475,6 +514,132 @@ lear2, [r2 + r3 * 4] P2S_64x8_AVX512 RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x8, 4, 6, 5 +addr1d, r1d +addr3d, r3d +lear4, [r3 * 3] +lear5, [r1 * 3] + +; load constant +vbroadcasti32x8m4, [pw_2000] +P2S_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x16, 4, 6, 5 +addr1d, r1d +addr3d, r3d +lear4, [r3 * 3] +lear5, [r1 * 3] + +; load constant +vbroadcasti32x8m4, [pw_2000] +P2S_32x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x24, 4, 6, 5 +addr1d, r1d +addr3d, r3d +lear4, [r3 * 3] +lear5, [r1 * 3] + +; load constant +vbroadcasti32x8m4, [pw_2000] +P2S_32x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_32x8_AVX512 +lear0, [r0 + r1 * 4] +lear2, [r2 + r3 * 4] +P2S_32x8_AVX512 +
[x265] [PATCH 058 of 307] [x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501674408 -19800 # Wed Aug 02 17:16:48 2017 +0530 # Node ID 585b35cf6baad20d1cd5fb760d88ad2fbd99e63f # Parent b355ac2912dd111b96dbb5893b34405863e7382f [x265-avx512]x86: AVX512 pixel_sad_48x64 for high bit depth AVX2 performance : 29.55x AVX512 performance : 40.07x diff -r b355ac2912dd -r 585b35cf6baa source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 02 11:28:32 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 02 17:16:48 2017 +0530 @@ -2261,6 +2261,7 @@ p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); +p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512); p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512); p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512); p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); diff -r b355ac2912dd -r 585b35cf6baa source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Wed Aug 02 11:28:32 2017 +0530 +++ b/source/common/x86/sad16-a.asm Wed Aug 02 17:16:48 2017 +0530 @@ -1881,6 +1881,111 @@ PROCESS_SAD_AVX512_END RET +;- +; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_sad_48x64, 4, 7, 9 +pxorm0, m0 +mov r6d, 64/8 + +vbroadcasti32x8 m8, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] +.loop: +movum1, [r2] +movum2, [r2 + r3] +movu ym3, [r2 + mmsize] +vinserti32x8m3, [r2 + r3 + mmsize], 1 +movum4, [r0] +movum5, [r0 + r1] +movu ym6, [r0 + mmsize] +vinserti32x8m6, [r0 + r1 + mmsize], 1 + +psubw m1, m4 +psubw m2, m5 +psubw m3, m6 +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +paddw m1, m2 +paddw m7, m3, m1 + +movum1, [r2 + 2 * r3] +movum2, [r2 + r5] +movu ym3, [r2 + 2 * r3 + mmsize] +vinserti32x8m3, [r2 + r5 + mmsize], 1 +movum4, [r0 + 2 * r1] +movum5, [r0 + r4] +movu ym6, [r0 + 2 * r1 + mmsize] +vinserti32x8m6, [r0 + r4 + mmsize], 1 +psubw 
m1, m4 +psubw m2, m5 +psubw m3, m6 +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +paddw m1, m2 +paddw m1, m3 + +pmaddwd m7, m8 +paddd m0, m7 +pmaddwd m1, m8 +paddd m0, m1 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movum1, [r2] +movum2, [r2 + r3] +movu ym3, [r2 + mmsize] +vinserti32x8m3, [r2 + r3 + mmsize], 1 +movum4, [r0] +movum5, [r0 + r1] +movu ym6, [r0 + mmsize] +vinserti32x8m6, [r0 + r1 + mmsize], 1 + +psubw m1, m4 +psubw m2, m5 +psubw m3, m6 +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +paddw m1, m2 +paddw m7, m3, m1 + +movum1, [r2 + 2 * r3] +movum2, [r2 + r5] +movu ym3, [r2 + 2 * r3 + mmsize] +vinserti32x8m3, [r2 + r5 + mmsize], 1 +movum4, [r0 + 2 * r1] +movum5, [r0 + r4] +movu ym6, [r0 + 2 * r1 + mmsize] +vinserti32x8m6, [r0 + r4 + mmsize], 1 +psubw m1, m4 +psubw m2, m5 +psubw m3, m6 +pabsw m1, m1 +pabsw m2, m2 +pabsw m3, m3 +paddw m1, m2 +paddw m1, m3 + +pmaddwd m7, m8 +paddd m0, m7 +pmaddwd m1, m8 +paddd m0, m1 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +dec r6d +jg .loop + +PROCESS_SAD_AVX512_END +RET + ;= ; SAD x3/x4 ;= ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 061 of 307] x86: AVX512 addAvg_W64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501588310 -19800 # Tue Aug 01 17:21:50 2017 +0530 # Node ID 465b4925d622ba66e2536c9f79eaaffcdd26d5fc # Parent 73ee464e136910a95d7b3070a1c736dedeaa6278 x86: AVX512 addAvg_W64 for high bit depth Size | AVX2 performance | AVX512 performance -- 64x16 | 11.13x | 18.48x 64x32 | 11.04x | 17.75x 64x48 | 10.97x | 17.85x 64x64 | 10.93x | 17.37x diff -r 73ee464e1369 -r 465b4925d622 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 16:45:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 17:21:50 2017 +0530 @@ -2278,6 +2278,10 @@ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); +p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512); +p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512); +p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); +p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx512); p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx512); p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512); diff -r 73ee464e1369 -r 465b4925d622 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmTue Aug 01 16:45:51 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 01 17:21:50 2017 +0530 @@ -1738,6 +1738,80 @@ movu[r2 + r8], m0 %endmacro +%macro PROCESS_ADDAVG_64x4_HBD_AVX512 0 +movum0, [r0] +movum1, [r1] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2],m0 + +movum0, [r0 + mmsize] +movum1, [r1 + mmsize] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + mmsize], m0 + +movum0, [r0 + r3] +movum1, [r1 + r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r5], m0 + +movum0, [r0 + r3 + mmsize] +movum1, [r1 + r4 + mmsize] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r5 + mmsize], m0 + +movum0, [r0 + 2 * r3] +movum1, [r1 + 2 * r4] +paddw 
m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + 2 * r5], m0 + +movum0, [r0 + 2 * r3 + mmsize] +movum1, [r1 + 2 * r4 + mmsize] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + 2 * r5 + mmsize], m0 + +movum0, [r0 + r6] +movum1, [r1 + r7] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r8], m0 + +movum0, [r0 + r6 + mmsize] +movum1, [r1 + r7 + mmsize] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r8 + mmsize], m0 +%endmacro + ;- ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) ;- @@ -1771,6 +1845,35 @@ ADDAVG_W32_HBD_AVX512 32 ADDAVG_W32_HBD_AVX512 48 ADDAVG_W32_HBD_AVX512 64 + +%macro ADDAVG_W64_HBD_AVX512 1 +INIT_ZMM avx512 +cglobal addAvg_64x%1, 6,9,6 +vbroadcasti32x8m4, [pw_ %+ ADDAVG_ROUND] +vbroadcasti32x8m5, [pw_pixel_max] +vbroadcasti32x8m3, [pw_ %+ ADDAVG_FACTOR] +pxorm2, m2 +add r3, r3 +add r4,
[x265] [PATCH 063 of 307] x86: AVX512 pixel_avg_weight_W64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501593145 -19800 # Tue Aug 01 18:42:25 2017 +0530 # Node ID fabc3475654f222b469c57b6cf8fd41b334d71be # Parent ef7fd93923fa24a8f77a557817b03078356443e7 x86: AVX512 pixel_avg_weight_W64 for high bit depth Size | AVX2 performance | AVX512 performance -- 64x16 | 11.78x| 20.54x 64x32 | 12.08x| 23.01x 64x48 | 12.26x| 22.62x 64x64 | 12.35x| 22.67x diff -r ef7fd93923fa -r fabc3475654f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 18:27:37 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 18:42:25 2017 +0530 @@ -2301,6 +2301,10 @@ p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx512); p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx512); p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx512); +p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512); +p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512); +p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512); +p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512); } } diff -r ef7fd93923fa -r fabc3475654f source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmTue Aug 01 18:27:37 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 01 18:42:25 2017 +0530 @@ -5676,6 +5676,84 @@ movu[r0 + r8], m2 %endmacro +%macro PROCESS_PIXELAVG_64x8_HBD_AVX512 0 +movum0, [r2] +movum1, [r4] +movum2, [r2 + r3] +movum3, [r4 + r5] +pavgw m0, m1 +pavgw m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + +movum0, [r2 + mmsize] +movum1, [r4 + mmsize] +movum2, [r2 + r3 + mmsize] +movum3, [r4 + r5 + mmsize] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + mmsize], m0 +movu[r0 + r1 + mmsize], m2 + +movum0, [r2 + r3 * 2] +movum1, [r4 + r5 * 2] +movum2, [r2 + r6] +movum3, [r4 + r7] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + r1 * 2], m0 +movu[r0 + r8], m2 + +movum0, [r2 + r3 * 2 + mmsize] +movum1, [r4 + r5 * 2 + mmsize] +movum2, [r2 + r6 + mmsize] +movum3, [r4 + r7 + mmsize] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + r1 * 2 + 
mmsize], m0 +movu[r0 + r8 + mmsize], m2 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] + +movum0, [r2] +movum1, [r4] +movum2, [r2 + r3] +movum3, [r4 + r5] +pavgw m0, m1 +pavgw m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + +movum0, [r2 + mmsize] +movum1, [r4 + mmsize] +movum2, [r2 + r3 + mmsize] +movum3, [r4 + r5 + mmsize] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + mmsize], m0 +movu[r0 + r1 + mmsize], m2 + +movum0, [r2 + r3 * 2] +movum1, [r4 + r5 * 2] +movum2, [r2 + r6] +movum3, [r4 + r7] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + r1 * 2], m0 +movu[r0 + r8], m2 + +movum0, [r2 + r3 * 2 + mmsize] +movum1, [r4 + r5 * 2 + mmsize] +movum2, [r2 + r6 + mmsize] +movum3, [r4 + r7 + mmsize] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + r1 * 2 + mmsize], m0 +movu[r0 + r8 + mmsize], m2 +%endmacro + %macro PIXEL_AVG_HBD_W32 1 INIT_ZMM avx512 cglobal pixel_avg_32x%1, 6,9,4 @@ -5701,6 +5779,31 @@ PIXEL_AVG_HBD_W32 24 PIXEL_AVG_HBD_W32 32 PIXEL_AVG_HBD_W32 64 + +%macro PIXEL_AVG_HBD_W64 1 +INIT_ZMM avx512 +cglobal pixel_avg_64x%1, 6,9,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] + +%rep %1/8 - 1 +PROCESS_PIXELAVG_64x8_HBD_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] +%endrep +PROCESS_PIXELAVG_64x8_HBD_AVX512 +RET +%endmacro + +PIXEL_AVG_HBD_W64 16 +PIXEL_AVG_HBD_W64 32 +PIXEL_AVG_HBD_W64 48 +PIXEL_AVG_HBD_W64 64 ;- ;pixel_avg_pp avx512 high bit depth code end ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 060 of 307] x86: AVX512 addAvg_W32 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501586151 -19800 # Tue Aug 01 16:45:51 2017 +0530 # Node ID 73ee464e136910a95d7b3070a1c736dedeaa6278 # Parent 55ed1898de6bd2b8688aa8f1f7b29ae35f674ab4 x86: AVX512 addAvg_W32 for high bit depth Size | AVX2 performance | AVX512 performance -- 32x8 | 9.83x| 18.11x 32x16 | 9.65x| 17.72x 32x24 | 9.50x| 18.41x 32x32 | 9.28x| 19.29x 32x64 | 9.23x| 18.71x diff -r 55ed1898de6b -r 73ee464e1369 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 02 17:16:48 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 16:45:51 2017 +0530 @@ -2278,6 +2278,20 @@ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); +p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx512); +p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512); +p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512); +p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r 55ed1898de6b -r 73ee464e1369 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmWed Aug 02 17:16:48 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 01 16:45:51 2017 +0530 @@ -1656,6 +1656,124 @@ ADDAVG_W64_H1_AVX2 32 ADDAVG_W64_H1_AVX2 48 ADDAVG_W64_H1_AVX2 64 + +;- +;addAvg avx512 
high bit depth code start +;- +%macro PROCESS_ADDAVG_32x8_HBD_AVX512 0 +movum0, [r0] +movum1, [r1] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2],m0 + +movum0, [r0 + r3] +movum1, [r1 + r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r5], m0 + +movum0, [r0 + 2 * r3] +movum1, [r1 + 2 * r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + 2 * r5], m0 + +movum0, [r0 + r6] +movum1, [r1 + r7] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r8], m0 + +lea r2, [r2 + 4 * r5] +lea r0, [r0 + 4 * r3] +lea r1, [r1 + 4 * r4] + +movum0, [r0] +movum1, [r1] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2],m0 + +movum0, [r0 + r3] +movum1, [r1 + r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r5], m0 + +movum0, [r0 + 2 * r3] +movum1, [r1 + 2 * r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + 2 * r5], m0 + +movum0, [r0 + r6] +movum1, [r1 + r7] +paddw m0, m1 +pmulhrsw
[x265] [PATCH 062 of 307] x86: AVX512 pixel_avg_weight_W32 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501592257 -19800 # Tue Aug 01 18:27:37 2017 +0530 # Node ID ef7fd93923fa24a8f77a557817b03078356443e7 # Parent 465b4925d622ba66e2536c9f79eaaffcdd26d5fc x86: AVX512 pixel_avg_weight_W32 for high bit depth Size | AVX2 performance | AVX512 performance -- 32x8 | 11.23x| 15.70x 32x16 | 10.88x| 19.51x 32x24 | 10.90x| 20.04x 32x32 | 11.78x| 20.37x 32x64 | 11.38x| 20.30x diff -r 465b4925d622 -r ef7fd93923fa source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 17:21:50 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 18:27:37 2017 +0530 @@ -2296,6 +2296,12 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx512); +p.pu[LUMA_32x8].pixelavg_pp = PFX(pixel_avg_32x8_avx512); +p.pu[LUMA_32x16].pixelavg_pp = PFX(pixel_avg_32x16_avx512); +p.pu[LUMA_32x24].pixelavg_pp = PFX(pixel_avg_32x24_avx512); +p.pu[LUMA_32x32].pixelavg_pp = PFX(pixel_avg_32x32_avx512); +p.pu[LUMA_32x64].pixelavg_pp = PFX(pixel_avg_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r 465b4925d622 -r ef7fd93923fa source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmTue Aug 01 17:21:50 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 01 18:27:37 2017 +0530 @@ -5631,6 +5631,79 @@ RET %endif +;- +;pixel_avg_pp avx512 high bit depth code start +;- +%macro PROCESS_PIXELAVG_32x8_HBD_AVX512 0 +movum0, [r2] +movum1, [r4] +movum2, [r2 + r3] +movum3, [r4 + r5] +pavgw m0, m1 +pavgw m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + +movum0, [r2 + r3 * 2] +movum1, [r4 + r5 * 2] +movum2, [r2 + r6] +movum3, [r4 + r7] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + r1 * 2], m0 +movu[r0 + r8], m2 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] + +movum0, [r2] +movum1, [r4] +movum2, [r2 + r3] +movum3, [r4 + r5] +pavgw m0, m1 +pavgw m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + +movum0, [r2 + r3 * 2] 
+movum1, [r4 + r5 * 2] +movum2, [r2 + r6] +movum3, [r4 + r7] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + r1 * 2], m0 +movu[r0 + r8], m2 +%endmacro + +%macro PIXEL_AVG_HBD_W32 1 +INIT_ZMM avx512 +cglobal pixel_avg_32x%1, 6,9,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] + +%rep %1/8 - 1 +PROCESS_PIXELAVG_32x8_HBD_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] +%endrep +PROCESS_PIXELAVG_32x8_HBD_AVX512 +RET +%endmacro + +PIXEL_AVG_HBD_W32 8 +PIXEL_AVG_HBD_W32 16 +PIXEL_AVG_HBD_W32 24 +PIXEL_AVG_HBD_W32 32 +PIXEL_AVG_HBD_W32 64 +;- +;pixel_avg_pp avx512 high bit depth code end +;- %endif ; HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 065 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_32xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501765251 -19800 # Thu Aug 03 18:30:51 2017 +0530 # Node ID df45017fca906d5f3370dcc78e43284622753a73 # Parent 200e6c43adc0c77e588a44d734e7d340e4753ccd [x265-avx512]x86: AVX512 pixel_sad_x3_32xN for high bit depth Size| AVX2 performance | AVX512 performance 32x8| 20.72x | 29.20x 32x16 | 19.31x | 30.53x 32x24 | 19.78x | 33.32x 32x32 | 20.02x | 32.71x 32x64 | 20.40x | 33.30x diff -r 200e6c43adc0 -r df45017fca90 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 18:52:23 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 03 18:30:51 2017 +0530 @@ -2307,6 +2307,12 @@ p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512); p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx512); +p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512); +p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512); +p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); +p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); +p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r 200e6c43adc0 -r df45017fca90 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Tue Aug 01 18:52:23 2017 +0530 +++ b/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530 @@ -2497,3 +2497,362 @@ SAD_X 4, 64, 48 SAD_X 4, 64, 64 +; +; SAD x3/x4 avx512 code start +; + +%macro PROCESS_SAD_X3_32x4_AVX512 0 +movum6, [r0] +movum3, [r1] +movum4, [r2] +movum5, [r3] + + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 2 * FENC_STRIDE] +movum3, [r1 + r4] +movum4, [r2 + r4] +movum5, [r3 + r4] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 4 * FENC_STRIDE] 
+movum3, [r1 + 2 * r4] +movum4, [r2 + 2 * r4] +movum5, [r3 + 2 * r4] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 6 * FENC_STRIDE] +movum3, [r1 + r6] +movum4, [r2 + r6] +movum5, [r3 + r6] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 +%endmacro + + +%macro PROCESS_SAD_X3_END_AVX512 0 +vextracti32x8 ym3, m0, 1 +vextracti32x8 ym4, m1, 1 +vextracti32x8 ym5, m2, 1 + +paddd ym0, ym3 +paddd ym1, ym4 +paddd ym2, ym5 + +vextracti64x2 xm3, m0, 1 +vextracti64x2 xm4, m1, 1 +vextracti64x2 xm5, m2, 1 + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +pshufd xm3, xm0, 1110b +pshufd xm4, xm1, 1110b +pshufd xm5, xm2, 1110b + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +pshufd xm3, xm0, 0001b +pshufd xm4, xm1, 0001b +pshufd xm5, xm2, 0001b + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +movd [r5 + 0], xm0 +movd [r5 + 4], xm1 +movd [r5 + 8], xm2 +%endmacro + + +;-- +; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res ) +;-- + +INIT_ZMM avx512 +cglobal pixel_sad_x3_32x8, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 + +vbroadcasti32x8 m7, [pw_1] + +add r4d, r4d +lea r6d, [r4 * 3] + +PROCESS_SAD_X3_32x4_AVX512 +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] +
[x265] [PATCH 064 of 307] x86: AVX512 pixel_avg_weight_48x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501593743 -19800 # Tue Aug 01 18:52:23 2017 +0530 # Node ID 200e6c43adc0c77e588a44d734e7d340e4753ccd # Parent fabc3475654f222b469c57b6cf8fd41b334d71be x86: AVX512 pixel_avg_weight_48x64 for high bit depth AVX2 performance: 11.84x AVX512 performance: 17.79x diff -r fabc3475654f -r 200e6c43adc0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 18:42:25 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 18:52:23 2017 +0530 @@ -2305,6 +2305,7 @@ p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512); p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512); p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512); +p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx512); } } diff -r fabc3475654f -r 200e6c43adc0 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmTue Aug 01 18:42:25 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 01 18:52:23 2017 +0530 @@ -5754,6 +5754,84 @@ movu[r0 + r8 + mmsize], m2 %endmacro +%macro PROCESS_PIXELAVG_48x8_HBD_AVX512 0 +movum0, [r2] +movum1, [r4] +movum2, [r2 + r3] +movum3, [r4 + r5] +pavgw m0, m1 +pavgw m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + +movuym0, [r2 + mmsize] +movuym1, [r4 + mmsize] +movuym2, [r2 + r3 + mmsize] +movuym3, [r4 + r5 + mmsize] +pavgw ym0, ym1 +pavgw ym2, ym3 +movu[r0 + mmsize], ym0 +movu[r0 + r1 + mmsize], ym2 + +movum0, [r2 + r3 * 2] +movum1, [r4 + r5 * 2] +movum2, [r2 + r6] +movum3, [r4 + r7] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + r1 * 2], m0 +movu[r0 + r8], m2 + +movuym0, [r2 + r3 * 2 + mmsize] +movuym1, [r4 + r5 * 2 + mmsize] +movuym2, [r2 + r6 + mmsize] +movuym3, [r4 + r7 + mmsize] +pavgw ym0, ym1 +pavgw ym2, ym3 +movu[r0 + r1 * 2 + mmsize], ym0 +movu[r0 + r8 + mmsize], ym2 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] + +movum0, [r2] +movum1, [r4] +movum2, [r2 + r3] +movum3, [r4 + r5] +pavgw m0, m1 +pavgw m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + 
+movuym0, [r2 + mmsize] +movuym1, [r4 + mmsize] +movuym2, [r2 + r3 + mmsize] +movuym3, [r4 + r5 + mmsize] +pavgw ym0, ym1 +pavgw ym2, ym3 +movu[r0 + mmsize], ym0 +movu[r0 + r1 + mmsize], ym2 + +movum0, [r2 + r3 * 2] +movum1, [r4 + r5 * 2] +movum2, [r2 + r6] +movum3, [r4 + r7] +pavgw m0, m1 +pavgw m2, m3 +movu[r0 + r1 * 2], m0 +movu[r0 + r8], m2 + +movuym0, [r2 + r3 * 2 + mmsize] +movuym1, [r4 + r5 * 2 + mmsize] +movuym2, [r2 + r6 + mmsize] +movuym3, [r4 + r7 + mmsize] +pavgw ym0, ym1 +pavgw ym2, ym3 +movu[r0 + r1 * 2 + mmsize], ym0 +movu[r0 + r8 + mmsize], ym2 +%endmacro + %macro PIXEL_AVG_HBD_W32 1 INIT_ZMM avx512 cglobal pixel_avg_32x%1, 6,9,4 @@ -5804,6 +5882,24 @@ PIXEL_AVG_HBD_W64 32 PIXEL_AVG_HBD_W64 48 PIXEL_AVG_HBD_W64 64 + +INIT_ZMM avx512 +cglobal pixel_avg_48x64, 6,9,4 +add r1d, r1d +add r3d, r3d +add r5d, r5d +lea r6, [r3 * 3] +lea r7, [r5 * 3] +lea r8, [r1 * 3] + +%rep 7 +PROCESS_PIXELAVG_48x8_HBD_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +lea r4, [r4 + 4 * r5] +%endrep +PROCESS_PIXELAVG_48x8_HBD_AVX512 +RET ;- ;pixel_avg_pp avx512 high bit depth code end ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 070 of 307] [x265-avx512]x86: clean up line endings issue in sad16-a.asm and asm-primitives.cpp files
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1502103618 -19800 # Mon Aug 07 16:30:18 2017 +0530 # Node ID ad756cf6d35f0d1460c5a079bea8781ffd67b7c7 # Parent 039ed71e123c3e14bfaabbe3aada944157784b36 [x265-avx512]x86: clean up line endings issue in sad16-a.asm and asm-primitives.cpp files. diff -r 039ed71e123c -r ad756cf6d35f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 04 16:20:38 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 07 16:30:18 2017 +0530 @@ -2267,17 +2267,6 @@ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); -p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512); -p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512); -p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); -p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); -p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); -p.pu[LUMA_48x64].sad = PFX(pixel_sad_48x64_avx512); -p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512); -p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512); -p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); -p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); - p.pu[LUMA_64x16].addAvg = PFX(addAvg_64x16_avx512); p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512); p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); @@ -2313,18 +2302,6 @@ p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); -p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512); -p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512); -p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); -p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); -p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); - -p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); -p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); -p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); -p.pu[LUMA_32x32].sad_x4 = 
PFX(pixel_sad_x4_32x32_avx512); -p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); - p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); diff -r 039ed71e123c -r ad756cf6d35f source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Fri Aug 04 16:20:38 2017 +0530 +++ b/source/common/x86/sad16-a.asm Mon Aug 07 16:30:18 2017 +0530 @@ -1235,85 +1235,6 @@ %endmacro -%macro PROCESS_SAD_64x8_AVX512 0 -movum1, [r2] -movum2, [r2 + mmsize] -movum3, [r2 + r3] -movum4, [r2 + r3 + mmsize] -psubw m1, [r0] -psubw m2, [r0 + mmsize] -psubw m3, [r0 + r1] -psubw m4, [r0 + r1 + mmsize] -pabsw m1, m1 -pabsw m2, m2 -pabsw m3, m3 -pabsw m4, m4 -paddw m1, m2 -paddw m3, m4 -paddw m5, m1, m3 - -movum1, [r2 + 2 * r3] -movum2, [r2 + 2 * r3 + mmsize] -movum3, [r2 + r5] -movum4, [r2 + r5 + mmsize] -psubw m1, [r0 + 2 * r1] -psubw m2, [r0 + 2 * r1 + mmsize] -psubw m3, [r0 + r4] -psubw m4, [r0 + r4 + mmsize] -pabsw m1, m1 -pabsw m2, m2 -pabsw m3, m3 -pabsw m4, m4 -paddw m1, m2 -paddw m3, m4 -paddw m1, m3 - -lea r0, [r0 + 4 * r1] -lea r2, [r2 + 4 * r3] - -pmaddwd m5, m6 -paddd m0, m5 -pmaddwd m1, m6 -paddd m0, m1 - -movum1, [r2] -movum2, [r2 + mmsize] -movum3, [r2 + r3] -movum4, [r2 + r3 + mmsize] -psubw m1, [r0] -psubw m2, [r0 + mmsize] -psubw m3, [r0 + r1] -psubw m4, [r0 + r1 + mmsize] -pabsw m1, m1 -pabsw m2, m2 -pabsw m3, m3 -pabsw m4, m4 -paddw m1, m2 -paddw m3, m4 -paddw m5, m1, m3 - -movum1, [r2 + 2 * r3] -movum2, [r2 + 2 * r3 + mmsize] -movum3, [r2 + r5] -movum4, [r2 + r5 + mmsize] -psubw m1, [r0 + 2 * r1] -psubw m2, [r0 + 2 * r1 + mmsize] -psubw m3, [r0 + r4] -psubw m4, [r0 + r4 + mmsize] -pabsw m1, m1 -pabsw m2, m2 -pabsw m3, m3 -pabsw m4, m4 -paddw m1, m2 -paddw m3, m4 -paddw m1, m3 - -pmaddwd m5, m6 -paddd m0, m5 -pmaddwd m1, m6 -paddd m0, m1 -%endmacro - %macro PROCESS_SAD_32x8_AVX512 0 movum1, [r2] movum2, [r2 + r3] @@ -1368,61 +1289,6 @@ movd eax, 
xm0 %endmacro -%macro PROCESS_SAD_32x8_AVX512 0 -movum1, [r2] -movum2, [r2 + r3] -movum3, [r2 + 2 * r3] -movum4, [r2 + r5] -psubw m1, [r0] -psubw m2, [r0 + r1] -psubw m3, [r0 + 2 * r1] -psubw m4, [r0
[x265] [PATCH 073 of 307] x86: AVX512 cpy1Dto2D_shl_32
# HG changeset patch # User Vignesh Vijayakumar # Date 1502186111 -19800 # Tue Aug 08 15:25:11 2017 +0530 # Node ID 7d7f2a4e771c7c2b573db9bc298d1a35bb72f32d # Parent ce93c1b1894ae7d789e451f65479f018ba90ec76 x86: AVX512 cpy1Dto2D_shl_32 Size | BitDepth | AVX2 performance | AVX512 performance --- 32x32|8 | 16.03x | 28.94x 32x32|10| 14.12x | 24.99x diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 02 14:11:31 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 15:25:11 2017 +0530 @@ -2311,6 +2311,8 @@ p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); +p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); + } } #else // if HIGH_BIT_DEPTH @@ -3992,6 +3994,7 @@ p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); +p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); } #endif diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Wed Aug 02 14:11:31 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Aug 08 15:25:11 2017 +0530 @@ -5513,7 +5513,62 @@ jnz.loop RET - +;-- +; cpy_1Dto2D_shl avx512 code start +;-- +%macro PROCESS_CPY1Dto2D_SHL_32x8_AVX512 0 +movum1,[r1 + 0 * mmsize] +movum2,[r1 + 1 * mmsize] +movum3,[r1 + 2 * mmsize] +movum4,[r1 + 3 * mmsize] +psllw m1,xm0 +psllw m2,xm0 +psllw m3,xm0 +psllw m4,xm0 +movu[r0], m1 +movu[r0 + r2], m2 +movu[r0 + 2 * r2], m3 +movu[r0 + r3], m4 + +add r1,4 * mmsize +lea r0,[r0 + r2 * 4] + +movum1,[r1 + 0 * mmsize] +movum2,[r1 + 1 * mmsize] +movum3,[r1 + 2 * mmsize] +movum4,[r1 + 3 * mmsize] +psllw m1,xm0 +psllw m2,xm0 +psllw m3,xm0 +psllw m4,xm0 +movu[r0], m1 +movu[r0 + r2], m2 +movu[r0 + 2 * r2], m3 +movu[r0 + r3], m4 +%endmacro +;-- +; void 
cpy1Dto2D_shl(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) +;-- +INIT_ZMM avx512 +cglobal cpy1Dto2D_shl_32, 4, 4, 5 +add r2d, r2d +movdxm0, r3d +lea r3, [3 * r2] + +PROCESS_CPY1Dto2D_SHL_32x8_AVX512 +add r1, 4 * mmsize +lea r0, [r0 + r2 * 4] +PROCESS_CPY1Dto2D_SHL_32x8_AVX512 +add r1, 4 * mmsize +lea r0, [r0 + r2 * 4] +PROCESS_CPY1Dto2D_SHL_32x8_AVX512 +add r1, 4 * mmsize +lea r0, [r0 + r2 * 4] +PROCESS_CPY1Dto2D_SHL_32x8_AVX512 +RET +;-- +; copy_cnt avx512 code end +;-- ;-- ; uint32_t copy_cnt(int16_t* dst, const int16_t* src, intptr_t srcStride); ;-- diff -r ce93c1b1894a -r 7d7f2a4e771c source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.hWed Aug 02 14:11:31 2017 +0530 +++ b/source/common/x86/blockcopy8.hTue Aug 08 15:25:11 2017 +0530 @@ -37,6 +37,7 @@ FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shl, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +FUNCDEF_TU_S(void, cpy1Dto2D_shl, avx512, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse2, int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); FUNCDEF_TU_S(void, cpy1Dto2D_shr, sse4, int16_t* dst, const int16_t* src, intptr_t srcStride, int
[x265] [PATCH 075 of 307] x86: revoke some changes in ipfilter8
# HG changeset patch # User Mythreyi P # Date 1522885678 25200 # Wed Apr 04 16:47:58 2018 -0700 # Node ID 7bdf20f62d02f5714c1332695ffa8c7c6a9d8a5a # Parent 563b3c4f91eb20374311ed18fb18ad12aeebaf26 x86: revoke some changes in ipfilter8 diff -r 563b3c4f91eb -r 7bdf20f62d02 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Jul 28 11:43:23 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Apr 04 16:47:58 2018 -0700 @@ -43,6 +43,15 @@ const pd_526336, times 8 dd 8192*64+2048 +const tab_ChromaCoeff, db 0, 64, 0, 0 + db -2, 58, 10, -2 + db -4, 54, 16, -2 + db -6, 46, 28, -4 + db -4, 36, 36, -4 + db -4, 28, 46, -6 + db -2, 16, 54, -4 + db -2, 10, 58, -2 + const tab_LumaCoeff, db 0, 0, 0, 64, 0, 0, 0, 0 db -1, 4, -10, 58, 17, -5, 1, 0 db -1, 4, -11, 40, 40, -11, 4, -1 diff -r 563b3c4f91eb -r 7bdf20f62d02 source/common/x86/v4-ipfilter8.asm --- a/source/common/x86/v4-ipfilter8.asmFri Jul 28 11:43:23 2017 +0530 +++ b/source/common/x86/v4-ipfilter8.asmWed Apr 04 16:47:58 2018 -0700 @@ -43,7 +43,7 @@ const v4_interp4_vpp_shuf1, dd 0, 1, 1, 2, 2, 3, 3, 4 dd 2, 3, 3, 4, 4, 5, 5, 6 -const tab_ChromaCoeff, db 0, 64, 0, 0 +const v4_tab_ChromaCoeff, db 0, 64, 0, 0 db -2, 58, 10, -2 db -4, 54, 16, -2 db -6, 46, 28, -4 @@ -1031,8 +1031,8 @@ movam6,[r5 + r4] movam5,[r5 + r4 + 16] %else -movam6,[tab_ChromaCoeff + r4] -movam5,[tab_ChromaCoeff + r4 + 16] +movam6,[v4_tab_ChromaCoeff + r4] +movam5,[v4_tab_ChromaCoeff + r4 + 16] %endif %ifidn %1,pp @@ -2114,10 +2114,10 @@ sub r0,r1 %ifdef PIC -lea r5,[tab_ChromaCoeff] +lea r5,[v4_tab_ChromaCoeff] movdm0,[r5 + r4 * 4] %else -movdm0,[tab_ChromaCoeff + r4 * 4] +movdm0,[v4_tab_ChromaCoeff + r4 * 4] %endif lea r4,[r1 * 3] lea r5,[r0 + 4 * r1] @@ -2430,10 +2430,10 @@ sub r0,r1 %ifdef PIC -lea r5,[tab_ChromaCoeff] +lea r5,[v4_tab_ChromaCoeff] movdm0,[r5 + r4 * 4] %else -movdm0,[tab_ChromaCoeff + r4 * 4] +movdm0,[v4_tab_ChromaCoeff + r4 * 4] %endif pshufb m0,[tab_Cm] @@ -2515,10 +2515,10 @@ sub r0,r1 %ifdef PIC -lea 
r5,[tab_ChromaCoeff] +lea r5,[v4_tab_ChromaCoeff] movdm0,[r5 + r4 * 4] %else -movdm0,[tab_ChromaCoeff + r4 * 4] +movdm0,[v4_tab_ChromaCoeff + r4 * 4] %endif pshufb m0,[tab_Cm] @@ -2611,10 +2611,10 @@ sub r0,r1 %ifdef PIC -lea r5,[tab_ChromaCoeff] +lea r5,[v4_tab_ChromaCoeff] movdm0,[r5 + r4 * 4] %else -movdm0,[tab_ChromaCoeff + r4 * 4] +movdm0,[v4_tab_ChromaCoeff + r4 * 4] %endif pshufb m0,[tab_Cm] @@ -2984,10 +2984,10 @@ sub r0,r1 %ifdef PIC -lea r5,[tab_ChromaCoeff] +lea r5,[v4_tab_ChromaCoeff] movdm0,[r5 + r4 * 4] %else -movdm0,[tab_ChromaCoeff + r4 * 4] +movdm0,[v4_tab_ChromaCoeff + r4 * 4] %endif pshufb m0,[tab_Cm] @@ -3180,10 +3180,10 @@ punpcklbw m4,m2, m3 %ifdef PIC -lea r6,[tab_ChromaCoeff] +lea r6,[v4_tab_ChromaCoeff] movdm5,[r6 + r4 * 4] %else -movdm5,[tab_ChromaCoeff + r4 * 4] +movdm5,[v4_tab_ChromaCoeff + r4 * 4] %endif pshufb m6,m5, [tab_Vm] @@ -3233,10 +3233,10 @@ add r3d, r3d %ifdef PIC -lea r5, [tab_ChromaCoeff] +lea r5, [v4_tab_ChromaCoeff] movdm0, [r5 + r4 * 4] %else -movdm0, [tab_ChromaCoeff + r4 * 4] +movdm0, [v4_tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] @@ -3280,10 +3280,10 @@ addr3d, r3d %ifdef PIC -lear5, [tab_ChromaCoeff] +lear5, [v4_tab_ChromaCoeff] movd m0, [r5 + r4 * 4] %else -movd m0, [tab_ChromaCoeff + r4 * 4] +movd m0, [v4_tab_ChromaCoeff + r4 * 4] %endif pshufb m0, [tab_Cm] @@ -3355,10 +3355,10 @@ addr3d, r3d %ifdef PIC -lear5
[x265] [PATCH 072 of 307] x86: AVX512 cpy2Dto1D_shl_32 and cpy2Dto1D_shl_16
# HG changeset patch # User Vignesh Vijayakumar # Date 1501663291 -19800 # Wed Aug 02 14:11:31 2017 +0530 # Node ID ce93c1b1894ae7d789e451f65479f018ba90ec76 # Parent aac415b7223acced7fc844c4a07225704b811df0 x86: AVX512 cpy2Dto1D_shl_32 and cpy2Dto1D_shl_16 Size | BitDepth | AVX2 performance | AVX512 performance --- 16x16|8 | 15.09x | 21.16 16x16|10| 16.05x | 17.86 32x32|8 | 13.90x | 25.62 32x32|10| 11.69x | 23.24 diff -r aac415b7223a -r ce93c1b1894a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 01 17:37:05 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 02 14:11:31 2017 +0530 @@ -2309,6 +2309,8 @@ p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); +p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); +p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); } } #else // if HIGH_BIT_DEPTH @@ -3988,6 +3990,8 @@ p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); +p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); +p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); } #endif diff -r aac415b7223a -r ce93c1b1894a source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Aug 01 17:37:05 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Wed Aug 02 14:11:31 2017 +0530 @@ -6140,6 +6140,102 @@ RET ;-- +; cpy2Dto1D_shl avx512 code start +;-- +%macro PROCESS_CPY2Dto1D_SHL_16x8_AVX512 0 +movu m1,[r1] +vinserti32x8 m1,[r1 + r2], 1 +movu m2,[r1 + 2 * r2] +vinserti32x8 m2,[r1 + r3], 1 + +psllwm1, xm0 +psllwm2, xm0 +movu [r0], m1 +movu [r0 + mmsize], m2 + +add r0, 2 * mmsize +lea r1, [r1 + r2 * 4] + +movu m1,[r1] +vinserti32x8 m1,[r1 + r2], 1 +movu m2,[r1 + 2 * r2] +vinserti32x8 m2,[r1 + r3], 1 + +psllwm1, xm0 +psllwm2, xm0 +movu [r0], m1 +movu [r0 + mmsize], m2 +%endmacro + +%macro PROCESS_CPY2Dto1D_SHL_32x8_AVX512 0 +movu m1, [r1] 
+movu m2, [r1 + r2] +movu m3, [r1 + 2 * r2] +movu m4, [r1 + r3] + +psllwm1, xm0 +psllwm2, xm0 +psllwm3, xm0 +psllwm4, xm0 +movu [r0], m1 +movu [r0 + mmsize], m2 +movu [r0 + 2 * mmsize], m3 +movu [r0 + 3 * mmsize], m4 + +add r0, 4 * mmsize +lea r1, [r1 + r2 * 4] + +movu m1, [r1] +movu m2, [r1 + r2] +movu m3, [r1 + 2 * r2] +movu m4, [r1 + r3] + +psllwm1, xm0 +psllwm2, xm0 +psllwm3, xm0 +psllwm4, xm0 +movu [r0], m1 +movu [r0 + mmsize], m2 +movu [r0 + 2 * mmsize], m3 +movu [r0 + 3 * mmsize], m4 +%endmacro + +;-- +; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift); +;-- +INIT_ZMM avx512 +cglobal cpy2Dto1D_shl_32, 4, 4, 5 +add r2d, r2d +movdxm0, r3d +lea r3, [3 * r2] + +PROCESS_CPY2Dto1D_SHL_32x8_AVX512 +add r0, 4 * mmsize +lea r1, [r1 + r2 * 4] +PROCESS_CPY2Dto1D_SHL_32x8_AVX512 +add r0, 4 * mmsize +lea r1, [r1 + r2 * 4] +PROCESS_CPY2Dto1D_SHL_32x8_AVX512 +add r0, 4 * mmsize +lea r1, [r1 + r2 * 4] +PROCESS_CPY2Dto1D_SHL_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal cpy2Dto1D_shl_16, 4, 4, 3 +add r2d, r2d +movdxm0, r3d +lea r3, [3 * r2] + +PROCESS_CPY2Dto1D_SHL_16x8_AVX512 +add r0, 2 * mmsize +lea r1, [r1 + r2 * 4] +PROCESS_CPY2Dto1D_SHL_16x8_AVX512 +RET +;-- +; cpy2Dto1D_shl avx512 code end +;-- +;-- ; void cpy1Dto2D_shr(int16_t* dst, const int16_t* src, intptr_t dstStride, int shift) ;-- INIT_XMM sse2 diff -r aac415b7223a -r ce93c1b1894a source/common/x86/blockcopy8.h --- a/source/common/x86/blockcopy8.hTue Aug 01 17:37:05 2017 +05
[x265] [PATCH 068 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_32xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501837071 -19800 # Fri Aug 04 14:27:51 2017 +0530 # Node ID c5b5b7cb9bbef4365692bfaf05a2a83796d5f1b0 # Parent c3a2abd8e46f8db3ba7c276f39fe41ed002ce295 [x265-avx512]x86: AVX512 pixel_sad_x4_32xN for high bit depth Size| AVX2 performance | AVX512 performance 32x8| 16.73x | 25.16x 32x16 | 18.36x | 29.04x 32x24 | 19.52x | 31.03x 32x32 | 18.78x | 31.95x 32x64 | 19.01x | 34.20x diff -r c3a2abd8e46f -r c5b5b7cb9bbe source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 04 14:27:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 04 14:27:51 2017 +0530 @@ -2325,6 +2325,12 @@ p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); +p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); +p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); +p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); +p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); +p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r c3a2abd8e46f -r c5b5b7cb9bbe source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Fri Aug 04 14:27:51 2017 +0530 +++ b/source/common/x86/sad16-a.asm Fri Aug 04 14:27:51 2017 +0530 @@ -3286,6 +3286,160 @@ ; SAD x3/x4 avx512 code start ; +%macro PROCESS_SAD_X4_32x4_AVX512 0 +movum8, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] +movum7, [r4] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + + +movum8, [r0 + 2 * FENC_STRIDE] +movum4, [r1 + r5] +movum5, [r2 + r5] +movum6, [r3 + r5] +movum7, [r4 + r5] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, 
m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 4 * FENC_STRIDE] +movum4, [r1 + 2 * r5] +movum5, [r2 + 2 * r5] +movum6, [r3 + 2 * r5] +movum7, [r4 + 2 * r5] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 6 * FENC_STRIDE] +movum4, [r1 + r7] +movum5, [r2 + r7] +movum6, [r3 + r7] +movum7, [r4 + r7] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 +%endmacro + + +%macro PROCESS_SAD_X4_END_AVX512 0 +vextracti32x8 ym4, m0, 1 +vextracti32x8 ym5, m1, 1 +vextracti32x8 ym6, m2, 1 +vextracti32x8 ym7, m3, 1 + +paddd ym0, ym4 +paddd ym1, ym5 +paddd ym2, ym6 +paddd ym3, ym7 + +vextracti64x2 xm4, m0, 1 +vextracti64x2 xm5, m1, 1 +vextracti64x2 xm6, m2, 1 +vextracti64x2 xm7, m3, 1 + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +pshufd xm4, xm0, 1110b +pshufd xm5, xm1, 1110b +pshufd xm6, xm2, 1110b +pshufd xm7, xm3, 1110b + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +pshufd xm4, xm0, 0001b +pshufd xm5, xm1, 0001b +pshufd xm6, xm2, 0001b +pshufd xm7, xm3, 0001b + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +mov r0, r6mp +movd [r0 + 0], xm0 +movd [r0 + 4], xm1 +movd [r0 + 8], xm2 +movd [r0 + 12], xm3 +%endmacro + + + %macro PROCESS_SAD_X3_32x4_AVX512 0 movum6, [r0] movum3, [r1] @@ -3641,3 +3795,275 @@ PROCESS_SAD_X3_
[x265] [PATCH 076 of 307] x86: AVX512 interp_4tap_horiz_pp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502347959 -19800 # Thu Aug 10 12:22:39 2017 +0530 # Node ID f489bc0b864c48f557cc40b739e84fe1040e8728 # Parent 7bdf20f62d02f5714c1332695ffa8c7c6a9d8a5a x86: AVX512 interp_4tap_horiz_pp_32xN Color Space i444 Size| AVX2 performance | AVX512 performance 32x8| 23.96x | 31.57x 32x16 | 24.38x | 33.22x 32x24 | 22.41x | 36.92x 32x32 | 21.54x | 34.09x 32x64 | 23.27x | 29.14x Color Space i422 Size| AVX2 performance | AVX512 performance 32x16 | 25.55x | 33.16x 32x32 | 22.08x | 35.13x 32x48 | 24.01x | 34.53x 32x64 | 23.76x | 35.21x diff -r 7bdf20f62d02 -r f489bc0b864c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Apr 04 16:47:58 2018 -0700 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 10 12:22:39 2017 +0530 @@ -4001,6 +4001,18 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); + +p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); + +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); + } #endif } diff -r 7bdf20f62d02 -r f489bc0b864c source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed 
Apr 04 16:47:58 2018 -0700 +++ b/source/common/x86/ipfilter8.asm Thu Aug 10 12:22:39 2017 +0530 @@ -150,6 +150,8 @@ const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 + SECTION .text cextern pb_128 @@ -9867,6 +9869,44 @@ movu [r2], m5 %endmacro +%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0 +; register map +; m0 - interpolate coeff +; m1, m2 - shuffle order table +; m3 - constant word 1 +; m4 - constant word 512 +; m9 - store shuffle order table + +movu ym5, [r0] +vinserti32x8 m5, [r0 + 4], 1 + +pshufb m6, m5, m2 +pshufb m5, m5, m1 +pmaddubsw m5, m0 +pmaddubsw m6, m0 +pmaddwdm5, m3 +pmaddwdm6, m3 + +movu ym7, [r0 + r1] +vinserti32x8 m7, [r0 + r1 + 4], 1 + +pshufb m8, m7, m2 +pshufb m7, m7, m1 +pmaddubsw m7, m0 +pmaddubsw m8, m0 +pmaddwdm7, m3 +pmaddwdm8, m3 + +packssdw m5, m6 +packssdw m7, m8 +pmulhrsw m5, m4 +pmulhrsw m7, m4 +packuswb m5, m7 +vpermd m5, m9, m5 +movu [r2], ym5 +vextracti32x8[r2 + r3], m5,1 +%endmacro + ;- ; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx ;- @@ -9902,6 +9942,40 @@ IPFILTER_CHROMA_PP_64xN_AVX512 48 IPFILTER_CHROMA_PP_64xN_AVX512 16 +%macro IPFILTER_CHROMA_PP_32xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_pp_32x%1, 4,6,10 +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +vpbroadcastd m0, [r5 + r4 * 4] +%else +vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +vbroadcasti32x8 m1,
[x265] [PATCH 078 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_64xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1502171321 -19800 # Tue Aug 08 11:18:41 2017 +0530 # Node ID aa1747a46469afe6fc2d5e6295a4b43a14ea # Parent d0e43a0e3b531f3e4f42be169c224563753b0210 [x265-avx512]x86: AVX512 pixel_sad_x4_64xN for high bit depth Size| AVX2 performance | AVX512 performance 64x16 | 19.41x | 33.30x 64x32 | 19.75x | 33.22x 64x48 | 20.39x | 35.05x 64x64 | 20.25x | 36.72x diff -r d0e43a0e3b53 -r aa1747a46469 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 07 17:04:23 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 11:18:41 2017 +0530 @@ -2312,6 +2312,10 @@ p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); +p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512); +p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); +p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); +p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512); p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); diff -r d0e43a0e3b53 -r aa1747a46469 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Mon Aug 07 17:04:23 2017 +0530 +++ b/source/common/x86/sad16-a.asm Tue Aug 08 11:18:41 2017 +0530 @@ -2136,6 +2136,172 @@ paddd m3, m7 %endmacro +%macro PROCESS_SAD_X4_64x4_AVX512 0 +movum8, [r0] +movum10, [r0 + mmsize] +movum4, [r1] +movum11, [r1 + mmsize] +movum5, [r2] +movum12, [r2 + mmsize] +movum6, [r3] +movum13, [r3 + mmsize] +movum7, [r4] +movum14, [r4 + mmsize] + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +psubw m11, m10 +psubw m12, m10 +psubw m13, m10 +psubw m14, m10 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +paddw m4, m11 +paddw m5, m12 +paddw m6, m13 +paddw m7, m14 + +pmaddwd m4, m9 +paddd m0, 
m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + + +movum8, [r0 + 2 * FENC_STRIDE] +movum10, [r0 + 2 * FENC_STRIDE + mmsize] +movum4, [r1 + r5] +movum11, [r1 + r5 + mmsize] +movum5, [r2 + r5] +movum12, [r2 + r5 + mmsize] +movum6, [r3 + r5] +movum13, [r3 + r5 + mmsize] +movum7, [r4 + r5] +movum14, [r4 + r5 + mmsize] + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +psubw m11, m10 +psubw m12, m10 +psubw m13, m10 +psubw m14, m10 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +paddw m4, m11 +paddw m5, m12 +paddw m6, m13 +paddw m7, m14 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 4 * FENC_STRIDE] +movum10, [r0 + 4 * FENC_STRIDE + mmsize] +movum4, [r1 + 2 * r5] +movum11, [r1 + 2 * r5 + mmsize] +movum5, [r2 + 2 * r5] +movum12, [r2 + 2 * r5 + mmsize] +movum6, [r3 + 2 * r5] +movum13, [r3 + 2 * r5 + mmsize] +movum7, [r4 + 2 * r5] +movum14, [r4 + 2 * r5 + mmsize] + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +psubw m11, m10 +psubw m12, m10 +psubw m13, m10 +psubw m14, m10 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +paddw m4, m11 +paddw m5, m12 +paddw m6, m13 +paddw m7, m14 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 6 * FENC_STRIDE] +movum10, [r0 + 6 * FENC_STRIDE + mmsize] +movum4, [r1 + r7] +movum11, [r1 + r7 + mmsize] +movum5, [r2 + r7] +movum12, [r2 + r7 + mmsize] +movum6, [r3 + r7] +movum13, [r3 + r7 + mmsize] +movum7, [r4 + r7] +movum14, [r4 + r7 + mmsize] + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +psubw m11, m10 +psubw m1
[x265] [PATCH 066 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_32xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501765251 -19800 # Thu Aug 03 18:30:51 2017 +0530 # Node ID 241f318be574498b7bb77939937a907e4721dc32 # Parent df45017fca906d5f3370dcc78e43284622753a73 [x265-avx512]x86: AVX512 pixel_sad_x3_32xN for high bit depth Size| AVX2 performance | AVX512 performance 32x8| 20.72x | 29.20x 32x16 | 19.31x | 30.53x 32x24 | 19.78x | 33.32x 32x32 | 20.02x | 32.71x 32x64 | 20.40x | 33.30x diff -r df45017fca90 -r 241f318be574 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 03 18:30:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 03 18:30:51 2017 +0530 @@ -2313,6 +2313,12 @@ p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); +p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512); +p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512); +p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); +p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); +p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r df45017fca90 -r 241f318be574 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530 +++ b/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530 @@ -2856,3 +2856,362 @@ PROCESS_SAD_X3_END_AVX512 RET +; +; SAD x3/x4 avx512 code start +; + +%macro PROCESS_SAD_X3_32x4_AVX512 0 +movum6, [r0] +movum3, [r1] +movum4, [r2] +movum5, [r3] + + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 2 * FENC_STRIDE] +movum3, [r1 + r4] +movum4, [r2 + r4] +movum5, [r3 + r4] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 4 * FENC_STRIDE] 
+movum3, [r1 + 2 * r4] +movum4, [r2 + 2 * r4] +movum5, [r3 + 2 * r4] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 6 * FENC_STRIDE] +movum3, [r1 + r6] +movum4, [r2 + r6] +movum5, [r3 + r6] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 +%endmacro + + +%macro PROCESS_SAD_X3_END_AVX512 0 +vextracti32x8 ym3, m0, 1 +vextracti32x8 ym4, m1, 1 +vextracti32x8 ym5, m2, 1 + +paddd ym0, ym3 +paddd ym1, ym4 +paddd ym2, ym5 + +vextracti64x2 xm3, m0, 1 +vextracti64x2 xm4, m1, 1 +vextracti64x2 xm5, m2, 1 + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +pshufd xm3, xm0, 1110b +pshufd xm4, xm1, 1110b +pshufd xm5, xm2, 1110b + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +pshufd xm3, xm0, 0001b +pshufd xm4, xm1, 0001b +pshufd xm5, xm2, 0001b + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +movd [r5 + 0], xm0 +movd [r5 + 4], xm1 +movd [r5 + 8], xm2 +%endmacro + + +;-- +; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res ) +;-- + +INIT_ZMM avx512 +cglobal pixel_sad_x3_32x8, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 + +vbroadcasti32x8 m7, [pw_1] + +add r4d, r4d +lea r6d, [r4 * 3] + +PROCESS_SAD_X3_32x4_AVX512 +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] +
[x265] [PATCH 067 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_32xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1501837071 -19800 # Fri Aug 04 14:27:51 2017 +0530 # Node ID c3a2abd8e46f8db3ba7c276f39fe41ed002ce295 # Parent 241f318be574498b7bb77939937a907e4721dc32 [x265-avx512]x86: AVX512 pixel_sad_x4_32xN for high bit depth Size| AVX2 performance | AVX512 performance 32x8| 16.73x | 25.16x 32x16 | 18.36x | 29.04x 32x24 | 19.52x | 31.03x 32x32 | 18.78x | 31.95x 32x64 | 19.01x | 34.20x diff -r 241f318be574 -r c3a2abd8e46f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 03 18:30:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 04 14:27:51 2017 +0530 @@ -2319,6 +2319,12 @@ p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); +p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); +p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); +p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); +p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); +p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r 241f318be574 -r c3a2abd8e46f source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530 +++ b/source/common/x86/sad16-a.asm Fri Aug 04 14:27:51 2017 +0530 @@ -2501,6 +2501,160 @@ ; SAD x3/x4 avx512 code start ; +%macro PROCESS_SAD_X4_32x4_AVX512 0 +movum8, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] +movum7, [r4] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + + +movum8, [r0 + 2 * FENC_STRIDE] +movum4, [r1 + r5] +movum5, [r2 + r5] +movum6, [r3 + r5] +movum7, [r4 + r5] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, 
m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 4 * FENC_STRIDE] +movum4, [r1 + 2 * r5] +movum5, [r2 + 2 * r5] +movum6, [r3 + 2 * r5] +movum7, [r4 + 2 * r5] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 6 * FENC_STRIDE] +movum4, [r1 + r7] +movum5, [r2 + r7] +movum6, [r3 + r7] +movum7, [r4 + r7] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 +%endmacro + + +%macro PROCESS_SAD_X4_END_AVX512 0 +vextracti32x8 ym4, m0, 1 +vextracti32x8 ym5, m1, 1 +vextracti32x8 ym6, m2, 1 +vextracti32x8 ym7, m3, 1 + +paddd ym0, ym4 +paddd ym1, ym5 +paddd ym2, ym6 +paddd ym3, ym7 + +vextracti64x2 xm4, m0, 1 +vextracti64x2 xm5, m1, 1 +vextracti64x2 xm6, m2, 1 +vextracti64x2 xm7, m3, 1 + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +pshufd xm4, xm0, 1110b +pshufd xm5, xm1, 1110b +pshufd xm6, xm2, 1110b +pshufd xm7, xm3, 1110b + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +pshufd xm4, xm0, 0001b +pshufd xm5, xm1, 0001b +pshufd xm6, xm2, 0001b +pshufd xm7, xm3, 0001b + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +mov r0, r6mp +movd [r0 + 0], xm0 +movd [r0 + 4], xm1 +movd [r0 + 8], xm2 +movd [r0 + 12], xm3 +%endmacro + + + %macro PROCESS_SAD_X3_32x4_AVX512 0 movum6, [r0] movum3, [r1] @@ -2856,6 +3010,278 @@ PROCESS_SAD_X3_
[x265] [PATCH 077 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_64xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1502105663 -19800 # Mon Aug 07 17:04:23 2017 +0530 # Node ID d0e43a0e3b531f3e4f42be169c224563753b0210 # Parent f489bc0b864c48f557cc40b739e84fe1040e8728 [x265-avx512]x86: AVX512 pixel_sad_x3_64xN for high bit depth Size| AVX2 performance | AVX512 performance 64x16 | 19.69x | 36.23x 64x32 | 20.33x | 37.94x 64x48 | 20.64x | 38.48x 64x64 | 20.51x | 38.49x diff -r f489bc0b864c -r d0e43a0e3b53 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 10 12:22:39 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 07 17:04:23 2017 +0530 @@ -2302,6 +2302,10 @@ p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); +p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512); +p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); +p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); +p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512); p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); diff -r f489bc0b864c -r d0e43a0e3b53 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Aug 10 12:22:39 2017 +0530 +++ b/source/common/x86/sad16-a.asm Mon Aug 07 17:04:23 2017 +0530 @@ -2266,6 +2266,135 @@ paddd m2, m5 %endmacro +%macro PROCESS_SAD_X3_64x4_AVX512 0 +movum6, [r0] +movum8, [r0 + mmsize] +movum3, [r1] +movum9, [r1 + mmsize] +movum4, [r2] +movum10, [r2 + mmsize] +movum5, [r3] +movum11, [r3 + mmsize] + +psubw m3, m6 +psubw m9, m8 +psubw m4, m6 +psubw m10, m8 +psubw m5, m6 +psubw m11, m8 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +paddw m3, m9 +paddw m4, m10 +paddw m5, m11 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 2 * FENC_STRIDE] +movum8, [r0 + 2 * FENC_STRIDE + 
mmsize] +movum3, [r1 + r4] +movum9, [r1 + r4 + mmsize] +movum4, [r2 + r4] +movum10, [r2 + r4 + mmsize] +movum5, [r3 + r4] +movum11, [r3 + r4 + mmsize] + +psubw m3, m6 +psubw m9, m8 +psubw m4, m6 +psubw m10, m8 +psubw m5, m6 +psubw m11, m8 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +paddw m3, m9 +paddw m4, m10 +paddw m5, m11 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 4 * FENC_STRIDE] +movum8, [r0 + 4 * FENC_STRIDE + mmsize] +movum3, [r1 + 2 * r4] +movum9, [r1 + 2 * r4 + mmsize] +movum4, [r2 + 2 * r4] +movum10, [r2 + 2 * r4 + mmsize] +movum5, [r3 + 2 * r4] +movum11, [r3 + 2 * r4 + mmsize] + +psubw m3, m6 +psubw m9, m8 +psubw m4, m6 +psubw m10, m8 +psubw m5, m6 +psubw m11, m8 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +paddw m3, m9 +paddw m4, m10 +paddw m5, m11 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 6 * FENC_STRIDE] +movum8, [r0 + 6 * FENC_STRIDE + mmsize] +movum3, [r1 + r6] +movum9, [r1 + r6 + mmsize] +movum4, [r2 + r6] +movum10, [r2 + r6 + mmsize] +movum5, [r3 + r6] +movum11, [r3 + r6 + mmsize] + +psubw m3, m6 +psubw m9, m8 +psubw m4, m6 +psubw m10, m8 +psubw m5, m6 +psubw m11, m8 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +paddw m3, m9 +paddw m4, m10 +paddw m5, m11 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 +%endmacro %macro PROCESS_SAD_X3_END_AVX512 0 vextracti32x8 ym3, m0, 1 @@ -2300,9 +2429,16 @@ paddd xm1, xm4 paddd xm2, xm5 -movd [r5 + 0], xm0 -movd [r5 + 4], xm1 -movd [r5 + 8], xm2 +%if UNIX64 +movd [r5 + 0], xm0 +movd [r5 + 4], xm1 +movd [r5 + 8], xm2 +%else +movr
[x265] [PATCH 079 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_48x64 for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1502187312 -19800 # Tue Aug 08 15:45:12 2017 +0530 # Node ID 95c8818a26eea8a17a6a9471f861b89ab9e210c6 # Parent aa1747a46469afe6fc2d5e6295a4b43a14ea [x265-avx512]x86: AVX512 pixel_sad_x3_48x64 for high bit depth AVX2 performance: 20.10x AVX512 performance: 36.00x diff -r aa1747a46469 -r 95c8818a26ee source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 08 11:18:41 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 15:45:12 2017 +0530 @@ -2302,6 +2302,7 @@ p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); +p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512); p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512); p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); diff -r aa1747a46469 -r 95c8818a26ee source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Tue Aug 08 11:18:41 2017 +0530 +++ b/source/common/x86/sad16-a.asm Tue Aug 08 15:45:12 2017 +0530 @@ -2844,6 +2844,133 @@ PROCESS_SAD_X3_END_AVX512 RET +; +; int pixel_sad_x3_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res ) +; +INIT_ZMM avx512 +cglobal pixel_sad_x3_48x64, 4, 8, 17 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +mov r7d, 64/4 +vbroadcasti32x8 m16, [pw_1] + +add r4d, r4d +lea r6d, [r4 * 3] +.loop: +movum4, [r0] +movum5, [r0 + 2 * FENC_STRIDE] +movu ym6, [r0 + mmsize] +vinserti32x8m6, [r0 + 2 * FENC_STRIDE + mmsize], 1 +movum7, [r1] +movum8, [r1 + r4] +movu ym9, [r1 + mmsize] +vinserti32x8m9, [r1 + r4 + mmsize], 1 +movum10, [r2] +movum11, [r2 + r4] +movu ym12, [r2 + mmsize] +vinserti32x8m12, [r2 + r4 + mmsize], 1 +movum13, [r3] +movum14, [r3 + r4] +movu ym15, [r3 + mmsize] +vinserti32x8m15, [r3 + r4 + mmsize], 1 + +psubw m7, m4 +psubw 
m8, m5 +psubw m9, m6 +psubw m10, m4 +psubw m11, m5 +psubw m12, m6 +psubw m13, m4 +psubw m14, m5 +psubw m15, m6 + +pabsw m7, m7 +pabsw m8, m8 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +pabsw m15, m15 + +paddw m7, m8 +paddw m7, m9 +paddw m10, m11 +paddw m10, m12 +paddw m13, m14 +paddw m13, m15 + +pmaddwd m7, m16 +paddd m0, m7 +pmaddwd m10, m16 +paddd m1, m10 +pmaddwd m13, m16 +paddd m2, m13 + +movum4, [r0 + 4 * FENC_STRIDE] +movum5, [r0 + 6 * FENC_STRIDE] +movu ym6, [r0 + 4 * FENC_STRIDE + mmsize] +vinserti32x8m6, [r0 + 6 * FENC_STRIDE + mmsize], 1 +movum7, [r1 + 2 * r4] +movum8, [r1 + r6] +movu ym9, [r1 + 2 * r4 + mmsize] +vinserti32x8m9, [r1 + r6 + mmsize], 1 +movum10, [r2 + 2 * r4] +movum11, [r2 + r6] +movu ym12, [r2 + 2 * r4 + mmsize] +vinserti32x8m12, [r2 + r6 + mmsize], 1 +movum13, [r3 + 2 * r4] +movum14, [r3 + r6] +movu ym15, [r3 + 2 * r4 + mmsize] +vinserti32x8m15, [r3 + r6 + mmsize], 1 + +psubw m7, m4 +psubw m8, m5 +psubw m9, m6 +psubw m10, m4 +psubw m11, m5 +psubw m12, m6 +psubw m13, m4 +psubw m14, m5 +psubw m15, m6 + +pabsw m7, m7 +pabsw m8, m8 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +pabsw m15, m15 + +paddw m7, m8 +paddw m7, m9 +paddw m10, m11 +paddw m10, m12 +paddw m13, m14 +paddw m13, m15 + +pmaddwd m7, m16 +paddd m0, m7 +pmaddwd m10, m16 +paddd m1, m10 +pmaddwd m13, m16 +paddd m2, m13 + +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +dec r7d +jg .loop + +PROCESS_SAD_X3_END_AVX512 +RET + ;
[x265] [PATCH 081 of 307] x86: AVX512 cleanup blockcopy_sp_64x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1502709712 -19800 # Mon Aug 14 16:51:52 2017 +0530 # Node ID 5c18b655a88a739b87c6b071d186a2b9286b8266 # Parent 4a643ecb8c3bcc4dab96bfe56217d4449564bae0 x86: AVX512 cleanup blockcopy_sp_64x64 diff -r 4a643ecb8c3b -r 5c18b655a88a source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Aug 08 17:01:50 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Mon Aug 14 16:51:52 2017 +0530 @@ -26,7 +26,10 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 + +ALIGN 64 +const shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7 cextern pb_4 cextern pb_1 @@ -2162,7 +2165,7 @@ BLOCKCOPY_SP_W64_H4_avx2 64, 64 -%macro PROCESS_BLOCKCOPY_SP_64x8_AVX512 0 +%macro PROCESS_BLOCKCOPY_SP_64x4_AVX512 0 movu m0, [r2] movu m1, [r2 + 64] movu m2, [r2 + r3] @@ -2170,10 +2173,8 @@ packuswb m0, m1 packuswb m2, m3 -vpermq m0, m0, 11011000b -vpermq m2, m2, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m2, m2, 11011000b +vpermq m0, m4, m0 +vpermq m2, m4, m2 movu [r0], m0 movu [r0 + r1], m2 @@ -2184,73 +2185,25 @@ packuswb m0, m1 packuswb m2, m3 -vpermq m0, m0, 11011000b -vpermq m2, m2, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m2, m2, 11011000b -movu [r0 + 2 * r1], m0 -movu [r0 + r5], m2 - -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] - -movu m0, [r2] -movu m1, [r2 + 64] -movu m2, [r2 + r3] -movu m3, [r2 + r3 + 64] - -packuswb m0, m1 -packuswb m2, m3 -vpermq m0, m0, 11011000b -vpermq m2, m2, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m2, m2, 11011000b -movu [r0], m0 -movu [r0 + r1], m2 - -movu m0, [r2 + 2 * r3] -movu m1, [r2 + 2 * r3 + 64] -movu m2, [r2 + r4] -movu m3, [r2 + r4 + 64] - -packuswb m0, m1 -packuswb m2, m3 -vpermq m0, m0, 11011000b -vpermq m2, m2, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m2, m2, 11011000b +vpermq m0, m4, m0 +vpermq m2, m4, m2 movu [r0 + 2 * r1], m0 movu [r0 + r5], m2 %endmacro INIT_ZMM avx512 -cglobal blockcopy_sp_64x64, 4, 6, 4 +cglobal 
blockcopy_sp_64x64, 4, 6, 5 +mova m4, [shuf1_avx512] addr3, r3 lear4, [3 * r3] lear5, [3 * r1] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 +%rep 15 +PROCESS_BLOCKCOPY_SP_64x4_AVX512 lear0, [r0 + 4 * r1] lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 +%endrep +PROCESS_BLOCKCOPY_SP_64x4_AVX512 RET ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 084 of 307] x86: AVX512 interp_4tap_horiz_ps_64xN
# HG changeset patch # User Jayashri Murugan # Date 1502430475 25200 # Thu Aug 10 22:47:55 2017 -0700 # Node ID 951e9a16296e5d1e528c0083630fde8122bd15c1 # Parent 3d8c45642752803c560891fdfbe0a8b5c03ca76a x86: AVX512 interp_4tap_horiz_ps_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 26.50x | 35.13x 64x32 | 25.48x | 38.62x 64x48 | 27.52x | 40.34x 64x64 | 27.85x | 40.43x diff -r 3d8c45642752 -r 951e9a16296e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 11 14:36:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 10 22:47:55 2017 -0700 @@ -4029,6 +4029,11 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); p.weight_pp = PFX(weight_pp_avx512); +//i444 chroma_hps +p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512); } #endif } diff -r 3d8c45642752 -r 951e9a16296e source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Aug 11 14:36:18 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Aug 10 22:47:55 2017 -0700 @@ -26,7 +26,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 const tab_Tm,db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14 @@ -152,6 +152,9 @@ const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 +ALIGN 64 +const interp8_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7 + SECTION .text cextern pb_128 @@ -9836,7 +9839,7 @@ FILTER_VER_LUMA_S_AVX2_32x24 sp FILTER_VER_LUMA_S_AVX2_32x24 ss ;- -;ipfilter_chroma_pp_avx512 code start 
+;ipfilter_chroma_avx512 code start ;- %macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0 ; register map @@ -9976,6 +9979,86 @@ IPFILTER_CHROMA_PP_32xN_AVX512 32 IPFILTER_CHROMA_PP_32xN_AVX512 64 IPFILTER_CHROMA_PP_32xN_AVX512 48 + +%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0 +movu ym6, [r0] +vinserti32x8 m6, [r0 + 4], 1 +pshufb m7, m6, m2 +pshufb m6, m1 +pmaddubsw m6, m0 +pmaddubsw m7, m0 +pmaddwdm6, m3 +pmaddwdm7, m3 + +movu ym8, [r0 + 32] +vinserti32x8 m8, [r0 + 36], 1 +pshufb m9, m8, m2 +pshufb m8, m1 +pmaddubsw m8, m0 +pmaddubsw m9, m0 +pmaddwdm8, m3 +pmaddwdm9, m3 + +packssdw m6, m7 +packssdw m8, m9 +psubw m6, m4 +psubw m8, m4 +vpermq m6, m10, m6 +vpermq m8, m10, m8 +movu [r2], m6 +movu [r2 + mmsize],m8 +%endmacro + ;- -;ipfilter_chroma_pp_avx512 code end +; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;- +%macro IPFILTER_CHROMA_PS_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_ps_64x%1, 4,7,11 +mov r4d, r4m +mov r5d, r5m + +%ifdef PIC +lea r6, [tab_ChromaCoeff] +vpbroadcastd m0, [r6 + r4 * 4] +%else +vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +vbroadcasti32x8m1, [interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8m2, [interp4_horiz_shuf_load2_avx512] +vbroadcasti32x8m3, [pw_1] +vbroadcasti32x8m4, [pw_2000] +mova m10, [interp8_hps_shuf_avx512] + +; register map +; m0- interpolate coeff +; m1,m2 - load shuffle order table +;
[x265] [PATCH 082 of 307] x86: AVX512 blockcopy_sp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502711388 -19800 # Mon Aug 14 17:19:48 2017 +0530 # Node ID b30539ebe5c9b2d9412d3a39458a90a7574ac744 # Parent 5c18b655a88a739b87c6b071d186a2b9286b8266 x86: AVX512 blockcopy_sp_32xN Size | AVX2 performance | AVX512 performance -- 32x32 | 6.77x | 11.27x i420 32x32 | 6.73x | 11.43x i422 32x64 | 6.68x | 12.19x diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 14 16:51:52 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 14 17:19:48 2017 +0530 @@ -3948,6 +3948,10 @@ p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512); p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512); +p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_avx512); + p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512); diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Mon Aug 14 16:51:52 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Mon Aug 14 17:19:48 2017 +0530 @@ -2191,6 +2191,25 @@ movu [r0 + r5], m2 %endmacro +%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0 +movu m0, [r2] +movu m1, [r2 + r3] +movu m2, [r2 + 2 * r3] +movu m3, [r2 + r4] + +packuswb m0, m1 +packuswb m2, m3 +vpermq m0, m4, m0 +vpermq m2, m4, m2 +movu [r0], ym0 +vextracti32x8 [r0 + r1], m0, 1 +movu [r0 + 2 * r1], ym2 +vextracti32x8 [r0 + r5], m2, 1 +%endmacro + +;- +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) +;- INIT_ZMM avx512 cglobal blockcopy_sp_64x64, 4, 6, 5 mova m4, [shuf1_avx512] @@ -2206,6 +2225,26 @@ 
PROCESS_BLOCKCOPY_SP_64x4_AVX512 RET +%macro BLOCKCOPY_SP_32xN_AVX512 1 +INIT_ZMM avx512 +cglobal blockcopy_sp_32x%1, 4, 6, 5 +mova m4, [shuf1_avx512] +addr3, r3 +lear4, [3 * r3] +lear5, [3 * r1] + +%rep %1/4 - 1 +PROCESS_BLOCKCOPY_SP_32x4_AVX512 +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] +%endrep +PROCESS_BLOCKCOPY_SP_32x4_AVX512 +RET +%endmacro + +BLOCKCOPY_SP_32xN_AVX512 32 +BLOCKCOPY_SP_32xN_AVX512 64 + ;- ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val) ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 074 of 307] x86: AVX512 interp_4tap_horiz_pp_64xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1501222403 -19800 # Fri Jul 28 11:43:23 2017 +0530 # Node ID 563b3c4f91eb20374311ed18fb18ad12aeebaf26 # Parent 7d7f2a4e771c7c2b573db9bc298d1a35bb72f32d x86: AVX512 interp_4tap_horiz_pp_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 21.45x | 39.29x 64x32 | 22.27x | 39.37x 64x48 | 22.76x | 40.75x 64x64 | 22.76x | 40.90x diff -r 7d7f2a4e771c -r 563b3c4f91eb source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 08 15:25:11 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 28 11:43:23 2017 +0530 @@ -3996,6 +3996,11 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512); + } #endif } diff -r 7d7f2a4e771c -r 563b3c4f91eb source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Tue Aug 08 15:25:11 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Jul 28 11:43:23 2017 +0530 @@ -137,6 +137,10 @@ const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 +const interp4_horiz_shuf_load1_avx512, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + +const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + SECTION .text cextern pb_128 @@ -9820,3 +9824,75 @@ FILTER_VER_LUMA_S_AVX2_32x24 sp FILTER_VER_LUMA_S_AVX2_32x24 ss +;- +;ipfilter_chroma_pp_avx512 code start +;- +%macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0 +; register map +; m0 - interpolate coeff +; m1, m2 - shuffle order table +; m3 - constant word 1 +; m4 - constant word 512 + +movu m5, [r0] +pshufb 
m6, m5, m2 +pshufb m5, m5, m1 +pmaddubsw m5, m0 +pmaddubsw m6, m0 +pmaddwdm5, m3 +pmaddwdm6, m3 + +movu m7, [r0 + 4] +pshufb m8, m7, m2 +pshufb m7, m7, m1 +pmaddubsw m7, m0 +pmaddubsw m8, m0 +pmaddwdm7, m3 +pmaddwdm8, m3 + +packssdw m5, m7 +packssdw m6, m8 +pmulhrsw m5, m4 +pmulhrsw m6, m4 +packuswb m5, m6 +movu [r2], m5 +%endmacro + +;- +; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;- +%macro IPFILTER_CHROMA_PP_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_pp_64x%1, 4,6,9 +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +vpbroadcastd m0, [r5 + r4 * 4] +%else +vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512] +vbroadcasti32x8 m3, [pw_1] +vbroadcasti32x8 m4, [pw_512] +dec r0 + +%rep %1 - 1 +PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 +lea r2, [r2 + r3] +lea r0, [r0 + r1] +%endrep +PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 +RET +%endmacro + +IPFILTER_CHROMA_PP_64xN_AVX512 64 +IPFILTER_CHROMA_PP_64xN_AVX512 32 +IPFILTER_CHROMA_PP_64xN_AVX512 48 +IPFILTER_CHROMA_PP_64xN_AVX512 16 + +;- +;ipfilter_chroma_pp_avx512 code end +;- ___ x265-devel mailing list x265-devel@
[x265] [PATCH 080 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_48x64 for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1502191910 -19800 # Tue Aug 08 17:01:50 2017 +0530 # Node ID 4a643ecb8c3bcc4dab96bfe56217d4449564bae0 # Parent 95c8818a26eea8a17a6a9471f861b89ab9e210c6 [x265-avx512]x86: AVX512 pixel_sad_x4_48x64 for high bit depth AVX2 performance: 19.96x AVX512 performance: 34.24x diff -r 95c8818a26ee -r 4a643ecb8c3b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 08 15:45:12 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 17:01:50 2017 +0530 @@ -2313,6 +2313,7 @@ p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); +p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512); p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512); p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); diff -r 95c8818a26ee -r 4a643ecb8c3b source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Tue Aug 08 15:45:12 2017 +0530 +++ b/source/common/x86/sad16-a.asm Tue Aug 08 17:01:50 2017 +0530 @@ -3487,6 +3487,165 @@ RET ; +; void pixel_sad_x4_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res ) +; +INIT_ZMM avx512 +cglobal pixel_sad_x4_48x64, 4, 9, 20 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +pxorm3, m3 +mov r8d, 64/4 + +vbroadcasti32x8 m19, [pw_1] + +add r5d, r5d +lea r7d, [r5 * 3] +.loop: +movum4, [r0] +movum5, [r0 + 2 * FENC_STRIDE] +movu ym6, [r0 + mmsize] +vinserti32x8m6, [r0 + 2 * FENC_STRIDE + mmsize], 1 +movum7, [r1] +movum8, [r1 + r5] +movu ym9, [r1 + mmsize] +vinserti32x8m9, [r1 + r5 + mmsize], 1 +movum10, [r2] +movum11, [r2 + r5] +movu ym12, [r2 + mmsize] +vinserti32x8m12, [r2 + r5 + mmsize], 1 +movum13, [r3] +movum14, [r3 + r5] +movu ym15, [r3 + mmsize] +vinserti32x8m15, [r3 + r5 + mmsize], 1 +movum16, 
[r4] +movum17, [r4 + r5] +movu ym18, [r4 + mmsize] +vinserti32x8m18, [r4 + r5 + mmsize], 1 + +psubw m7, m4 +psubw m8, m5 +psubw m9, m6 +psubw m10, m4 +psubw m11, m5 +psubw m12, m6 +psubw m13, m4 +psubw m14, m5 +psubw m15, m6 +psubw m16, m4 +psubw m17, m5 +psubw m18, m6 + +pabsw m7, m7 +pabsw m8, m8 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +pabsw m15, m15 +pabsw m16, m16 +pabsw m17, m17 +pabsw m18, m18 + +paddw m7, m8 +paddw m7, m9 +paddw m10, m11 +paddw m10, m12 +paddw m13, m14 +paddw m13, m15 +paddw m16, m17 +paddw m16, m18 + +pmaddwd m7, m19 +paddd m0, m7 +pmaddwd m10, m19 +paddd m1, m10 +pmaddwd m13, m19 +paddd m2, m13 +pmaddwd m16, m19 +paddd m3, m16 + +movum4, [r0 + 4 * FENC_STRIDE] +movum5, [r0 + 6 * FENC_STRIDE] +movu ym6, [r0 + 4 * FENC_STRIDE + mmsize] +vinserti32x8m6, [r0 + 6 * FENC_STRIDE + mmsize], 1 +movum7, [r1 + 2 * r5] +movum8, [r1 + r7] +movu ym9, [r1 + 2 * r5 + mmsize] +vinserti32x8m9, [r1 + r7 + mmsize], 1 +movum10, [r2 + 2 * r5] +movum11, [r2 + r7] +movu ym12, [r2 + 2 * r5 + mmsize] +vinserti32x8m12, [r2 + r7 + mmsize], 1 +movum13, [r3 + 2 * r5] +movum14, [r3 + r7] +movu ym15, [r3 + 2 * r5 + mmsize] +vinserti32x8m15, [r3 + r7 + mmsize], 1 +movum16, [r4 + 2 * r5] +movum17, [r4 + r7] +movu ym18, [r4 + 2 * r5 + mmsize] +vinserti32x8m18, [r4 + r7 + mmsize], 1 + + +psubw m7, m4 +psubw m8, m5 +psubw m9, m6 +psubw m10, m4 +psubw m11, m5 +psubw m12, m6 +psubw m13, m4 +psubw m14, m5 +psubw m15, m6 +psubw m16, m4 +psubw m17, m5 +psubw m18, m6 + +pabsw m7, m7 +pabsw m8, m8 +pabsw m9, m9 +pabsw m
[x265] [PATCH 083 of 307] [x265-avx512]x86: AVX512 weight_pp
# HG changeset patch # User Gopi Satykrishna Akisetty # Date 1502442378 -19800 # Fri Aug 11 14:36:18 2017 +0530 # Node ID 3d8c45642752803c560891fdfbe0a8b5c03ca76a # Parent b30539ebe5c9b2d9412d3a39458a90a7574ac744 [x265-avx512]x86: AVX512 weight_pp BitDepth | AVX2 performance | AVX512 performance 8 | 6.23x| 10.60x 10 | 9.43x| 14.59x diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 14 17:19:48 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 11 14:36:18 2017 +0530 @@ -2322,6 +2322,7 @@ p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); +p.weight_pp = PFX(weight_pp_avx512); } } @@ -4026,6 +4027,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); +p.weight_pp = PFX(weight_pp_avx512); } #endif diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Mon Aug 14 17:19:48 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Fri Aug 11 14:36:18 2017 +0530 @@ -1662,6 +1662,116 @@ jnz .loopH RET %endif + +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +cglobal weight_pp, 6, 7, 7 +%define correction (14 - BIT_DEPTH) +mov r6d, r6m +shl r6d, 16 - correction +or r6d, r5d + +movd xm0, r6d +vpbroadcastd m0, xm0 +mov r5d, r7m +sub r5d, correction +movd xm1, r5d + +vpbroadcastdm2, r8m +vbroadcasti32x8 m5, [pw_1] +vbroadcasti32x8 m6, [pw_pixel_max] + +add r2d, r2d +add r3d, r3d +sub r2d, r3d +shr r3d, 6 + +.loopH: +mov r5d, r3d + +.loopW: +movum4, [r0] +punpcklwd m3, m4, m5 +pmaddwd m3, m0 +psrad m3, xm1 +paddd m3, m2 + +punpckhwd m4, m5 +pmaddwd m4, m0 +psrad m4, xm1 +paddd 
m4, m2 + +packusdwm3, m4 +pminuw m3, m6 +movu[r1], m3 + +add r0, 64 +add r1, 64 + +dec r5d +jnz .loopW + +lea r0, [r0 + r2] +lea r1, [r1 + r2] + +dec r4d +jnz .loopH +%undef correction +RET +%else +INIT_ZMM avx512 +cglobal weight_pp, 6, 7, 6 + +shl r5d, 6 +mov r6d, r6m +shl r6d, 16 +or r6d, r5d + +movd xm0, r6d +vpbroadcastd m0, xm0 +movd xm1, r7m +vpbroadcastd m2, r8m + +vbroadcasti32x8 m5, [pw_1] + +sub r2d, r3d +shr r3d, 5 + +.loopH: +mov r5d, r3d + +.loopW: +pmovzxbwm4, [r0] +punpcklwd m3, m4, m5 +pmaddwd m3, m0 +psrad m3, xm1 +paddd m3, m2 + +punpckhwd m4, m5 +pmaddwd m4, m0 +psrad m4, xm1 +paddd m4, m2 + +packssdw m3, m4 +vextracti64x4 ym4, m3, 1 +packuswb ym3, ym4 +vpermqym3, ym3, q3120 +movu [r1], ym3 + +add r0, 32 +add r1, 32 + +dec r5d +jnz .loopW + +lea r0, [r0 + r2] +lea r1, [r1 + r2] + +dec r4d +jnz .loopH +RET +%endif + ;- ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) ;- diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/reference.cpp --- a/source/encoder/reference.cpp Mon Aug 14 17:19:48 2017 +0530 +++ b/source/encoder/reference.cpp Fri Aug 11 14:36:18 2017 +0530 @@ -155,12 +155,10 @@ const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride; pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride; - // Computing weighted CU rows int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth -int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths +int padwidth = (width + 31) & ~31; // weightp assembly needs even 32 byte wid
[x265] [PATCH 086 of 307] x86: AVX512 cleanup add_ps code
# HG changeset patch # User Vignesh Vijayakumar # Date 1502773372 -19800 # Tue Aug 15 10:32:52 2017 +0530 # Node ID 2db192bac0f14d55f7f82b8964d6c67c3a3637c3 # Parent 6f811dfd5690866f4c432911982a30665dc0e91c x86: AVX512 cleanup add_ps code diff -r 6f811dfd5690 -r 2db192bac0f1 source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Fri Aug 11 12:32:50 2017 +0530 +++ b/source/common/x86/pixeladd8.asm Tue Aug 15 10:32:52 2017 +0530 @@ -24,11 +24,11 @@ %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA 64 -SECTION_RODATA 32 - +ALIGN 64 +const store_shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7 SECTION .text - cextern pw_pixel_max ;- @@ -1148,157 +1148,46 @@ ;- ; pixel_add_ps avx512 code start ;- -%macro PROCESS_ADD_PS_64x8_AVX512 0 +%macro PROCESS_ADD_PS_64x4_AVX512 0 pmovzxbwm0, [r2] pmovzxbwm1, [r2 + 32] movum2, [r3] movum3, [r3 + 64] -pmovzxbwm4, [r2 + r4] -pmovzxbwm5, [r2 + r4 + 32] -movum6, [r3 + r5] -movum7, [r3 + r5 + 64] - paddw m0, m2 paddw m1, m3 -paddw m4, m6 -paddw m5, m7 packuswbm0, m1 -packuswbm4, m5 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m4, m4, 11011000b +vpermq m0, m4, m0 movu[r0], m0 -movu[r0 + r1], m4 - -lea r2, [r2 + r4 * 2] -lea r3, [r3 + r5 * 2] -lea r0, [r0 + r1 * 2] - -pmovzxbwm0, [r2] -pmovzxbwm1, [r2 + 32] -movum2, [r3] -movum3, [r3 + 64] -pmovzxbwm4, [r2 + r4] -pmovzxbwm5, [r2 + r4 + 32] -movum6, [r3 + r5] -movum7, [r3 + r5 + 64] - +pmovzxbwm0, [r2 + r4] +pmovzxbwm1, [r2 + r4 + 32] +movum2, [r3 + r5] +movum3, [r3 + r5 + 64] paddw m0, m2 paddw m1, m3 -paddw m4, m6 -paddw m5, m7 packuswbm0, m1 -packuswbm4, m5 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m4, m4, 11011000b -movu[r0], m0 -movu[r0 + r1], m4 - -lea r2, [r2 + r4 * 2] -lea r3, [r3 + r5 * 2] -lea r0, [r0 + r1 * 2] - -pmovzxbwm0, [r2] -pmovzxbwm1, [r2 + 32] -movum2, [r3] -movum3, [r3 + 64] -pmovzxbwm4, [r2 + r4] -pmovzxbwm5, [r2 + r4 + 32] -movum6, [r3 + r5] -movum7, 
[r3 + r5 + 64] - +vpermq m0, m4, m0 +movu[r0 + r1], m0 +pmovzxbwm0, [r2 + 2 * r4] +pmovzxbwm1, [r2 + 2 * r4 + 32] +movum2, [r3 + 2 * r5] +movum3, [r3 + 2 * r5 + 64] paddw m0, m2 paddw m1, m3 -paddw m4, m6 -paddw m5, m7 packuswbm0, m1 -packuswbm4, m5 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m4, m4, 11011000b -movu[r0], m0 -movu[r0 + r1], m4 +vpermq m0, m4, m0 +movu[r0 + 2 * r1], m0 -lea r2, [r2 + r4 * 2] -lea r3, [r3 + r5 * 2] -lea r0, [r0 + r1 * 2] - -pmovzxbwm0, [r2] -pmovzxbwm1, [r2 + 32] -movum2, [r3] -movum3, [r3 + 64] -pmovzxbwm4, [r2 + r4] -pmovzxbwm5, [r2 + r4 + 32] -movum6, [r3 + r5] -movum7, [r3 + r5 + 64] - +pmovzxbwm0, [r2 + r7] +pmovzxbwm1, [r2 + r7 + 32] +movum2, [r3 + r8] +movum3, [r3 + r8 + 64] paddw m0, m2 paddw m1, m3 -paddw m4, m6 -paddw m5, m7 packuswbm0, m1 -packuswbm4, m5 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0,
[x265] [PATCH 071 of 307] x86: AVX512 addAvg_48x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501589225 -19800 # Tue Aug 01 17:37:05 2017 +0530 # Node ID aac415b7223acced7fc844c4a07225704b811df0 # Parent ad756cf6d35f0d1460c5a079bea8781ffd67b7c7 x86: AVX512 addAvg_48x64 for high bit depth AVX2 performance: 10.61x AVX512 performance: 13.18x diff -r ad756cf6d35f -r aac415b7223a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 07 16:30:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 17:37:05 2017 +0530 @@ -2276,6 +2276,7 @@ p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512); p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512); p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512); +p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512); diff -r ad756cf6d35f -r aac415b7223a source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmMon Aug 07 16:30:18 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 01 17:37:05 2017 +0530 @@ -1812,6 +1812,79 @@ movu[r2 + r8 + mmsize], m0 %endmacro +%macro PROCESS_ADDAVG_48x4_HBD_AVX512 0 +movum0, [r0] +movum1, [r1] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2],m0 + +movuym0, [r0 + mmsize] +movuym1, [r1 + mmsize] +paddw ym0, ym1 +pmulhrswym0, ym3 +paddw ym0, ym4 +pmaxsw ym0, ym2 +pminsw ym0, ym5 +movu[r2 + mmsize],ym0 + +movum0, [r0 + r3] +movum1, [r1 + r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r5], m0 + +movuym0, [r0 + r3 + mmsize] +movuym1, [r1 + r4 + mmsize] +paddw ym0, ym1 +pmulhrswym0, ym3 +paddw ym0, ym4 +pmaxsw ym0, ym2 +pminsw ym0, ym5 +movu[r2 + r5 + mmsize], ym0 + +movum0, [r0 + 2 * r3] +movum1, [r1 + 2 * r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + 2 * 
r5], m0 + +movuym0, [r0 + 2 * r3 + mmsize] +movuym1, [r1 + 2 * r4 + mmsize] +paddw ym0, ym1 +pmulhrswym0, ym3 +paddw ym0, ym4 +pmaxsw ym0, ym2 +pminsw ym0, ym5 +movu[r2 + 2 * r5 + mmsize], ym0 + +movum0, [r0 + r6] +movum1, [r1 + r7] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r8], m0 + +movuym0, [r0 + r6 + mmsize] +movuym1, [r1 + r7 + mmsize] +paddw ym0, ym1 +pmulhrswym0, ym3 +paddw ym0, ym4 +pmaxsw ym0, ym2 +pminsw ym0, ym5 +movu[r2 + r8 + mmsize], ym0 +%endmacro ;- ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) ;- @@ -1874,6 +1947,28 @@ ADDAVG_W64_HBD_AVX512 32 ADDAVG_W64_HBD_AVX512 48 ADDAVG_W64_HBD_AVX512 64 + +INIT_ZMM avx512 +cglobal addAvg_48x64, 6,9,6 +vbroadcasti32x8m4, [pw_ %+ ADDAVG_ROUND] +vbroadcasti32x8m5, [pw_pixel_max] +vbroadcasti32x8m3, [pw_ %+ ADDAVG_FACTOR] +pxorm2, m2 +add r3, r3 +add r4, r4 +add r5, r5 +lea r6, [3 * r3] +lea r7, [3 * r4] +lea r8, [3 * r5] + +%rep 15 +PROCESS_ADDAVG_48x4_H
[x265] [PATCH 069 of 307] x86: AVX512 pixel_var_32x32
# HG changeset patch # User Vignesh Vijayakumar # Date 1501843838 -19800 # Fri Aug 04 16:20:38 2017 +0530 # Node ID 039ed71e123c3e14bfaabbe3aada944157784b36 # Parent c5b5b7cb9bbef4365692bfaf05a2a83796d5f1b0 x86: AVX512 pixel_var_32x32 AVX2 performance : 9.15x AVX512 performance : 13.49x diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 04 14:27:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 04 16:20:38 2017 +0530 @@ -3929,6 +3929,7 @@ p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512); p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); +p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512); p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512); p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512); p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512); diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Fri Aug 04 14:27:51 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Fri Aug 04 16:20:38 2017 +0530 @@ -7105,6 +7105,82 @@ RET %endif ; !HIGH_BIT_DEPTH +%macro PROCESS_VAR_32x8_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + 2 * r1] +pmovzxbwm3, [r0 + r2] + +paddw m4, m0 +paddw m4, m1 +paddw m4, m2 +paddw m4, m3 +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m5, m0 +paddd m5, m1 +paddd m5, m2 +paddd m5, m3 + +lea r0, [r0 + r1 * 4] + +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + 2 * r1] +pmovzxbwm3, [r0 + r2] + +paddw m4, m0 +paddw m4, m1 +paddw m4, m2 +paddw m4, m3 +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m5, m0 +paddd m5, m1 +paddd m5, m2 +paddd m5, m3 +%endmacro + +%macro PROCESS_VAR_AVX512_END 0 +vextracti32x8 ym0, m4, 1 +vextracti32x8 ym1, m5, 1 +paddw ym4, ym0 +paddd ym5, ym1 +vextracti32x4 xm0, m4, 1 +vextracti32x4 xm1, m5, 1 +paddw xm4, xm0 +paddd xm5, xm1 +HADDW xm4, xm2 +HADDD xm5, xm1 +punpckldq xm4, xm5 
+movq rax, xm4 +%endmacro + +%if HIGH_BIT_DEPTH==0 +;- +; int pixel_var_wxh( uint8_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_var_32x32, 2,4,6 +pxor m4, m4; sum +pxor m5, m5; sum squared +lea r2, [3 * r1] + +PROCESS_VAR_32x8_AVX512 +lea r0, [r0 + r1 * 4] +PROCESS_VAR_32x8_AVX512 +lea r0, [r0 + r1 * 4] +PROCESS_VAR_32x8_AVX512 +lea r0, [r0 + r1 * 4] +PROCESS_VAR_32x8_AVX512 +PROCESS_VAR_AVX512_END +RET +%endif + %macro VAR_AVX512_CORE 1 ; accum %if %1 paddwm0, m2 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 085 of 307] x86: AVX512 interp_4tap_horiz_ps_32xN
# HG changeset patch # User Jayashri Murugan # Date 1502434970 -19800 # Fri Aug 11 12:32:50 2017 +0530 # Node ID 6f811dfd5690866f4c432911982a30665dc0e91c # Parent 951e9a16296e5d1e528c0083630fde8122bd15c1 x86: AVX512 interp_4tap_horiz_ps_32xN Color Space i444 Size| AVX2 performance | AVX512 performance 32x8| 25.91x | 38.35x 32x16 | 25.45x | 32.02x 32x24 | 25.80x | 32.73x 32x32 | 33.49x | 38.02x 32x64 | 27.42x | 36.20x Color Space i422 Size| AVX2 performance | AVX512 performance 32x16 | 24.74x | 33.95x 32x32 | 33.31x | 34.28x 32x48 | 27.11x | 35.98x 32x64 | 27.32x | 35.02x Color Space i420 Size| AVX2 performance | AVX512 performance 32x8| 27.16x | 36.68x 32x16 | 24.87x | 31.40x 32x24 | 25.98x | 34.08x 32x32 | 33.01x | 34.71x diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 10 22:47:55 2017 -0700 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 11 12:32:50 2017 +0530 @@ -4034,6 +4034,25 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512); + +p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); + +//i422 chroma_hps +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = 
PFX(interp_4tap_horiz_ps_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx512); + +//i420 chroma_hps +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); + } #endif } diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Thu Aug 10 22:47:55 2017 -0700 +++ b/source/common/x86/ipfilter8.asm Fri Aug 11 12:32:50 2017 +0530 @@ -10010,7 +10010,7 @@ %endmacro ;- -; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +; void interp_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;- %macro IPFILTER_CHROMA_PS_64xN_AVX512 1 INIT_ZMM avx512 @@ -10059,6 +10059,74 @@ IPFILTER_CHROMA_PS_64xN_AVX512 48 IPFILTER_CHROMA_PS_64xN_AVX512 16 +%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0 +movu ym6, [r0] +vinserti32x8 m6, [r0 + 4], 1 +pshufb m7, m6, m2 +pshufb m6, m6, m1 +pmaddubsw m6, m0 +pmaddubsw m7, m0 +pmaddwdm6, m3 +pmaddwdm7, m3 + +packssdw m6, m7 +psubw m6, m4 +vpermq m6, m8, m6 +movu [r2], m6 +%endmacro + +;- +; void interp_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +;- +%macro IPFILTER_
[x265] [PATCH 091 of 307] x86: AVX512 cleanup interp_4tap_horiz_pp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1504242228 -19800 # Fri Sep 01 10:33:48 2017 +0530 # Node ID dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd # Parent d9200885420957bccd4edea62bf87bbe8831bc62 x86: AVX512 cleanup interp_4tap_horiz_pp_32xN diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sun Aug 13 15:12:25 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Sep 01 10:33:48 2017 +0530 @@ -4011,22 +4011,29 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); +//i444 chroma_hpp p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512); - p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); - p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +//i422 chroma_hpp p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); + +//i420 chroma_hpp +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); + p.weight_pp = PFX(weight_pp_avx512); //i444 chroma_hps diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Sun Aug 13 15:12:25 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Sep 01 10:33:48 2017 +0530 @@ -150,8 +150,6 @@ const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 -const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 - ALIGN 64 const interp8_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7 @@ -9881,31 +9879,30 @@ ; m9 - store shuffle order table movu ym5, [r0] -vinserti32x8 m5, [r0 + 4], 1 +vinserti32x8 m5, [r0 + r1], 1 +movu ym7, [r0 + 4] +vinserti32x8 m7, [r0 + r1 + 4], 1 pshufb m6, m5, m2 -pshufb m5, m5, m1 +pshufb m5, m1 +pshufb m8, m7, m2 +pshufb m7, m1 + pmaddubsw m5, m0 +pmaddubsw m7, m0 +pmaddwdm5, m3 +pmaddwdm7, m3 + pmaddubsw m6, m0 -pmaddwdm5, m3 +pmaddubsw m8, m0 pmaddwdm6, m3 - -movu ym7, [r0 + r1] -vinserti32x8 m7, [r0 + r1 + 4], 1 - -pshufb m8, m7, m2 -pshufb m7, m7, m1 -pmaddubsw m7, m0 -pmaddubsw m8, m0 -pmaddwdm7, m3 pmaddwdm8, m3 -packssdw m5, m6 -packssdw m7, m8 +packssdw m5, m7 +packssdw m6, m8 pmulhrsw m5, m4 -pmulhrsw m7, m4 -packuswb m5, m7 -vpermd m5, m9, m5 +pmulhrsw m6, m4 +packuswb m5, m6 movu [r2], ym5 vextracti32x8[r2 + r3], m5,1
[x265] [PATCH 097 of 307] x86: AVX512 convert_p2s link 32xN and 64xN chroma_444 primitives
# HG changeset patch # User Vignesh Vijayakumar # Date 1503901717 -19800 # Mon Aug 28 11:58:37 2017 +0530 # Node ID bf199a5eca5be148be8a0c91cd9f2e8e0e908059 # Parent 0355f0128b7d713c4a21c91d3cc5bed1e8b43c47 x86: AVX512 convert_p2s link 32xN and 64xN chroma_444 primitives diff -r 0355f0128b7d -r bf199a5eca5b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 24 12:20:07 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 28 11:58:37 2017 +0530 @@ -2253,6 +2253,15 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = PFX(filterPixelToShort_32x8_avx2); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s = PFX(filterPixelToShort_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s = PFX(filterPixelToShort_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512); p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); @@ -4041,6 +4050,15 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = PFX(filterPixelToShort_32x8_avx2); 
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s = PFX(filterPixelToShort_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s = PFX(filterPixelToShort_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512); p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 093 of 307] x86: AVX512 addAvg_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1503385834 -19800 # Tue Aug 22 12:40:34 2017 +0530 # Node ID 738f07186eb1d4bca84e9acdf70921ee9e2fee92 # Parent ed1932a414bf5962bbeccfd5c9e208b7db90f77f x86: AVX512 addAvg_32xN Size | AVX2 performance | AVX512 performance -- 32x8 | 15.31x | 19.98x 32x16 | 15.14x | 23.25x 32x24 | 14.65x | 23.95x 32x32 | 15.41x | 24.76x 32x64 | 14.56x | 24.53x diff -r ed1932a414bf -r 738f07186eb1 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sun Aug 13 18:18:28 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 22 12:40:34 2017 +0530 @@ -3964,6 +3964,19 @@ p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512); p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); +p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx512); +p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512); +p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512); +p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx512); p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512); diff -r ed1932a414bf -r 738f07186eb1 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmSun Aug 13 18:18:28 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 22 12:40:34 2017 +0530 @@ -3317,6 +3317,24 @@ movu[r2 + r5], 
m0 %endmacro +%macro PROCESS_ADDAVG_32x2_AVX512 0 +movum0, [r0] +movum1, [r1] +movum2, [r0 + r3] +movum3, [r1 + r4] + +paddw m0, m1 +pmulhrswm0, m4 +paddw m0, m5 +paddw m2, m3 +pmulhrswm2, m4 +paddw m2, m5 + +packuswbm0, m2 +vpermq m0, m6, m0 +movu[r2], ym0 +vextracti32x8 [r2 + r5], m0, 1 +%endmacro ; ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) ; @@ -3344,6 +3362,32 @@ ADDAVG_W64_AVX512 32 ADDAVG_W64_AVX512 48 ADDAVG_W64_AVX512 64 + +%macro ADDAVG_W32_AVX512 1 +INIT_ZMM avx512 +cglobal addAvg_32x%1, 6,6,7 +vbroadcasti32x8 m4, [pw_256] +vbroadcasti32x8 m5, [pw_128] +movam6, [shuf_avx512] +add r3, r3 +add r4, r4 + +%rep %1/2 - 1 +PROCESS_ADDAVG_32x2_AVX512 +lea r2, [r2 + 2 * r5] +lea r0, [r0 + 2 * r3] +lea r1, [r1 + 2 * r4] +%endrep +PROCESS_ADDAVG_32x2_AVX512 +RET +%endmacro + +ADDAVG_W32_AVX512 8 +ADDAVG_W32_AVX512 16 +ADDAVG_W32_AVX512 24 +ADDAVG_W32_AVX512 32 +ADDAVG_W32_AVX512 48 +ADDAVG_W32_AVX512 64 ;- ; addAvg avx512 code end ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 098 of 307] x86: AVX512 pixel_avg_weight_64xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1503908210 -19800 # Mon Aug 28 13:46:50 2017 +0530 # Node ID 45e4dd746cfd9380dbe2344a5754a6ff6e9feed5 # Parent bf199a5eca5be148be8a0c91cd9f2e8e0e908059 x86: AVX512 pixel_avg_weight_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 41.70x | 60.98x 64x32 | 36.75x | 68.91x 64x48 | 37.31x | 59.07x 64x64 | 37.92x | 58.85x diff -r bf199a5eca5b -r 45e4dd746cfd source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 28 11:58:37 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 28 13:46:50 2017 +0530 @@ -4159,6 +4159,11 @@ p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512); p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512); +p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512); +p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512); +p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512); +p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512); + } #endif } diff -r bf199a5eca5b -r 45e4dd746cfd source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmMon Aug 28 11:58:37 2017 +0530 +++ b/source/common/x86/mc-a.asmMon Aug 28 13:46:50 2017 +0530 @@ -5020,6 +5020,58 @@ RET %endif +;- +;pixel_avg_pp avx512 code start +;- +%macro PROCESS_PIXELAVG_64x4_AVX512 0 +movum0, [r2] +movum2, [r2 + r3] +movum1, [r4] +movum3, [r4 + r5] +pavgb m0, m1 +pavgb m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + +movum0, [r2 + 2 * r3] +movum2, [r2 + r7] +movum1, [r4 + 2 * r5] +movum3, [r4 + r8] +pavgb m0, m1 +pavgb m2, m3 +movu[r0 + 2 * r1], m0 +movu[r0 + r6], m2 +%endmacro + +;--- +;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) +;--- +%if ARCH_X86_64 && BIT_DEPTH == 8 +%macro PIXEL_AVG_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal pixel_avg_64x%1, 6, 9, 4 +lea r6, [3 * r1] +lea r7, [3 * r3] +lea r8, [3 * r5] + +%rep %1/4 - 1 +PROCESS_PIXELAVG_64x4_AVX512 +lea r2, [r2 + 
r3 * 4] +lea r4, [r4 + r5 * 4] +lea r0, [r0 + r1 * 4] +%endrep +PROCESS_PIXELAVG_64x4_AVX512 +RET +%endmacro + +PIXEL_AVG_64xN_AVX512 16 +PIXEL_AVG_64xN_AVX512 32 +PIXEL_AVG_64xN_AVX512 48 +PIXEL_AVG_64xN_AVX512 64 +%endif +;- +;pixel_avg_pp avx512 code end +;- ;= ; pixel avg2 ;= ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 092 of 307] x86: AVX512 interp_4tap_horiz_pp_16xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502628508 -19800 # Sun Aug 13 18:18:28 2017 +0530 # Node ID ed1932a414bf5962bbeccfd5c9e208b7db90f77f # Parent dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd x86: AVX512 interp_4tap_horiz_pp_16xN Color Space i444 Size | AVX2 performance | AVX512 performance -- 16x4 | 12.87x | 20.91x 16x8 | 18.03x | 27.40x 16x12 | 16.95x | 24.97x 16x16 | 18.82x | 27.13x 16x32 | 16.21x | 25.76x 16x64 | 17.41x | 26.04x diff -r dbfcd0ee40e9 -r ed1932a414bf source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Sep 01 10:33:48 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Sun Aug 13 18:18:28 2017 +0530 @@ -4021,14 +4021,30 @@ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512); //i422 chroma_hpp +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x24_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); //i420 chroma_hpp +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); diff -r dbfcd0ee40e9 -r ed1932a414bf source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Sep 01 10:33:48 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Sun Aug 13 18:18:28 2017 +0530 @@ -9907,6 +9907,48 @@ vextracti32x8[r2 + r3], m5,1 %endmacro +%macro PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512 0 +; register map +; m0 - interpolate coeff +; m1, m2 - shuffle order table +; m3 - constant word 1 +; m4 - constant word 512 + +movu xm5, [r0] +vinserti32x4 m5, [r0 + r1],1 +vinserti32x4 m5, [r0 + 2 * r1],2 +vinserti32x4 m5, [r0 + r6],3 +pshufb m6, m5, m2 +pshufb m5, m1 + +movu xm7, [r0 + 4] +vinserti32x4 m7, [r0 + r1 + 4],1 +vinserti32x4 m7, [r0 + 2 * r1 + 4],2 +vinserti32x4 m7, [r0 + r6 + 4],3 +pshufb m8, m7, m2 +pshufb m7, m1 + +pmaddubsw 
m5, m0 +pmaddubsw m7, m0 +pmaddwdm5, m3 +pmaddwdm7, m3 + +pmaddubsw m6
[x265] [PATCH 090 of 307] x86: AVX512 interp_8tap_horiz_pp_16xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502617345 -19800 # Sun Aug 13 15:12:25 2017 +0530 # Node ID d9200885420957bccd4edea62bf87bbe8831bc62 # Parent 4be3c35eb7510f269a548f248e4f5904b4107d74 x86: AVX512 interp_8tap_horiz_pp_16xN Size | AVX2 performance | AVX512 performance -- 16x4 | 19.10x | 26.27x 16x8 | 19.37x | 26.59x 16x12 | 19.99x | 32.66x 16x16 | 19.13x | 31.47x 16x32 | 18.94x | 33.38x 16x64 | 18.07x | 29.97x diff -r 4be3c35eb751 -r d92008854209 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Sep 01 10:24:43 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Sun Aug 13 15:12:25 2017 +0530 @@ -4053,6 +4053,12 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); +p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512); +p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512); +p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512); +p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512); +p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512); +p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512); p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512); p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512); p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512); diff -r 4be3c35eb751 -r d92008854209 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Sep 01 10:24:43 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Sun Aug 13 15:12:25 2017 +0530 @@ -10233,6 +10233,65 @@ vextracti32x8 [r2 + r3], m7, 1 %endmacro +%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3, m4 shuffle order table +; m5 - pw_1 +; m6 - pw_512 + +movu xm7,[r0] +vinserti32x4 m7,[r0 + r1], 1 +vinserti32x4 m7,[r0 + 2 * r1], 2 
+vinserti32x4 m7,[r0 + r6], 3 + +pshufbm8,m7,m3 +pshufbm7,m2 + +movu xm9,[r0 + 8] +vinserti32x4 m9,[r0 + r1 + 8], 1 +vinserti32x4 m9,[r0 + 2 * r1 + 8], 2 +vinserti32x4 m9,[r0 + r6 + 8], 3 + +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu [r2], xm7 +vextracti32x4 [r2 + r3], m7,1 +vextracti32x4 [r2 + 2 * r3], m7,2 +vextracti32x4 [r2 + r7], m7,3 +%endmacro + %macro IPFILTER_LUMA_64xN_AVX512 1 INIT_ZMM avx512 cglobal interp_8tap_horiz_pp_64x%1, 4,6,13 @@ -10299,6 +10358,43 @@ IPFILTER_LUMA_32xN_AVX512 24 IPFILTER_LUMA_32xN_AVX512 32 IPFILTER_LUMA_32xN_AVX512 64 + +%macro IPFILTER_LUMA_16xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_8tap_horiz_pp_16x%1, 4,8,14 +sub r0,3 +mov r4d, r4m +lea r6,[3 * r1] +lea r7,[3 * r3] +%ifdef PIC +lea r5,[tab_LumaCoeff] +vpbroadcastd m0,[r5 + r4 * 8] +vpbroadcastd m1,[r5 + r4 * 8 + 4] +%else +vpbroadcastd m0,[tab_LumaCoeff + r4 * 8] +vpbroadcastd m1,[tab_LumaCoeff + r4 * 8 + 4] +%endif +vbroadcasti32x8 m2,[interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8 m3,[interp4_horiz_shuf_load3_avx512] +vbroadcasti32x8 m4,[interp4_horiz_shuf_load2_avx512] +vpbroadcast
[x265] [PATCH 106 of 307] x86: AVX512 interp_4tap_horiz_pp_8xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1504171458 -19800 # Thu Aug 31 14:54:18 2017 +0530 # Node ID 1fb1948309a0a9218a07e060300b9d5a7ff58321 # Parent 9928b3e5b4d4235bea9ffb22434446e68c3aacdb x86: AVX512 interp_4tap_horiz_pp_8xN for high bit depth Color Space i444 Size | AVX2 performance | AVX512 performance -- 8x4 | 5.14x| 9.51x 8x8 | 6.20x| 12.75x 8x16 | 6.32x| 12.44x 8x32 | 6.01x| 13.68x diff -r 9928b3e5b4d4 -r 1fb1948309a0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 31 14:24:24 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 31 14:54:18 2017 +0530 @@ -2354,6 +2354,10 @@ p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512); p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); @@ -2364,6 +2368,12 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hpp = PFX(interp_4tap_horiz_pp_8x12_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hpp = PFX(interp_4tap_horiz_pp_8x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx512); @@ -2374,6 +2384,10 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); diff -r 9928b3e5b4d4 -r 1fb1948309a0 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Thu Aug 31 14:24:24 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Thu Aug 31 14:54:18 2017 +0530 @@ -5082,6 +5082,49 @@ ;- ;ipfilter_chroma_avx512 code start ;- +%macro PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3 shuffle order table +; m4 - pd_32 +; m5 - zero +; m6 - pw_pixel_max + +movuxm7, [r0] +vinserti32x4m7,[r0 + r1], 1 +vinserti32x4m7,[r0 + 2 * r1], 2 +vinserti32x4m7,[r0 + r6], 3 + +pshufb m9,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m9,m1 +paddd 
m7,m9 +paddd m7,m4 +psrad m7,6 + +movuxm8, [r0 + 8] +vinserti32x4
[x265] [PATCH 100 of 307] x86: AVX512 interp_8tap_horiz_pp_48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1503912578 -19800 # Mon Aug 28 14:59:38 2017 +0530 # Node ID 562c00d2153193eec85ab907b60eeb5aca7cc609 # Parent a7bf0a24cfc8eb8edc95d340b240b91d03dac5bd x86: AVX512 interp_8tap_horiz_pp_48x64 AVX2 performance: 19.57x AVX512 performance : 35.25x diff -r a7bf0a24cfc8 -r 562c00d21531 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 28 14:46:28 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 28 14:59:38 2017 +0530 @@ -4159,6 +4159,7 @@ p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512); p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512); p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512); +p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512); p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512); p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512); diff -r a7bf0a24cfc8 -r 562c00d21531 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Aug 28 14:46:28 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Aug 28 14:59:38 2017 +0530 @@ -10489,6 +10489,151 @@ vextracti32x4 [r2 + r7], m7,3 %endmacro +%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3, m4 shuffle order table +; m5 - pw_1 +; m6 - pw_512 + +movu ym7,[r0] +vinserti32x8 m7,[r0 + r1], 1 +movu ym9,[r0 + 8] +vinserti32x8 m9,[r0 + r1 + 8], 1 + +pshufbm8,m7,m3 +pshufbm7,m2 +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu 
[r2], ym7 +vextracti32x8 [r2 + r3], m7, 1 + +movu ym7,[r0 + 2 * r1] +vinserti32x8 m7,[r0 + r6], 1 +movu ym9,[r0 + 2 * r1 + 8] +vinserti32x8 m9,[r0 + r6 + 8], 1 + +pshufbm8,m7,m3 +pshufbm7,m2 +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu [r2 + 2 * r3], ym7 +vextracti32x8 [r2 + r7], m7,1 + +movu xm7,[r0 + mmsize/2] +vinserti32x4 m7,[r0 + r1 + mmsize/2], 1 +vinserti32x4 m7,[r0 + 2 * r1 + mmsize/2], 2 +vinserti32x4 m7,[r0 + r6 + mmsize/2], 3 + +pshufbm8,m7,m3 +pshufbm7,m2 + +movu xm9,[r0 + 40] +vinserti32x4 m9,[r0 + r1 + 40], 1 +vinserti32x4 m9,[r0 + 2 * r1 + 40], 2 +vinserti32x4 m9,[r0 + r6 + 40],
[x265] [PATCH 089 of 307] x86: AVX512 interp_8tap_horiz_pp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1504241683 -19800 # Fri Sep 01 10:24:43 2017 +0530 # Node ID 4be3c35eb7510f269a548f248e4f5904b4107d74 # Parent 354f848c3793b459c005667cdf7158eb6394eb0f x86: AVX512 interp_8tap_horiz_pp_32xN Size | AVX2 performance | AVX512 performance -- 32x8 | 18.92x | 37.84x 32x16 | 17.46x | 36.15x 32x24 | 17.77x | 35.98x 32x32 | 17.91x | 36.69x 32x64 | 18.10x | 35.47x diff -r 354f848c3793 -r 4be3c35eb751 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 11 17:18:16 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Sep 01 10:24:43 2017 +0530 @@ -4052,6 +4052,12 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); + +p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512); +p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512); +p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512); +p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx512); +p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx512); p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512); p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512); p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512); diff -r 354f848c3793 -r 4be3c35eb751 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Aug 11 17:18:16 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Sep 01 10:24:43 2017 +0530 @@ -10182,6 +10182,57 @@ movu [r2], m7 %endmacro +%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3, m4 shuffle order table +; m5 - pw_1 +; m6 - pw_512 + +movu ym7,[r0] +vinserti32x8 m7,[r0 + r1], 1 +movu ym9,[r0 + 8] +vinserti32x8 m9,[r0 + 
r1 + 8], 1 + +pshufbm8,m7,m3 +pshufbm7,m2 +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu [r2], ym7 +vextracti32x8 [r2 + r3], m7, 1 +%endmacro + %macro IPFILTER_LUMA_64xN_AVX512 1 INIT_ZMM avx512 cglobal interp_8tap_horiz_pp_64x%1, 4,6,13 @@ -10214,6 +10265,40 @@ IPFILTER_LUMA_64xN_AVX512 32 IPFILTER_LUMA_64xN_AVX512 48 IPFILTER_LUMA_64xN_AVX512 64 + +%macro IPFILTER_LUMA_32xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_8tap_horiz_pp_32x%1, 4,6,13 +sub r0,3 +mov r4d, r4m +%ifdef PIC +lea r5,[tab_LumaCoeff] +vpbroadcastd m0,[r5 + r4 * 8] +vpbroadcastd m1,[r5 + r4 * 8 + 4] +%else +vpbroadcastd m0,[tab_LumaCoeff + r4 * 8] +vpbroadcastd m1,[tab_LumaCoeff + r4 * 8 + 4] +%endif +vbroadcasti32x8 m2,[interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8 m3,[interp4_horiz_shuf_load3_avx512] +vbroadcasti32x8 m4,[interp4_horiz_shuf_load2_avx512] +vpbroadcastd m5,[pw_1] +vbroadcasti32x8 m6,[pw_512] + +%rep %1/2 -1 +PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 +lea r0,[r0 + 2 * r1] +lea r2,[r2 + 2 * r3] +%endrep +PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 +RET +%endmacro + +IPFILTER_LUMA_32xN_AVX512 8 +IPFILTER_LUMA_32xN_AVX512 16 +IPFILTER_LUMA_32xN_AVX512 24 +IPFILTER_LUMA_32xN_AVX512 32 +IPFILTER_LUMA_32xN_