Re: [x265] [PATCH 000 of 307 ] AVX-512 implementation in x265
Your request is on the way, soon we will share the performance related details. Thanks. Regards, Praveen Tiwari On Fri, Apr 6, 2018 at 9:36 PM, Vittorio Giovara wrote: > just curious, what kind of general speed improvement does this give? > I could have missed them in the series, but it would be nice to have some > sort of benchmarks > thanks > Vittorio > > On Sat, Apr 7, 2018 at 4:29 AM, wrote: > >> This series of patches enables AVX-512 in x265. Use CLI option --asm >> avx512 to enable AVX-512 kernels. >> ___ >> x265-devel mailing list >> x265-devel@videolan.org >> https://mailman.videolan.org/listinfo/x265-devel >> > > > > -- > Vittorio > > ___ > x265-devel mailing list > x265-devel@videolan.org > https://mailman.videolan.org/listinfo/x265-devel > > ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
Re: [x265] [PATCH 300 of 307] x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2
Sorry, I missed a line; resending with an additional comment. At 2018-04-07 01:27:34, "chen" wrote: At 2018-04-06 21:17:37, mythr...@multicorewareinc.com wrote: ># HG changeset patch ># User Jayashree ># Date 1517283539 28800 ># Mon Jan 29 19:38:59 2018 -0800 ># Node ID 3c6e5ce07dbca7f967e4b5b62fe450979da3bf81 ># Parent 624c83571d1df840e1206c46e589044fbf87ff32 >x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2 > >count_nonzero[16x16] 18.88x -> 23.04x > >+;- >+; int x265_count_nonzero_16x16_avx512(const int16_t *quantCoeff); >+;- >+INIT_ZMM avx512 >+cglobal count_nonzero_16x16, 1,4,2 >+mov r1, 0xFFFFFFFFFFFFFFFF >+kmovq k2, r1 https://www.cs.utexas.edu/~hunt/class/2017-spring/cs350c/documents/Intel-x86-Docs/64-ia-32-architectures-instruction-set-extensions-reference-manual.pdf 2.5.1.1 Opmask Register K0 The only exception to the opmask rules described above is that opmask k0 can not be used as a predicate operand. Opmask k0 cannot be encoded as a predicate operand for a vector operation; the encoding value that would select opmask k0 will instead selects an implicit opmask value of 0xFFFFFFFFFFFFFFFF, thereby effectively disabling masking. Opmask register k0 can still be used for any instruction that takes opmask register(s) as operand(s) (either source or destination). >+xor r3, r3 >+pxorm0, m0 >+ >+%assign x 0 >+%rep 4 This unrolls only 4 times, so the unroll is unnecessary here. I suggest loading all of the bytes at the same time, so the memory latency can be hidden behind the calculation instructions. >+movum1, [r0 + x] >+vpacksswb m1, [r0 + x + 64] >+%assign x x+128 >+vpcmpb k1 {k2}, m1, m0, 0100b Could you please declare a new macro/const? It is difficult for developers to understand that '0100b' (4) means NE (per Intel's documentation). >+kmovq r1, k1 >+popcnt r2, r1 >+add r3d, r2d >+%endrep >+mov eax, r3d >+ >+RET >+ ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 105 of 307] x86: AVX512 interp_4tap_horiz_pp_48x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1504169664 -19800 # Thu Aug 31 14:24:24 2017 +0530 # Node ID 9928b3e5b4d4235bea9ffb22434446e68c3aacdb # Parent 052b8b5061d84b791489c01e114a0441f96e4ec2 x86: AVX512 interp_4tap_horiz_pp_48x64 for high bit depth AVX2 performance : 9.46x AVX512 performance : 18.97x diff -r 052b8b5061d8 -r 9928b3e5b4d4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 31 13:03:39 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 31 14:24:24 2017 +0530 @@ -2389,6 +2389,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512); } } diff -r 052b8b5061d8 -r 9928b3e5b4d4 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Thu Aug 31 13:03:39 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Thu Aug 31 14:24:24 2017 +0530 @@ -5175,6 +5175,90 @@ movu[r2 + r3], m7 %endmacro +%macro PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3 shuffle order table +; m4 - pd_32 +; m5 - zero +; m6 - pw_pixel_max + +movum7,[r0] +movum8,[r0 + 8] + +pshufb m9,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m9,m1 +paddd m7,m9 +paddd m7,m4 +psrad m7,6 + +pshufb m9,m8,m3 +pshufb m8,m2 +pmaddwd m8,m0 +pmaddwd m9,m1 +paddd m8,m9 +paddd m8,m4 +psrad m8,6 + +packusdwm7,m8 +CLIPW m7,m5,m6 +pshufb m7,m10 +movu[r2], m7 + +movum7,[r0 + r1] +movum8,[r0 + r1 + 8] + +pshufb m9,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m9,m1 +paddd m7,m9 +paddd m7,m4 +psrad m7,6 + +pshufb m9,m8,m3 +pshufb m8,m2 +pmaddwd m8,m0 +pmaddwd m9,m1 +paddd m8,m9 +paddd m8,m4 +psrad m8,6 + +packusdwm7,m8 +CLIPW m7,m5,m6 +pshufb m7,m10 +movu[r2 + r3], m7 + +movuym7, [r0 + 
mmsize] +vinserti32x8m7,[r0 + r1 + mmsize], 1 +movuym8, [r0 + mmsize + 8] +vinserti32x8m8,[r0 + r1 + mmsize + 8], 1 + +pshufb m9,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m9,m1 +paddd m7,m9 +paddd m7,m4 +psrad m7,6 + +pshufb m9,m8,m3 +pshufb m8,m2 +pmaddwd m8,m0 +pmaddwd m9,m1 +paddd m8,m9 +paddd m8,m4 +psrad m8,6 + +packusdwm7,m8 +CLIPW m7,m5,m6 +pshufb m7,m10 +movu[r2 + mmsize], ym7 +vextracti32x8 [r2 + r3 + mmsize], m7,1 +%endmacro + %macro PROCESS_IPFILTER_CHROMA_PP_64x2_AVX512 0 ; register map ; m0 , m1 interpolate coeff @@ -5394,6 +5478,35 @@ IPFILTER_CHROMA_AVX512_64xN 32 IPFILTER_CHROMA_AVX512_64xN 48 IPFILTER_CHROMA_AVX512_64xN 64 + +INIT_ZMM avx512 +cglobal interp_4tap_horiz_pp_48x64, 5,6,11 +add r1d, r1d +add r3d, r3d +sub r0, 2 +mov r4d, r4m +%ifdef PIC +lea r5, [tab_ChromaCoeff] +vpbroadcastdm0, [r5 + r4 * 8] +vpbroadcastdm1, [r5 + r4 * 8 + 4] +%else +vpbroadcastdm0, [tab_ChromaCoeff + r4 * 8] +vpbroadcastdm1, [tab_ChromaCoeff + r4 * 8 + 4] +%endif +vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512] +vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512] +vbroadcasti32x8 m4, [pd_32] +pxorm5, m5 +vbroadcasti32x8 m6, [pw_pixel_max] +vbroadcasti32x8 m10, [interp8_hpp_shuf1_store_avx512] + +%rep 31 +PROCESS_IPFILTER_CHROMA_PP_48x2_AVX512 +lea r0, [r0 + 2 * r1] +lea
[x265] [PATCH 095 of 307] x86: AVX512 copy_pp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1503462961 -19800 # Wed Aug 23 10:06:01 2017 +0530 # Node ID 31a180bcef33fae436ad7e3aa4378b283a86d56a # Parent 7868f1cb521d554dc77d768ec1f838e0b29824e4 x86: AVX512 copy_pp_32xN Size | AVX2 performance | AVX512 performance -- 32x16 | 1.63x | 2.58x 32x24 | 2.51x | 2.87x 32x32 | 2.48x | 2.95x 32x64 | 2.03x | 2.53x This patch also clean up code for 64xN diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 22 13:51:33 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Aug 23 10:06:01 2017 +0530 @@ -3965,6 +3965,18 @@ p.pu[LUMA_64x32].copy_pp = PFX(blockcopy_pp_64x32_avx512); p.pu[LUMA_64x48].copy_pp = PFX(blockcopy_pp_64x48_avx512); p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512); +p.pu[LUMA_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512); +p.pu[LUMA_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512); +p.pu[LUMA_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512); +p.pu[LUMA_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512); + +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].copy_pp = PFX(blockcopy_pp_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].copy_pp = PFX(blockcopy_pp_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].copy_pp = PFX(blockcopy_pp_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].copy_pp = PFX(blockcopy_pp_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_avx512); p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512); p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512); diff -r 7868f1cb521d -r 31a180bcef33 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Aug 22 13:51:33 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Wed Aug 23 
10:06:01 2017 +0530 @@ -1107,7 +1107,7 @@ BLOCKCOPY_PP_W64_H4_avx 64, 64 ;-- -; Macro to calculate blockcopy_pp_64x4_avx512 +; blockcopy_pp avx512 code start ;-- %macro PROCESS_BLOCKCOPY_PP_64X4_avx512 0 movum0, [r2] @@ -1121,16 +1121,28 @@ movu[r0 + r5] , m3 %endmacro +%macro PROCESS_BLOCKCOPY_PP_32X4_avx512 0 +movu ym0, [r2] +vinserti32x8 m0, [r2 + r3], 1 +movu ym1, [r2 + 2 * r3] +vinserti32x8 m1, [r2 + r4], 1 + +movu [r0] , ym0 +vextracti32x8 [r0 + r1] , m0,1 +movu [r0 + 2 * r1] ,ym1 +vextracti32x8 [r0 + r5] , m1,1 +%endmacro + ;-- ; void blockcopy_pp_64x%1(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride) ;-- %macro BLOCKCOPY_PP_W64_H4_avx512 1 INIT_ZMM avx512 -cglobal blockcopy_pp_64x%1, 4, 4, 6 +cglobal blockcopy_pp_64x%1, 4, 6, 4 lear4, [3 * r3] lear5, [3 * r1] -%rep %1/4 - 1 +%rep %1/4 - 1 PROCESS_BLOCKCOPY_PP_64X4_avx512 lea r2, [r2 + 4 * r3] lea r0, [r0 + 4 * r1] @@ -1145,7 +1157,30 @@ BLOCKCOPY_PP_W64_H4_avx512 48 BLOCKCOPY_PP_W64_H4_avx512 64 - +%macro BLOCKCOPY_PP_W32_H4_avx512 1 +INIT_ZMM avx512 +cglobal blockcopy_pp_32x%1, 4, 6, 2 +lear4, [3 * r3] +lear5, [3 * r1] + +%rep %1/4 - 1 +PROCESS_BLOCKCOPY_PP_32X4_avx512 +lea r2, [r2 + 4 * r3] +lea r0, [r0 + 4 * r1] +%endrep +PROCESS_BLOCKCOPY_PP_32X4_avx512 +RET +%endmacro + +BLOCKCOPY_PP_W32_H4_avx512 8 +BLOCKCOPY_PP_W32_H4_avx512 16 +BLOCKCOPY_PP_W32_H4_avx512 24 +BLOCKCOPY_PP_W32_H4_avx512 32 +BLOCKCOPY_PP_W32_H4_avx512 48 +BLOCKCOPY_PP_W32_H4_avx512 64 +;-- +; blockcopy_pp avx512 code end +;-- ;- ; void blockcopy_sp_2x4(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 101 of 307] x86: revoke some changes in ipfilter16
# HG changeset patch # User Aasaipriya Chandran# Date 1522962868 25200 # Thu Apr 05 14:14:28 2018 -0700 # Node ID 1a31df496144c526fd5eba9d960bb286a81ae2d5 # Parent 562c00d2153193eec85ab907b60eeb5aca7cc609 x86: revoke some changes in ipfilter16 diff -r 562c00d21531 -r 1a31df496144 source/common/x86/h-ipfilter16.asm --- a/source/common/x86/h-ipfilter16.asmMon Aug 28 14:59:38 2017 +0530 +++ b/source/common/x86/h-ipfilter16.asmThu Apr 05 14:14:28 2018 -0700 @@ -47,7 +47,7 @@ h_pd_524800:times 8 dd 524800 -tab_LumaCoeff:dw 0, 0, 0, 64, 0, 0, 0, 0 +h_tab_LumaCoeff:dw 0, 0, 0, 64, 0, 0, 0, 0 dw -1, 4, -10, 58, 17, -5, 1, 0 dw -1, 4, -11, 40, 40, -11, 4, -1 dw 0, 1, -5, 17, 58, -10, 4, -1 @@ -207,10 +207,10 @@ add r3d,r3d %ifdef PIC -lea r6, [tab_LumaCoeff] +lea r6, [h_tab_LumaCoeff] movam0, [r6 + r4] %else -movam0, [tab_LumaCoeff + r4] +movam0, [h_tab_LumaCoeff + r4] %endif %ifidn %3, pp @@ -625,10 +625,10 @@ add r3, r3 %ifdef PIC -lea r6, [tab_LumaCoeff] +lea r6, [h_tab_LumaCoeff] movam0, [r6 + r4] %else -movam0, [tab_LumaCoeff + r4] +movam0, [h_tab_LumaCoeff + r4] %endif %ifidn %3, pp @@ -712,10 +712,10 @@ shl r4d, 4 %ifdef PIC -lea r6, [tab_LumaCoeff] +lea r6, [h_tab_LumaCoeff] movam0, [r6 + r4] %else -movam0, [tab_LumaCoeff + r4] +movam0, [h_tab_LumaCoeff + r4] %endif %ifidn %3, pp @@ -815,10 +815,10 @@ shl r4d, 4 %ifdef PIC -lea r6, [tab_LumaCoeff] +lea r6, [h_tab_LumaCoeff] movam0, [r6 + r4] %else -movam0, [tab_LumaCoeff + r4] +movam0, [h_tab_LumaCoeff + r4] %endif %ifidn %3, pp movam1, [INTERP_OFFSET_PP] @@ -936,10 +936,10 @@ shl r4d, 4 %ifdef PIC -lea r6, [tab_LumaCoeff] +lea r6, [h_tab_LumaCoeff] movam0, [r6 + r4] %else -movam0, [tab_LumaCoeff + r4] +movam0, [h_tab_LumaCoeff + r4] %endif %ifidn %3, pp @@ -1132,10 +1132,10 @@ shl r4d, 4 %ifdef PIC -lea r6, [tab_LumaCoeff] +lea r6, [h_tab_LumaCoeff] movam0, [r6 + r4] %else -movam0, [tab_LumaCoeff + r4] +movam0, [h_tab_LumaCoeff + r4] %endif %ifidn %3, pp movam1, [pd_32] @@ -1307,12 +1307,12 @@ mov r4d, r4m shl r4d, 
4 %ifdef PIC -lea r5, [tab_LumaCoeff] +lea r5, [h_tab_LumaCoeff] vpbroadcastq m0, [r5 + r4] vpbroadcastq m1, [r5 + r4 + 8] %else -vpbroadcastq m0, [tab_LumaCoeff + r4] -vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +vpbroadcastq m0, [h_tab_LumaCoeff + r4] +vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8] %endif lea r6, [pw_pixel_max] mova m3, [interp8_hpp_shuf] @@ -1385,11 +1385,11 @@ mov r4d, r4m shl r4d, 4 %ifdef PIC -lea r5, [tab_LumaCoeff] +lea r5, [h_tab_LumaCoeff] vpbroadcastq m0, [r5 + r4] vpbroadcastq m1, [r5 + r4 + 8] %else -vpbroadcastq m0, [tab_LumaCoeff + r4] +vpbroadcastq m0, [h_tab_LumaCoeff + r4] vpbroadcastq m1, [h_ab_LumaCoeff + r4 + 8] %endif mova m3, [interp8_hpp_shuf] @@ -1481,12 +1481,12 @@ mov r4d, r4m shl r4d, 4 %ifdef PIC -lea r5, [tab_LumaCoeff] +lea r5, [h_tab_LumaCoeff] vpbroadcastq m0, [r5 + r4] vpbroadcastq m1, [r5 + r4 + 8] %else -vpbroadcastq m0, [tab_LumaCoeff + r4] -vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +vpbroadcastq m0, [h_tab_LumaCoeff + r4] +vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8] %endif mova m3, [interp8_hpp_shuf] mova m7, [pd_32] @@ -1579,12 +1579,12 @@ mov r4d, r4m shl r4d, 4 %ifdef PIC -lea r5, [tab_LumaCoeff] +lea r5, [h_tab_LumaCoeff] vpbroadcastq m0, [r5 + r4] vpbroadcastq m1, [r5 + r4 + 8] %else -vpbroadcastq m0, [tab_LumaCoeff + r4] -vpbroadcastq m1, [tab_LumaCoeff + r4 + 8] +vpbroadcastq m0, [h_tab_LumaCoeff + r4] +vpbroadcastq m1, [h_tab_LumaCoeff + r4 + 8] %endif mova m3, [interp8_hpp_shuf] mova m7, [pd_32] @@ -1684,12 +1684,12 @@ mov r4d, r4m shl r4d, 4 %ifdef PIC -lea r5, [tab_LumaCoeff] +lea r5,
[x265] [PATCH 099 of 307] x86: AVX512 interp_4tap_horiz_pp_48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1503911788 -19800 # Mon Aug 28 14:46:28 2017 +0530 # Node ID a7bf0a24cfc8eb8edc95d340b240b91d03dac5bd # Parent 45e4dd746cfd9380dbe2344a5754a6ff6e9feed5 x86: AVX512 interp_4tap_horiz_pp_48x64 AVX2 performance: 17.53x AVX512 performance : 33.60x diff -r 45e4dd746cfd -r a7bf0a24cfc8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 28 13:46:50 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 28 14:46:28 2017 +0530 @@ -4094,6 +4094,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512); //i422 chroma_hpp p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); diff -r 45e4dd746cfd -r a7bf0a24cfc8 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Aug 28 13:46:50 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Aug 28 14:46:28 2017 +0530 @@ -9949,6 +9949,103 @@ vextracti32x4 [r2 + r7], m5, 3 %endmacro +%macro PROCESS_IPFILTER_CHROMA_PP_48x4_AVX512 0 +; register map +; m0 - interpolate coeff +; m1, m2 - shuffle order table +; m3 - constant word 1 +; m4 - constant word 512 +movu ym5, [r0] +vinserti32x8 m5, [r0 + r1], 1 +movu ym7, [r0 + 4] +vinserti32x8 m7, [r0 + r1 + 4], 1 + +pshufb m6, m5, m2 +pshufb m5, m1 +pshufb m8, m7, m2 +pshufb m7, m1 + +pmaddubsw m5, m0 +pmaddubsw m7, m0 +pmaddwdm5, m3 +pmaddwdm7, m3 + +pmaddubsw m6, m0 +pmaddubsw m8, m0 +pmaddwdm6, m3 +pmaddwdm8, m3 + +packssdw m5, m7 +packssdw m6, m8 +pmulhrsw m5, m4 +pmulhrsw m6, m4 +packuswb m5, m6 +movu [r2], ym5 +vextracti32x8[r2 + r3], m5,1 + +movu ym5, [r0 + 2 * r1] +vinserti32x8 m5, [r0 + r6], 1 +movu ym7, [r0 + 2 * r1 + 4] 
+vinserti32x8 m7, [r0 + r6 + 4], 1 + +pshufb m6, m5, m2 +pshufb m5, m1 +pshufb m8, m7, m2 +pshufb m7, m1 + +pmaddubsw m5, m0 +pmaddubsw m7, m0 +pmaddwdm5, m3 +pmaddwdm7, m3 + +pmaddubsw m6, m0 +pmaddubsw m8, m0 +pmaddwdm6, m3 +pmaddwdm8, m3 + +packssdw m5, m7 +packssdw m6, m8 +pmulhrsw m5, m4 +pmulhrsw m6, m4 +packuswb m5, m6 +movu [r2 + 2 * r3], ym5 +vextracti32x8[r2 + r7], m5,1 + +movu xm5, [r0 + mmsize/2] +vinserti32x4 m5, [r0 + r1 + mmsize/2],1 +vinserti32x4 m5, [r0 + 2 * r1 + mmsize/2],2 +vinserti32x4 m5, [r0 + r6 + mmsize/2],3 +pshufb m6, m5, m2 +pshufb m5, m1 + +movu xm7, [r0 + 36] +vinserti32x4 m7, [r0 + r1 + 36],1 +vinserti32x4 m7, [r0 + 2 * r1 + 36],2 +vinserti32x4 m7, [r0 + r6 + 36],3 +pshufb m8, m7, m2 +pshufb m7, m1 + +pmaddubsw m5, m0 +pmaddubsw m7, m0 +pmaddwdm5, m3 +pmaddwdm7, m3 + +pmaddubsw m6, m0 +pmaddubsw m8, m0 +pmaddwdm6, m3 +pmaddwdm8, m3 + +packssdw m5, m7 +packssdw m6, m8 +pmulhrsw m5, m4 +pmulhrsw m6, m4 +packuswb m5, m6 +movu [r2 + mmsize/2], xm5 +vextracti32x4 [r2 + r3 +
[x265] [PATCH 112 of 307] x86: Aligned routine implementation for addavg primitive
# HG changeset patch # User Jayashri Murugan# Date 1506512312 -19800 # Wed Sep 27 17:08:32 2017 +0530 # Node ID 762682acf5c25bdecbfec2d0f4f32da7dea3a9e2 # Parent b31fc8889e0f8a433be25fb6267552f7d03efeaf x86: Aligned routine implementation for addavg primitive diff -r b31fc8889e0f -r 762682acf5c2 source/common/pixel.cpp --- a/source/common/pixel.cpp Mon Sep 25 13:11:24 2017 +0530 +++ b/source/common/pixel.cpp Wed Sep 27 17:08:32 2017 +0530 @@ -987,6 +987,7 @@ #define LUMA_PU(W, H) \ p.pu[LUMA_ ## W ## x ## H].copy_pp = blockcopy_pp_c ; \ p.pu[LUMA_ ## W ## x ## H].addAvg = addAvg ; \ +p.pu[LUMA_ ## W ## x ## H].addAvg_aligned = addAvg ; \ p.pu[LUMA_ ## W ## x ## H].sad = sad ; \ p.pu[LUMA_ ## W ## x ## H].sad_x3 = sad_x3 ; \ p.pu[LUMA_ ## W ## x ## H].sad_x4 = sad_x4 ; \ @@ -1103,6 +1104,7 @@ #define CHROMA_PU_420(W, H) \ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg = addAvg ; \ +p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].addAvg_aligned = addAvg ; \ p.chroma[X265_CSP_I420].pu[CHROMA_420_ ## W ## x ## H].copy_pp = blockcopy_pp_c ; \ CHROMA_PU_420(2, 2); @@ -1180,6 +1182,7 @@ #define CHROMA_PU_422(W, H) \ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg = addAvg ; \ +p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].addAvg_aligned = addAvg ; \ p.chroma[X265_CSP_I422].pu[CHROMA_422_ ## W ## x ## H].copy_pp = blockcopy_pp_c ; \ CHROMA_PU_422(2, 4); diff -r b31fc8889e0f -r 762682acf5c2 source/common/primitives.cpp --- a/source/common/primitives.cpp Mon Sep 25 13:11:24 2017 +0530 +++ b/source/common/primitives.cpp Wed Sep 27 17:08:32 2017 +0530 @@ -115,6 +115,7 @@ { p.chroma[X265_CSP_I444].pu[i].copy_pp = p.pu[i].copy_pp; p.chroma[X265_CSP_I444].pu[i].addAvg = p.pu[i].addAvg; +p.chroma[X265_CSP_I444].pu[i].addAvg_aligned = p.pu[i].addAvg_aligned; p.chroma[X265_CSP_I444].pu[i].satd= p.pu[i].satd; p.chroma[X265_CSP_I444].pu[i].p2s = p.pu[i].convert_p2s; } diff -r b31fc8889e0f -r 762682acf5c2 source/common/primitives.h --- 
a/source/common/primitives.hMon Sep 25 13:11:24 2017 +0530 +++ b/source/common/primitives.hWed Sep 27 17:08:32 2017 +0530 @@ -245,6 +245,7 @@ pixelavg_pp_t pixelavg_pp; // quick bidir using pixels (borrowed from x264) addAvg_t addAvg; // bidir motion compensation, uses 16bit values +addAvg_t addAvg_aligned; copy_pp_t copy_pp; filter_p2s_t convert_p2s; @@ -386,6 +387,7 @@ filter_pp_t filter_hpp; filter_hps_t filter_hps; addAvg_t addAvg; +addAvg_t addAvg_aligned; copy_pp_tcopy_pp; filter_p2s_t p2s; filter_p2s_t p2s_aligned; diff -r b31fc8889e0f -r 762682acf5c2 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Sep 25 13:11:24 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Sep 27 17:08:32 2017 +0530 @@ -2510,6 +2510,65 @@ p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp = PFX(interp_4tap_horiz_pp_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512); +p.pu[LUMA_8x4].addAvg_aligned = PFX(addAvg_8x4_avx2); +p.pu[LUMA_8x8].addAvg_aligned = PFX(addAvg_8x8_avx2); +p.pu[LUMA_8x16].addAvg_aligned = PFX(addAvg_8x16_avx2); +p.pu[LUMA_8x32].addAvg_aligned = PFX(addAvg_8x32_avx2); +p.pu[LUMA_12x16].addAvg_aligned = PFX(addAvg_12x16_avx2); +p.pu[LUMA_16x4].addAvg_aligned = PFX(addAvg_aligned_16x4_avx512); +p.pu[LUMA_16x8].addAvg_aligned = PFX(addAvg_aligned_16x8_avx512); +p.pu[LUMA_16x12].addAvg_aligned = PFX(addAvg_aligned_16x12_avx512); +p.pu[LUMA_16x16].addAvg_aligned = PFX(addAvg_aligned_16x16_avx512); +p.pu[LUMA_16x32].addAvg_aligned = PFX(addAvg_aligned_16x32_avx512); +p.pu[LUMA_16x64].addAvg_aligned = PFX(addAvg_aligned_16x64_avx512); +p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_aligned_48x64_avx512); +p.pu[LUMA_24x32].addAvg_aligned = PFX(addAvg_24x32_avx2); +p.pu[LUMA_32x8].addAvg_aligned = PFX(addAvg_aligned_32x8_avx512); +p.pu[LUMA_32x16].addAvg_aligned = PFX(addAvg_aligned_32x16_avx512); +p.pu[LUMA_32x24].addAvg_aligned = PFX(addAvg_aligned_32x24_avx512); 
+p.pu[LUMA_32x32].addAvg_aligned = PFX(addAvg_aligned_32x32_avx512); +p.pu[LUMA_32x64].addAvg_aligned = PFX(addAvg_aligned_32x64_avx512); +p.pu[LUMA_48x64].addAvg_aligned = PFX(addAvg_48x64_avx2); +p.pu[LUMA_64x16].addAvg_aligned = PFX(addAvg_aligned_64x16_avx512); +p.pu[LUMA_64x32].addAvg_aligned =
[x265] [PATCH 088 of 307] x86: AVX512 interp_8tap_horiz_pp_64xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502452096 -19800 # Fri Aug 11 17:18:16 2017 +0530 # Node ID 354f848c3793b459c005667cdf7158eb6394eb0f # Parent 2fa52ac34d8a8248d183fccfc78393c45a5f0839 x86: AVX512 interp_8tap_horiz_pp_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 18.05x | 39.92x 64x32 | 18.10x | 40.28x 64x48 | 18.16x | 40.02x 64x64 | 18.03x | 40.43x diff -r 2fa52ac34d8a -r 354f848c3793 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 15 11:24:19 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 11 17:18:16 2017 +0530 @@ -4052,6 +4052,10 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); +p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512); +p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512); +p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512); +p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512); } #endif diff -r 2fa52ac34d8a -r 354f848c3793 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Tue Aug 15 11:24:19 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Aug 11 17:18:16 2017 +0530 @@ -147,8 +147,8 @@ const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 const interp4_horiz_shuf_load1_avx512, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 - const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 @@ -10130,3 +10130,90 @@ ;- ;ipfilter_chroma_avx512 code end ;- +;- +;ipfilter_luma_avx512 code start +;- 
+%macro PROCESS_IPFILTER_LUMA_PP_64x1_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3, m4 shuffle order table +; m5 - pw_1 +; m6 - pw_512 + +movu m7,[r0] +movu m9,[r0 + 8] + +pshufbm8,m7,m3 +pshufbm7,m2 +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu [r2], m7 +%endmacro + +%macro IPFILTER_LUMA_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_8tap_horiz_pp_64x%1, 4,6,13 +sub r0,3 +mov r4d, r4m +%ifdef PIC +lea r5,[tab_LumaCoeff] +vpbroadcastd m0,[r5 + r4 * 8] +vpbroadcastd m1,[r5 + r4 * 8 + 4] +%else +vpbroadcastd m0,[tab_LumaCoeff + r4 * 8] +vpbroadcastd m1,[tab_LumaCoeff + r4 * 8 + 4] +%endif +vbroadcasti32x8 m2,[interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8 m3,[interp4_horiz_shuf_load3_avx512] +vbroadcasti32x8 m4,[interp4_horiz_shuf_load2_avx512] +vpbroadcastd m5,[pw_1] +vbroadcasti32x8 m6,[pw_512] + +%rep %1-1 +PROCESS_IPFILTER_LUMA_PP_64x1_AVX512 +lea r0,
[x265] [PATCH 087 of 307] x86: AVX512 cleanup addAvg low bit depth code
# HG changeset patch # User Vignesh Vijayakumar # Date 1502776459 -19800 # Tue Aug 15 11:24:19 2017 +0530 # Node ID 2fa52ac34d8a8248d183fccfc78393c45a5f0839 # Parent 2db192bac0f14d55f7f82b8964d6c67c3a3637c3 x86: AVX512 cleanup addAvg low bit depth code diff -r 2db192bac0f1 -r 2fa52ac34d8a source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmTue Aug 15 10:32:52 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 15 11:24:19 2017 +0530 @@ -46,13 +46,10 @@ %error Unsupport bit depth! %endif -SECTION_RODATA 32 - -ch_shuf: times 2 db 0,2,2,4,4,6,6,8,1,3,3,5,5,7,7,9 -ch_shuf_adj: times 8 db 0 - times 8 db 2 - times 8 db 4 - times 8 db 6 +SECTION_RODATA 64 + +ALIGN 64 +const shuf_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7 SECTION .text @@ -3289,8 +3286,9 @@ %macro PROCESS_ADDAVG_64x2_AVX512 0 movum0, [r0] movum1, [r1] -movum2, [r0 + 64] -movum3, [r1 + 64] +movum2, [r0 + mmsize] +movum3, [r1 + mmsize] + paddw m0, m1 pmulhrswm0, m4 paddw m0, m5 @@ -3299,14 +3297,14 @@ paddw m2, m5 packuswbm0, m2 -vpermq m0, m0, 11011000b -vshufi64x2 m0, m0, 11011000b +vpermq m0, m6, m0 movu[r2], m0 movum0, [r0 + r3] movum1, [r1 + r4] -movum2, [r0 + r3 + 64] -movum3, [r1 + r4 + 64] +movum2, [r0 + r3 + mmsize] +movum3, [r1 + r4 + mmsize] + paddw m0, m1 pmulhrswm0, m4 paddw m0, m5 @@ -3315,8 +3313,7 @@ paddw m2, m5 packuswbm0, m2 -vpermq m0, m0, 11011000b -vshufi64x2 m0, m0, 11011000b +vpermq m0, m6, m0 movu[r2 + r5], m0 %endmacro @@ -3325,9 +3322,11 @@ ; %macro ADDAVG_W64_AVX512 1 INIT_ZMM avx512 -cglobal addAvg_64x%1, 6,6,6 +cglobal addAvg_64x%1, 6,6,7 vbroadcasti32x8 m4, [pw_256] vbroadcasti32x8 m5, [pw_128] +movam6, [shuf_avx512] + add r3, r3 add r4, r4 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 092 of 307] x86: AVX512 interp_4tap_horiz_pp_16xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502628508 -19800 # Sun Aug 13 18:18:28 2017 +0530 # Node ID ed1932a414bf5962bbeccfd5c9e208b7db90f77f # Parent dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd x86: AVX512 interp_4tap_horiz_pp_16xN Color Space i444 Size | AVX2 performance | AVX512 performance -- 16x4 | 12.87x | 20.91x 16x8 | 18.03x | 27.40x 16x12 | 16.95x | 24.97x 16x16 | 18.82x | 27.13x 16x32 | 16.21x | 25.76x 16x64 | 17.41x | 26.04x diff -r dbfcd0ee40e9 -r ed1932a414bf source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Sep 01 10:33:48 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Sun Aug 13 18:18:28 2017 +0530 @@ -4021,14 +4021,30 @@ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512); //i422 chroma_hpp +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_hpp = PFX(interp_4tap_horiz_pp_16x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = 
PFX(interp_4tap_horiz_pp_16x24_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); //i420 chroma_hpp +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_hpp = PFX(interp_4tap_horiz_pp_16x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); diff -r dbfcd0ee40e9 -r ed1932a414bf source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Sep 01 10:33:48 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Sun Aug 13 18:18:28 2017 +0530 @@ -9907,6 +9907,48 @@ vextracti32x8[r2 + r3], m5,1 %endmacro +%macro PROCESS_IPFILTER_CHROMA_PP_16x4_AVX512 0 +; register map +; m0 - interpolate coeff +; m1, m2 - shuffle order table +; m3 - constant word 1 +; m4 - constant word 512 + +movu xm5, [r0] +vinserti32x4 m5, [r0 + r1],1 +vinserti32x4 m5, [r0 + 2 * r1],2 +vinserti32x4 m5, [r0 + r6],3 +pshufb m6, m5, m2 +pshufb m5, m1 + +movu xm7, [r0 + 4] +vinserti32x4 m7, [r0 + r1 + 4],1 +vinserti32x4 m7, [r0 + 2 * r1 + 4],2 +vinserti32x4 m7, [r0 + r6 + 4],3 +pshufb m8, m7, m2 +pshufb m7, m1 + +pmaddubsw 
m5, m0 +pmaddubsw m7, m0 +pmaddwdm5, m3 +pmaddwdm7, m3 + +pmaddubsw
[x265] [PATCH 090 of 307] x86: AVX512 interp_8tap_horiz_pp_16xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502617345 -19800 # Sun Aug 13 15:12:25 2017 +0530 # Node ID d9200885420957bccd4edea62bf87bbe8831bc62 # Parent 4be3c35eb7510f269a548f248e4f5904b4107d74 x86: AVX512 interp_8tap_horiz_pp_16xN Size | AVX2 performance | AVX512 performance -- 16x4 | 19.10x | 26.27x 16x8 | 19.37x | 26.59x 16x12 | 19.99x | 32.66x 16x16 | 19.13x | 31.47x 16x32 | 18.94x | 33.38x 16x64 | 18.07x | 29.97x diff -r 4be3c35eb751 -r d92008854209 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Sep 01 10:24:43 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Sun Aug 13 15:12:25 2017 +0530 @@ -4053,6 +4053,12 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); +p.pu[LUMA_16x4].luma_hpp = PFX(interp_8tap_horiz_pp_16x4_avx512); +p.pu[LUMA_16x8].luma_hpp = PFX(interp_8tap_horiz_pp_16x8_avx512); +p.pu[LUMA_16x12].luma_hpp = PFX(interp_8tap_horiz_pp_16x12_avx512); +p.pu[LUMA_16x16].luma_hpp = PFX(interp_8tap_horiz_pp_16x16_avx512); +p.pu[LUMA_16x32].luma_hpp = PFX(interp_8tap_horiz_pp_16x32_avx512); +p.pu[LUMA_16x64].luma_hpp = PFX(interp_8tap_horiz_pp_16x64_avx512); p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512); p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512); p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512); diff -r 4be3c35eb751 -r d92008854209 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Sep 01 10:24:43 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Sun Aug 13 15:12:25 2017 +0530 @@ -10233,6 +10233,65 @@ vextracti32x8 [r2 + r3], m7, 1 %endmacro +%macro PROCESS_IPFILTER_LUMA_PP_16x4_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3, m4 shuffle order table +; m5 - pw_1 +; m6 - pw_512 + +movu xm7,[r0] +vinserti32x4 m7,[r0 + r1], 1 +vinserti32x4 m7,[r0 + 2 * r1], 2 
+vinserti32x4 m7,[r0 + r6], 3 + +pshufbm8,m7,m3 +pshufbm7,m2 + +movu xm9,[r0 + 8] +vinserti32x4 m9,[r0 + r1 + 8], 1 +vinserti32x4 m9,[r0 + 2 * r1 + 8], 2 +vinserti32x4 m9,[r0 + r6 + 8], 3 + +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu [r2], xm7 +vextracti32x4 [r2 + r3], m7,1 +vextracti32x4 [r2 + 2 * r3], m7,2 +vextracti32x4 [r2 + r7], m7,3 +%endmacro + %macro IPFILTER_LUMA_64xN_AVX512 1 INIT_ZMM avx512 cglobal interp_8tap_horiz_pp_64x%1, 4,6,13 @@ -10299,6 +10358,43 @@ IPFILTER_LUMA_32xN_AVX512 24 IPFILTER_LUMA_32xN_AVX512 32 IPFILTER_LUMA_32xN_AVX512 64 + +%macro IPFILTER_LUMA_16xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_8tap_horiz_pp_16x%1, 4,8,14 +sub r0,3 +mov r4d, r4m +lea r6,[3 * r1] +lea r7,[3 * r3] +%ifdef PIC +lea r5,[tab_LumaCoeff] +vpbroadcastd m0,[r5 + r4 * 8] +vpbroadcastd m1,[r5 + r4 * 8 + 4] +%else +vpbroadcastd m0,[tab_LumaCoeff + r4 * 8] +vpbroadcastd m1,[tab_LumaCoeff + r4 * 8 + 4] +%endif +vbroadcasti32x8 m2,[interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8 m3,[interp4_horiz_shuf_load3_avx512] +vbroadcasti32x8 m4,[interp4_horiz_shuf_load2_avx512] +
[x265] [PATCH 089 of 307] x86: AVX512 interp_8tap_horiz_pp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1504241683 -19800 # Fri Sep 01 10:24:43 2017 +0530 # Node ID 4be3c35eb7510f269a548f248e4f5904b4107d74 # Parent 354f848c3793b459c005667cdf7158eb6394eb0f x86: AVX512 interp_8tap_horiz_pp_32xN Size | AVX2 performance | AVX512 performance -- 32x8 | 18.92x | 37.84x 32x16 | 17.46x | 36.15x 32x24 | 17.77x | 35.98x 32x32 | 17.91x | 36.69x 32x64 | 18.10x | 35.47x diff -r 354f848c3793 -r 4be3c35eb751 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 11 17:18:16 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Sep 01 10:24:43 2017 +0530 @@ -4052,6 +4052,12 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); + +p.pu[LUMA_32x8].luma_hpp = PFX(interp_8tap_horiz_pp_32x8_avx512); +p.pu[LUMA_32x16].luma_hpp = PFX(interp_8tap_horiz_pp_32x16_avx512); +p.pu[LUMA_32x24].luma_hpp = PFX(interp_8tap_horiz_pp_32x24_avx512); +p.pu[LUMA_32x32].luma_hpp = PFX(interp_8tap_horiz_pp_32x32_avx512); +p.pu[LUMA_32x64].luma_hpp = PFX(interp_8tap_horiz_pp_32x64_avx512); p.pu[LUMA_64x16].luma_hpp = PFX(interp_8tap_horiz_pp_64x16_avx512); p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512); p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512); diff -r 354f848c3793 -r 4be3c35eb751 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Aug 11 17:18:16 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Sep 01 10:24:43 2017 +0530 @@ -10182,6 +10182,57 @@ movu [r2], m7 %endmacro +%macro PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3, m4 shuffle order table +; m5 - pw_1 +; m6 - pw_512 + +movu ym7,[r0] +vinserti32x8 m7,[r0 + r1], 1 +movu ym9,[r0 + 8] +vinserti32x8 m9,[r0 + 
r1 + 8], 1 + +pshufbm8,m7,m3 +pshufbm7,m2 +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu [r2], ym7 +vextracti32x8 [r2 + r3], m7, 1 +%endmacro + %macro IPFILTER_LUMA_64xN_AVX512 1 INIT_ZMM avx512 cglobal interp_8tap_horiz_pp_64x%1, 4,6,13 @@ -10214,6 +10265,40 @@ IPFILTER_LUMA_64xN_AVX512 32 IPFILTER_LUMA_64xN_AVX512 48 IPFILTER_LUMA_64xN_AVX512 64 + +%macro IPFILTER_LUMA_32xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_8tap_horiz_pp_32x%1, 4,6,13 +sub r0,3 +mov r4d, r4m +%ifdef PIC +lea r5,[tab_LumaCoeff] +vpbroadcastd m0,[r5 + r4 * 8] +vpbroadcastd m1,[r5 + r4 * 8 + 4] +%else +vpbroadcastd m0,[tab_LumaCoeff + r4 * 8] +vpbroadcastd m1,[tab_LumaCoeff + r4 * 8 + 4] +%endif +vbroadcasti32x8 m2,[interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8 m3,[interp4_horiz_shuf_load3_avx512] +vbroadcasti32x8 m4,[interp4_horiz_shuf_load2_avx512] +vpbroadcastd m5,[pw_1] +vbroadcasti32x8 m6,[pw_512] + +%rep %1/2 -1 +PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 +lea r0,[r0 + 2 * r1] +lea r2,[r2 + 2 * r3] +%endrep +PROCESS_IPFILTER_LUMA_PP_32x2_AVX512 +RET +%endmacro + +IPFILTER_LUMA_32xN_AVX512 8 +IPFILTER_LUMA_32xN_AVX512 16 +IPFILTER_LUMA_32xN_AVX512 24 +IPFILTER_LUMA_32xN_AVX512 32
[x265] [PATCH 096 of 307] x86: AVX512 copy_cnt_32 and copy_cnt_16
# HG changeset patch # User Vignesh Vijayakumar # Date 1503557407 -19800 # Thu Aug 24 12:20:07 2017 +0530 # Node ID 0355f0128b7d713c4a21c91d3cc5bed1e8b43c47 # Parent 31a180bcef33fae436ad7e3aa4378b283a86d56a x86: AVX512 copy_cnt_32 and copy_cnt_16 Size | BitDepth | AVX2 performance | AVX512 performance --- 16x16|8 | 6.92x| 8.07x 16x16|10| 6.72x| 7.75x 32x32|8 | 6.08x| 10.33x 32x32|10| 6.04x| 10.16x diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Aug 23 10:06:01 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 24 12:20:07 2017 +0530 @@ -2342,6 +2342,9 @@ p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); p.weight_pp = PFX(weight_pp_avx512); +p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512); +p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512); + } } #else // if HIGH_BIT_DEPTH @@ -4054,6 +4057,9 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); +p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512); +p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512); + //i444 chroma_hpp p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); diff -r 31a180bcef33 -r 0355f0128b7d source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Wed Aug 23 10:06:01 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Thu Aug 24 12:20:07 2017 +0530 @@ -5958,7 +5958,91 @@ movd eax, xm4 RET - +;-- +; copy_cnt avx512 code start +;-- +%macro PROCESS_COPY_CNT_32x4_AVX512 0 +movum0, [r1] +movum1, [r1 + r2] +movu[r0],m0 +movu[r0 + mmsize], m1 +packsswbm0, m1 +pminub m0, m3 + +movum1, [r1 + 2 * r2] +movum2, [r1 + r3] +movu[r0 + 2 * mmsize], m1 +movu[r0 + 3 * mmsize], m2 +packsswbm1, m2 +pminub m1, m3 + +paddb m0, m1 +paddb m4, m0 +%endmacro + +%macro PROCESS_COPY_CNT_16x4_AVX512 0 
+movu ym0, [r1] +vinserti32x8 m0, [r1 + r2],1 +movu ym1, [r1 + 2 * r2] +vinserti32x8 m1, [r1 + r3],1 +movu [r0], m0 +movu [r0 + mmsize], m1 +packsswb m0, m1 +pminub m0, m3 +paddb m4, m0 +%endmacro + +%macro PROCESS_COPY_CNT_END_AVX512 0 +pxor m0, m0 +vextracti32x8 ym1, m4, 1 +paddb ym4, ym1 +vextracti32x4 xm1, ym4, 1 +paddb xm4, xm1 +psadbw xm4, xm0 +movhlpsxm1, xm4 +paddd xm4, xm1 +movd eax, xm4 +%endmacro + +;-- +; uint32_t copy_cnt(int32_t* dst, const int16_t* src, intptr_t stride); +;-- +INIT_ZMM avx512 +cglobal copy_cnt_32, 3, 4, 5 +add r2d, r2d +lea r3, [3 * r2] + +vbroadcasti32x8 m3, [pb_1] +pxor m4, m4 + +%rep 7 +PROCESS_COPY_CNT_32x4_AVX512 +add r0, 4 * mmsize +lea r1, [r1 + 4 * r2] +%endrep +PROCESS_COPY_CNT_32x4_AVX512 +PROCESS_COPY_CNT_END_AVX512 +RET + +INIT_ZMM avx512 +cglobal copy_cnt_16, 3, 4, 5 +add r2d, r2d +lea r3, [3 * r2] + +vbroadcasti32x8 m3, [pb_1] +pxor m4, m4 + +%rep 3 +PROCESS_COPY_CNT_16x4_AVX512 +add r0, 2 * mmsize +lea r1, [r1 + 4 * r2] +%endrep +PROCESS_COPY_CNT_16x4_AVX512 +PROCESS_COPY_CNT_END_AVX512 +RET +;-- +; copy_cnt avx512 code end +;-- ;-- ; void cpy2Dto1D_shl(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
[x265] [PATCH 106 of 307] x86: AVX512 interp_4tap_horiz_pp_8xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1504171458 -19800 # Thu Aug 31 14:54:18 2017 +0530 # Node ID 1fb1948309a0a9218a07e060300b9d5a7ff58321 # Parent 9928b3e5b4d4235bea9ffb22434446e68c3aacdb x86: AVX512 interp_4tap_horiz_pp_8xN for high bit depth Color Space i444 Size | AVX2 performance | AVX512 performance -- 8x4 | 5.14x| 9.51x 8x8 | 6.20x| 12.75x 8x16 | 6.32x| 12.44x 8x32 | 6.01x| 13.68x diff -r 9928b3e5b4d4 -r 1fb1948309a0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 31 14:24:24 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 31 14:54:18 2017 +0530 @@ -2354,6 +2354,10 @@ p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_avx512); p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); @@ -2364,6 +2368,12 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].filter_hpp = PFX(interp_4tap_horiz_pp_8x12_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_hpp = 
PFX(interp_4tap_horiz_pp_8x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_hpp = PFX(interp_4tap_horiz_pp_8x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_hpp = PFX(interp_4tap_horiz_pp_16x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_hpp = PFX(interp_4tap_horiz_pp_16x24_avx512); @@ -2374,6 +2384,10 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hpp = PFX(interp_4tap_horiz_pp_8x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hpp = PFX(interp_4tap_horiz_pp_8x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_hpp = PFX(interp_4tap_horiz_pp_16x4_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_hpp = PFX(interp_4tap_horiz_pp_16x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_hpp = PFX(interp_4tap_horiz_pp_16x12_avx512); diff -r 9928b3e5b4d4 -r 1fb1948309a0 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Thu Aug 31 14:24:24 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Thu Aug 31 14:54:18 2017 +0530 @@ -5082,6 +5082,49 @@ ;- ;ipfilter_chroma_avx512 code start ;- +%macro PROCESS_IPFILTER_CHROMA_PP_8x4_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3 shuffle order table +; m4 - pd_32 +; m5 - zero +; m6 - pw_pixel_max + +movuxm7, [r0] +vinserti32x4m7,[r0 + r1], 1 +vinserti32x4m7,[r0 + 2 * r1], 2 +vinserti32x4m7,[r0 + r6], 3 + +pshufb m9,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m9,m1 +paddd 
m7,m9 +paddd m7,m4 +psrad m7,6 + +movuxm8, [r0 + 8] +
[x265] [PATCH 100 of 307] x86: AVX512 interp_8tap_horiz_pp_48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1503912578 -19800 # Mon Aug 28 14:59:38 2017 +0530 # Node ID 562c00d2153193eec85ab907b60eeb5aca7cc609 # Parent a7bf0a24cfc8eb8edc95d340b240b91d03dac5bd x86: AVX512 interp_8tap_horiz_pp_48x64 AVX2 performance: 19.57x AVX512 performance : 35.25x diff -r a7bf0a24cfc8 -r 562c00d21531 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 28 14:46:28 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 28 14:59:38 2017 +0530 @@ -4159,6 +4159,7 @@ p.pu[LUMA_64x32].luma_hpp = PFX(interp_8tap_horiz_pp_64x32_avx512); p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512); p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512); +p.pu[LUMA_48x64].luma_hpp = PFX(interp_8tap_horiz_pp_48x64_avx512); p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512); p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512); diff -r a7bf0a24cfc8 -r 562c00d21531 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Aug 28 14:46:28 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Aug 28 14:59:38 2017 +0530 @@ -10489,6 +10489,151 @@ vextracti32x4 [r2 + r7], m7,3 %endmacro +%macro PROCESS_IPFILTER_LUMA_PP_48x4_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3, m4 shuffle order table +; m5 - pw_1 +; m6 - pw_512 + +movu ym7,[r0] +vinserti32x8 m7,[r0 + r1], 1 +movu ym9,[r0 + 8] +vinserti32x8 m9,[r0 + r1 + 8], 1 + +pshufbm8,m7,m3 +pshufbm7,m2 +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu 
[r2], ym7 +vextracti32x8 [r2 + r3], m7, 1 + +movu ym7,[r0 + 2 * r1] +vinserti32x8 m7,[r0 + r6], 1 +movu ym9,[r0 + 2 * r1 + 8] +vinserti32x8 m9,[r0 + r6 + 8], 1 + +pshufbm8,m7,m3 +pshufbm7,m2 +pshufbm10, m9,m3 +pshufbm11, m9,m4 +pshufbm9,m2 + +pmaddubsw m7,m0 +pmaddubsw m12, m8,m1 +pmaddwd m7,m5 +pmaddwd m12, m5 +paddd m7,m12 + +pmaddubsw m8,m0 +pmaddubsw m12, m9,m1 +pmaddwd m8,m5 +pmaddwd m12, m5 +paddd m8,m12 + +pmaddubsw m9,m0 +pmaddubsw m12, m10, m1 +pmaddwd m9,m5 +pmaddwd m12, m5 +paddd m9,m12 + +pmaddubsw m10, m0 +pmaddubsw m12, m11,m1 +pmaddwd m10, m5 +pmaddwd m12, m5 +paddd m10, m12 + +packssdw m7, m8 +packssdw m9, m10 +pmulhrsw m7, m6 +pmulhrsw m9, m6 +packuswb m7, m9 +movu [r2 + 2 * r3], ym7 +vextracti32x8 [r2 + r7], m7,1 + +movu xm7,[r0 + mmsize/2] +vinserti32x4 m7,[r0 + r1 + mmsize/2], 1 +vinserti32x4 m7,[r0 + 2 * r1 + mmsize/2], 2 +vinserti32x4 m7,[r0 + r6 + mmsize/2], 3 + +pshufbm8,m7,m3 +pshufbm7,m2 + +movu xm9,[r0 + 40] +vinserti32x4 m9,[r0 + r1 + 40], 1 +vinserti32x4 m9,[r0 + 2 * r1 + 40], 2 +vinserti32x4 m9,[r0 + r6 +
[x265] [PATCH 107 of 307] x86: AVX512 interp_4tap_horiz_pp_24xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1504173085 -19800 # Thu Aug 31 15:21:25 2017 +0530 # Node ID c726239a07580fd13c4177f0206d615ee02c5975 # Parent 1fb1948309a0a9218a07e060300b9d5a7ff58321 x86: AVX512 interp_4tap_horiz_pp_24xN for high bit depth i444 24x32 AVX2 performance : 8.85x AVX512 performance : 19.37x diff -r 1fb1948309a0 -r c726239a0758 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 31 14:54:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 31 15:21:25 2017 +0530 @@ -2367,6 +2367,7 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512); @@ -2383,6 +2384,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hpp = PFX(interp_4tap_horiz_pp_24x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x4].filter_hpp = PFX(interp_4tap_horiz_pp_8x4_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_hpp = PFX(interp_4tap_horiz_pp_8x8_avx512); @@ -2404,6 +2406,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hpp 
= PFX(interp_4tap_horiz_pp_48x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hpp = PFX(interp_4tap_horiz_pp_24x32_avx512); } } diff -r 1fb1948309a0 -r c726239a0758 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Thu Aug 31 14:54:18 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Thu Aug 31 15:21:25 2017 +0530 @@ -5161,6 +5161,103 @@ vextracti32x8 [r2 + r3], m7,1 %endmacro +%macro PROCESS_IPFILTER_CHROMA_PP_24x4_AVX512 0 +; register map +; m0 , m1 interpolate coeff +; m2 , m3 shuffle order table +; m4 - pd_32 +; m5 - zero +; m6 - pw_pixel_max + +movuym7, [r0] +vinserti32x8m7,[r0 + r1], 1 +movuym8, [r0 + 8] +vinserti32x8m8,[r0 + r1 + 8], 1 + +pshufb m9,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m9,m1 +paddd m7,m9 +paddd m7,m4 +psrad m7,6 + +pshufb m9,m8,m3 +pshufb m8,m2 +pmaddwd m8,m0 +pmaddwd m9,m1 +paddd m8,m9 +paddd m8,m4 +psrad m8,6 + +packusdwm7,m8 +CLIPW m7,m5,m6 +pshufb m7,m10 +movu[r2], ym7 +vextracti32x8 [r2 + r3], m7,1 + +movuym7, [r0 + 2 * r1] +vinserti32x8m7,[r0 + r6], 1 +movuym8, [r0 + 2 * r1 + 8] +vinserti32x8m8,[r0 + r6 + 8], 1 + +pshufb m9,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m9,m1 +paddd m7,m9 +paddd m7,m4 +psrad m7,6 + +pshufb m9,m8,m3 +pshufb m8,m2 +pmaddwd m8,m0 +pmaddwd m9,m1 +paddd m8,m9 +paddd m8,m4 +psrad m8,6 + +packusdwm7,m8 +CLIPW m7,m5,m6 +pshufb m7,m10 +movu[r2 + 2 * r3],ym7 +vextracti32x8 [r2 + r7], m7,1 + +movuxm7, [r0 + mmsize/2] +vinserti32x4m7,[r0 + r1 + mmsize/2], 1 +vinserti32x4m7,[r0 + 2 * r1 + mmsize/2], 2 +vinserti32x4m7,[r0 + r6 + mmsize/2], 3 + +pshufb m9,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +
[x265] [PATCH 098 of 307] x86: AVX512 pixel_avg_weight_64xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1503908210 -19800 # Mon Aug 28 13:46:50 2017 +0530 # Node ID 45e4dd746cfd9380dbe2344a5754a6ff6e9feed5 # Parent bf199a5eca5be148be8a0c91cd9f2e8e0e908059 x86: AVX512 pixel_avg_weight_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 41.70x | 60.98x 64x32 | 36.75x | 68.91x 64x48 | 37.31x | 59.07x 64x64 | 37.92x | 58.85x diff -r bf199a5eca5b -r 45e4dd746cfd source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 28 11:58:37 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 28 13:46:50 2017 +0530 @@ -4159,6 +4159,11 @@ p.pu[LUMA_64x48].luma_hpp = PFX(interp_8tap_horiz_pp_64x48_avx512); p.pu[LUMA_64x64].luma_hpp = PFX(interp_8tap_horiz_pp_64x64_avx512); +p.pu[LUMA_64x16].pixelavg_pp = PFX(pixel_avg_64x16_avx512); +p.pu[LUMA_64x32].pixelavg_pp = PFX(pixel_avg_64x32_avx512); +p.pu[LUMA_64x48].pixelavg_pp = PFX(pixel_avg_64x48_avx512); +p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512); + } #endif } diff -r bf199a5eca5b -r 45e4dd746cfd source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmMon Aug 28 11:58:37 2017 +0530 +++ b/source/common/x86/mc-a.asmMon Aug 28 13:46:50 2017 +0530 @@ -5020,6 +5020,58 @@ RET %endif +;- +;pixel_avg_pp avx512 code start +;- +%macro PROCESS_PIXELAVG_64x4_AVX512 0 +movum0, [r2] +movum2, [r2 + r3] +movum1, [r4] +movum3, [r4 + r5] +pavgb m0, m1 +pavgb m2, m3 +movu[r0], m0 +movu[r0 + r1], m2 + +movum0, [r2 + 2 * r3] +movum2, [r2 + r7] +movum1, [r4 + 2 * r5] +movum3, [r4 + r8] +pavgb m0, m1 +pavgb m2, m3 +movu[r0 + 2 * r1], m0 +movu[r0 + r6], m2 +%endmacro + +;--- +;void pixelavg_pp(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int) +;--- +%if ARCH_X86_64 && BIT_DEPTH == 8 +%macro PIXEL_AVG_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal pixel_avg_64x%1, 6, 9, 4 +lea r6, [3 * r1] +lea r7, [3 * r3] +lea r8, [3 * r5] + +%rep %1/4 - 1 +PROCESS_PIXELAVG_64x4_AVX512 +lea r2, [r2 + 
r3 * 4] +lea r4, [r4 + r5 * 4] +lea r0, [r0 + r1 * 4] +%endrep +PROCESS_PIXELAVG_64x4_AVX512 +RET +%endmacro + +PIXEL_AVG_64xN_AVX512 16 +PIXEL_AVG_64xN_AVX512 32 +PIXEL_AVG_64xN_AVX512 48 +PIXEL_AVG_64xN_AVX512 64 +%endif +;- +;pixel_avg_pp avx512 code end +;- ;= ; pixel avg2 ;= ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 178 of 307] x86: AVX512 interp_4tap_vert_pp_16xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1510823813 -19800 # Thu Nov 16 14:46:53 2017 +0530 # Node ID 963884afd8f38dbcc8335ff1d3a39385e317d6d4 # Parent 2c24c0aadbe3e76eabde711a94c57aed077b7347 x86: AVX512 interp_4tap_vert_pp_16xN i444 Size | AVX2 performance | AVX512 performance -- 16x8 | 31.15x | 36.85x 16x16 | 29.18x | 41.50x 16x32 | 30.14x | 43.30x 16x64 | 31.79x | 45.30x This patch also optimises coeffIdx load to register for chroma_vpp diff -r 2c24c0aadbe3 -r 963884afd8f3 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 06 17:13:17 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Nov 16 14:46:53 2017 +0530 @@ -4816,20 +4816,33 @@ p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); + +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = 
PFX(interp_4tap_vert_pp_16x24_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512); + +p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512); - p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512); diff -r 2c24c0aadbe3 -r 963884afd8f3 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Nov 06 17:13:17 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Nov 16 14:46:53 2017 +0530 @@ -10797,6 +10797,134 @@ IPFILTER_CHROMA_PS_48xN_AVX512 64 %endif +;- +;avx512 chroma_vpp code start +;- +%macro PROCESS_CHROMA_VERT_PP_16x8_AVX512 0 +movu xm1,[r0] +lea r8, [r0 + 4 * r1] +lea r9, [r8 + 2 * r1] +vinserti32x4 m1, [r0 + 2 * r1],1 +vinserti32x4 m1, 
[r8],2 +vinserti32x4 m1, [r9],3 +movu xm3,[r0 +
[x265] [PATCH 175 of 307] [x265-avx512]x86: AVX512 idct16x16
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1509948596 -19800 # Mon Nov 06 11:39:56 2017 +0530 # Node ID 8bbcc1bd3c1381e936695a6eff30a17cc2633b6f # Parent df3c576cd32c50b0412ad3d70eeebfe8fb511da1 [x265-avx512]x86: AVX512 idct16x16 AVX2 Performance:11.67x AVX512 Performance :12.80x diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 13 16:02:40 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 06 11:39:56 2017 +0530 @@ -2837,6 +2837,8 @@ p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); +p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); @@ -4835,6 +4837,7 @@ p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); +p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); } #endif diff -r df3c576cd32c -r 8bbcc1bd3c13 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asmMon Nov 13 16:02:40 2017 +0530 +++ b/source/common/x86/dct8.asmMon Nov 06 11:39:56 2017 +0530 @@ -218,6 +218,27 @@ idct16_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5 + +tab_AVX512_idct16_1: dw 90, 87, 80, 70, 57, 43, 25, 9, 90, 87, 80, 70, 57, 43, 25, 9, 80, 9, -70, -87, -25, 57, 90, 43, 80, 9, -70, -87, -25, 57, 90, 43 + dw 87, 57, 9, -43, -80, -90, -70, -25, 87, 57, 9, -43, -80, -90, -70, -25, 70, -43, -87, 9, 90, 25, -80, -57, 70, -43, -87, 9, 90, 25, -80, -57 + dw 57, -80, -25, 90, -9, -87, 43, 70, 57, -80, -25, 90, -9, -87, 43, 70, 25, -70, 90, -80, 43, 9, -57, 87, 25, -70, 90, -80, 43, 9, -57, 87 + dw 43, -90, 57, 25, -87, 70, 9, -80, 43, -90, 57, 25, -87, 70, 9, -80, 9, -25, 43, -57, 70, -80, 87, -90, 9, -25, 43, -57, 70, -80, 87, -90 + +tab_AVX512_idct16_2: dw 64, 89, 83, 75, 64, 50, 36, 18, 64, 89, 83, 75, 64, 50, 36, 18, 64, 50, -36, -89, -64, 18, 83, 75, 64, 50, 
-36, -89, -64, 18, 83, 75 + dw 64, 75, 36, -18, -64, -89, -83, -50, 64, 75, 36, -18, -64, -89, -83, -50, 64, 18, -83, -50, 64, 75, -36, -89, 64, 18, -83, -50, 64, 75, -36, -89 + dw 64, -18, -83, 50, 64, -75, -36, 89, 64, -18, -83, 50, 64, -75, -36, 89, 64, -75, 36, 18, -64, 89, -83, 50, 64, -75, 36, 18, -64, 89, -83, 50 + dw 64, -50, -36, 89, -64, -18, 83, -75, 64, -50, -36, 89, -64, -18, 83, -75, 64, -89, 83, -75, 64, -50, 36, -18, 64, -89, 83, -75, 64, -50, 36, -18 + +idct16_AVX512_shuff: dd 0, 4, 2, 6, 1, 5, 3, 7, 8, 12, 10, 14, 9, 13, 11, 15 + +idct16_AVX512_shuff1: dd 2, 6, 0, 4, 3, 7, 1, 5, 10, 14, 8, 12, 11, 15, 9, 13 + +idct16_AVX512_shuff2: dq 0, 1, 8, 9, 4, 5, 12, 13 +idct16_AVX512_shuff3: dq 2, 3, 10, 11, 6, 7, 14, 15 +idct16_AVX512_shuff4: dq 4, 5, 12, 13, 0, 1, 8, 9 +idct16_AVX512_shuff5: dq 6, 7, 14, 15, 2, 3, 10, 11 +idct16_AVX512_shuff6: times 4 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 + tab_idct32_1: dw 90 ,90 ,88 ,85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4 dw 90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13 dw 88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22 @@ -3671,6 +3692,599 @@ jnz .pass2 RET + +%macro IDCT16_AVX512_PASS1 3 +movum5, [tab_AVX512_idct16_2 + %1 * 64] +pmaddwd m9, m0, m5 +pmaddwd m10, m7, m5 + +vpsrldq m16, m9, 4 +padddm9, m16 +vpslldq m17, m10, 4 +padddm10, m17 +vmovdqu32m9 {k1}, m10 + +pmaddwd m10, m6, m5 +pmaddwd m11, m8, m5 + +vpsrldq m16, m10, 4 +padddm10, m16 +vpslldq m17, m11, 4 +padddm11, m17 +vmovdqu32m10 {k1}, m11 + +vpsrldq m16, m9, 8 +padddm9, m16 +vpslldq m17, m10, 8 +padddm10, m17 +vmovdqu32m9 {k2}, m10 + +movum5, [tab_AVX512_idct16_1 + %1 * 64] +pmaddwd m10, m1, m5 +pmaddwd m11, m3, m5 + +vpsrldq m16, m10, 4 +padddm10, m16 +vpslldq m17, m11, 4 +padddm11, m17 +vmovdqu32m10 {k1}, m11 + +pmaddwd m11, m4, m5 +pmaddwd m12, m2, m5 + +vpsrldq m16, m11, 4 +padddm11, m16 +vpslldq m17, m12, 4 +padddm12, m17 +vmovdqu32m11 {k1}, m12 + +vpsrldq m16, m10, 8 
+padddm10, m16 +vpslldq m17, m11, 8 +
[x265] [PATCH 173 of 307] x86: AVX512 optimise intermediate register load in chroma_vsp, chroma_vss, chroma_vps
# HG changeset patch # User Vignesh Vijayakumar # Date 1522976950 25200 # Thu Apr 05 18:09:10 2018 -0700 # Node ID ab41c6957bc2f359e5df82f9936c3fd00a5d2ea5 # Parent 71f7869fac602953ef5e14c344f10adc374d7bfa x86: AVX512 optimise intermediate register load in chroma_vsp, chroma_vss, chroma_vps diff -r 71f7869fac60 -r ab41c6957bc2 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Wed Nov 15 14:35:17 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Thu Apr 05 18:09:10 2018 -0700 @@ -7527,10 +7527,10 @@ vinserti32x4 m4, [r8 + 4 * r1], 2 vinserti32x4 m4, [r9 + 4 * r1], 3 punpcklwd m6, m5, m4 -pmaddwd m6, [r5 + mmsize] +pmaddwd m6, m9 paddd m2, m6 punpckhwd m5, m4 -pmaddwd m5, [r5 + mmsize] +pmaddwd m5, m9 paddd m3, m5 paddd m0, m7 @@ -7561,7 +7561,7 @@ ;- %if ARCH_X86_64 INIT_ZMM avx512 -cglobal interp_4tap_vert_ps_8x8, 5, 11, 8 +cglobal interp_4tap_vert_ps_8x8, 5, 11, 10 add r1d,r1d add r3d,r3d sub r0, r1 @@ -7576,13 +7576,15 @@ vbroadcasti32x4 m7, [INTERP_OFFSET_PS] lea r10,[3 * r1] lea r7, [3 * r3] +mova m8, [r5] +mova m9, [r5 + mmsize] PROCESS_CHROMA_VERT_PS_8x8_AVX512 RET %endif %macro FILTER_VER_PS_CHROMA_8xN_AVX512 1 INIT_ZMM avx512 -cglobal interp_4tap_vert_ps_8x%1, 5, 11, 8 +cglobal interp_4tap_vert_ps_8x%1, 5, 11, 10 add r1d,r1d add r3d,r3d sub r0, r1 @@ -7597,6 +7599,8 @@ vbroadcasti32x4 m7, [INTERP_OFFSET_PS] lea r10,[3 * r1] lea r7, [3 * r3] +mova m8, [r5] +mova m9, [r5 + mmsize] %rep %1/8 - 1 PROCESS_CHROMA_VERT_PS_8x8_AVX512 lea r0, [r8 + 4 * r1] @@ -7619,33 +7623,33 @@ movu ym3,[r0 + r1] vinserti32x8 m3, [r6 + r1], 1 punpcklwd m0, m1, m3 -pmaddwd m0, [r5] +pmaddwd m0, m8 punpckhwd m1, m3 -pmaddwd m1, [r5] +pmaddwd m1, m8 movu ym4,[r0 + 2 * r1] vinserti32x8 m4, [r6 + 2 * r1], 1 punpcklwd m2, m3, m4 -pmaddwd m2, [r5] +pmaddwd m2, m8 punpckhwd m3, m4 -pmaddwd m3, [r5] +pmaddwd m3, m8 movu ym5,[r0 + r8] vinserti32x8 m5, [r6 + r8], 1 punpcklwd m6, m4, m5 -pmaddwd m6, [r5 + mmsize] +pmaddwd m6, m9 paddd m0, m6 punpckhwd m4, m5 -pmaddwd 
m4, [r5 + mmsize] +pmaddwd m4, m9 paddd m1, m4 movu ym4,[r0 + 4 * r1] vinserti32x8 m4, [r6 + 4 * r1], 1 punpcklwd m6, m5, m4 -pmaddwd m6, [r5 + mmsize] +pmaddwd m6, m9 paddd m2, m6 punpckhwd m5, m4 -pmaddwd m5, [r5 + mmsize] +pmaddwd m5, m9 paddd m3, m5 paddd m0, m7 @@ -7671,7 +7675,7 @@ ;- %if
[x265] [PATCH 164 of 307] Disable all AVX512 kernels with negative IPC gains over AVX2 kernels
# HG changeset patch # User Jayashree # Date 1510736734 -19800 # Wed Nov 15 14:35:34 2017 +0530 # Node ID 7d41838f5d06ad4fbdabd08af99d724fcd599193 # Parent e1dedfae074d765c26efca976538cd06e1ef7cab Disable all avx512 Kernels with negative IPC gains over avx2 Kernels. diff -r e1dedfae074d -r 7d41838f5d06 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Nov 14 02:11:35 2017 -0800 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 15 14:35:34 2017 +0530 @@ -4328,10 +4328,10 @@ if (cpuMask & X265_CPU_AVX512) { p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512); -p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512); + // p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512); p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); p.pu[LUMA_32x32].sad = PFX(pixel_sad_32x32_avx512); -p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); +//p.pu[LUMA_32x64].sad = PFX(pixel_sad_32x64_avx512); p.pu[LUMA_64x16].sad = PFX(pixel_sad_64x16_avx512); p.pu[LUMA_64x32].sad = PFX(pixel_sad_64x32_avx512); p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); @@ -4400,14 +4400,14 @@ p.pu[LUMA_8x32].pixelavg_pp = PFX(pixel_avg_8x32_avx512); p.pu[LUMA_8x16].pixelavg_pp = PFX(pixel_avg_8x16_avx512); p.pu[LUMA_8x8].pixelavg_pp = PFX(pixel_avg_8x8_avx512); -p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512); +//p.pu[LUMA_8x4].pixelavg_pp = PFX(pixel_avg_8x4_avx512); p.pu[LUMA_4x4].sad = PFX(pixel_sad_4x4_avx512); p.pu[LUMA_4x8].sad = PFX(pixel_sad_4x8_avx512); p.pu[LUMA_4x16].sad = PFX(pixel_sad_4x16_avx512); p.pu[LUMA_8x4].sad = PFX(pixel_sad_8x4_avx512); p.pu[LUMA_8x8].sad = PFX(pixel_sad_8x8_avx512); -p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_avx512); + // p.pu[LUMA_8x16].sad = PFX(pixel_sad_8x16_avx512); p.pu[LUMA_16x8].sad = PFX(pixel_sad_16x8_avx512); p.pu[LUMA_16x16].sad = PFX(pixel_sad_16x16_avx512); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 165 of 307] x86: AVX512 interp_4tap_vert_sp_64xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1510729226 -19800 # Wed Nov 15 12:30:26 2017 +0530 # Node ID 3f4b7399d14ba72aba0692e61681276f09df8ada # Parent 7d41838f5d06ad4fbdabd08af99d724fcd599193 x86: AVX512 interp_4tap_vert_sp_64xN for high bit depth i444 Size | AVX2 performance | AVX512 performance -- 64x16 | 23.46x | 43.98x 64x32 | 23.54x | 40.59x 64x48 | 23.71x | 40.46x 64x64 | 23.59x | 40.33x This patch also cleanup horiz_ps_8xN for better readability of code diff -r 7d41838f5d06 -r 3f4b7399d14b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Nov 15 14:35:34 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 15 12:30:26 2017 +0530 @@ -2643,6 +2643,11 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vsp = PFX(interp_4tap_vert_sp_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512); diff -r 7d41838f5d06 -r 3f4b7399d14b source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Wed Nov 15 14:35:34 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Wed Nov 15 12:30:26 2017 +0530 @@ -7342,6 +7342,131 @@ RET %endif +%macro PROCESS_IPFILTER_CHROMA_PS_8x4_AVX512 0 +; register map +; m0 , m1 - interpolate coeff +; m2 , m3 - shuffle 
load order table +; m4 - INTERP_OFFSET_PS +; m5 - shuffle store order table + +movuxm6, [r0] +vinserti32x4m6,[r0 + r1], 1 +vinserti32x4m6,[r0 + 2 * r1], 2 +vinserti32x4m6,[r0 + r6], 3 + +pshufb m8,m6,m3 +pshufb m6,m2 +pmaddwd m6,m0 +pmaddwd m8,m1 +paddd m6,m8 +paddd m6,m4 +psrad m6,INTERP_SHIFT_PS + +movuxm7, [r0 + 8] +vinserti32x4m7,[r0 + r1 + 8], 1 +vinserti32x4m7,[r0 + 2 * r1 + 8], 2 +vinserti32x4m7,[r0 + r6 + 8], 3 + +pshufb m8,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m8,m1 +paddd m7,m8 +paddd m7,m4 +psrad m7,INTERP_SHIFT_PS + +packssdwm6,m7 +pshufb m6,m5 +movu[r2], xm6 +vextracti32x4 [r2 + r3], m6,1 +vextracti32x4 [r2 + 2 * r3], m6,2 +vextracti32x4 [r2 + r7], m6,3 +%endmacro + +%macro PROCESS_IPFILTER_CHROMA_PS_8x3_AVX512 0 +movuxm6, [r0] +vinserti32x4m6,[r0 + r1], 1 +vinserti32x4m6,[r0 + 2 * r1], 2 + +pshufb m8,m6,m3 +pshufb m6,m2 +pmaddwd m6,m0 +pmaddwd m8,m1 +paddd m6,m8 +paddd m6,m4 +psrad m6,INTERP_SHIFT_PS + +movuxm7, [r0 + 8] +vinserti32x4m7,[r0 + r1 + 8], 1 +vinserti32x4m7,[r0 + 2 * r1 + 8], 2 + +pshufb m8,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m8,m1 +paddd m7,m8 +paddd m7,m4 +psrad m7,INTERP_SHIFT_PS + +packssdwm6,m7 +pshufb m6,m5 +movu[r2], xm6 +vextracti32x4 [r2 + r3], m6,1 +vextracti32x4 [r2 + 2 * r3], m6,2 +%endmacro + +%macro IPFILTER_CHROMA_PS_AVX512_8xN 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_ps_8x%1, 4,9,9 +add r1d, r1d +add r3d, r3d +mov r4d, r4m +mov r5d, r5m + +lea r6, [3 * r1] +lea r7, [3 * r3] +%ifdef PIC +lea
[x265] [PATCH 181 of 307] x86: AVX512 interp_4tap_vert_ss_16xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1511333868 -19800 # Wed Nov 22 12:27:48 2017 +0530 # Node ID ad1814e2ff60904208508512af07472dee380c51 # Parent 83f75ffc0773a2448efa7e6485cb009825edae41 x86: AVX512 interp_4tap_vert_ss_16xN i444 Size | AVX2 performance | AVX512 performance -- 16x4 | 13.31x | 32.24x 16x8 | 16.43x | 31.07x 16x12 | 17.26x | 30.29x 16x16 | 17.62x | 31.74x 16x32 | 16.66x | 35.61x 16x64 | 17.09x | 37.18x diff -r 83f75ffc0773 -r ad1814e2ff60 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Nov 22 11:56:13 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 22 12:27:48 2017 +0530 @@ -4824,6 +4824,11 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512); @@ -4839,6 +4844,11 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512); 
+p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vss = PFX(interp_4tap_vert_ss_16x24_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx512); @@ -4858,6 +4868,12 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vss = PFX(interp_4tap_vert_ss_16x4_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vss = PFX(interp_4tap_vert_ss_16x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vss = PFX(interp_4tap_vert_ss_16x12_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vss = PFX(interp_4tap_vert_ss_16x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vss = PFX(interp_4tap_vert_ss_16x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vss = PFX(interp_4tap_vert_ss_16x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512); diff -r 83f75ffc0773 -r ad1814e2ff60 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed Nov 22 11:56:13 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Nov 22 12:27:48 2017 +0530 @@ -11148,6 +11148,118 @@ ;- ;avx512 chroma_vss code start ;- +%macro 
PROCESS_CHROMA_VERT_SS_16x4_AVX512 0 +movu ym1,[r0] +lea r6, [r0 + 2 * r1] +vinserti32x8 m1, [r6],1 +movu ym3,[r0 + r1] +vinserti32x8 m3, [r6 +
[x265] [PATCH 180 of 307] x86: AVX512 interp_4tap_vert_ss_64xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1511331973 -19800 # Wed Nov 22 11:56:13 2017 +0530 # Node ID 83f75ffc0773a2448efa7e6485cb009825edae41 # Parent 635fbc26941a08a2829a473e13fb5052f5a8471a x86: AVX512 interp_4tap_vert_ss_64xN i444 Size | AVX2 performance | AVX512 performance -- 64x16 | 15.89x | 32.95x 64x32 | 16.11x | 37.31x 64x48 | 16.04x | 36.33x 64x64 | 16.63x | 39.27x diff -r 635fbc26941a -r 83f75ffc0773 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Nov 22 10:51:33 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 22 11:56:13 2017 +0530 @@ -4863,6 +4863,10 @@ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vss = PFX(interp_4tap_vert_ss_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx512); p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); diff -r 635fbc26941a -r 83f75ffc0773 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed Nov 22 10:51:33 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Nov 22 11:56:13 2017 +0530 @@ -11261,6 +11261,116 @@ FILTER_VER_SS_CHROMA_32xN_AVX512 64 %endif +%macro PROCESS_CHROMA_VERT_SS_64x2_AVX512 0 +movu m1, [r0] +movu m3, [r0 + r1] +punpcklwdm0, m1, m3 +pmaddwd m0, m15 +punpckhwdm1, m3 +pmaddwd m1, m15 + +movu m9, [r0 + mmsize] +movu m11, [r0 + r1 + mmsize] +punpcklwdm8, m9, m11 +pmaddwd m8, m15 +punpckhwdm9, m11 +pmaddwd m9, m15 +movu m4, [r0 + 2 * r1] +punpcklwdm2, m3, m4 +pmaddwd m2, m15 +punpckhwdm3, m4 
+pmaddwd m3, m15 +movu m12, [r0 + 2 * r1 + mmsize] +punpcklwdm10, m11,m12 +pmaddwd m10, m15 +punpckhwdm11, m12 +pmaddwd m11, m15 + +lea r0, [r0 + 2 * r1] +movu m5, [r0 + r1] +punpcklwdm6, m4, m5 +pmaddwd m6, m16 +padddm0, m6 +punpckhwdm4, m5 +pmaddwd m4, m16 +padddm1, m4 + +movu m13, [r0 + r1 + mmsize] +punpcklwdm14, m12,m13 +pmaddwd m14, m16 +padddm8, m14 +punpckhwdm12, m13 +pmaddwd m12, m16 +padddm9, m12 + +movu m4, [r0 + 2 * r1] +punpcklwdm6, m5, m4 +pmaddwd m6, m16 +padddm2, m6 +punpckhwdm5, m4 +pmaddwd m5, m16 +padddm3, m5 + +movu m12, [r0 + 2 * r1 + mmsize] +punpcklwdm14, m13,m12 +pmaddwd m14, m16 +padddm10, m14 +punpckhwdm13, m12 +pmaddwd m13, m16 +padddm11, m13 + +psradm0, 6 +psradm1, 6 +psradm2,
[x265] [PATCH 179 of 307] x86: AVX512 interp_4tap_vert_ss_32xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1511328093 -19800 # Wed Nov 22 10:51:33 2017 +0530 # Node ID 635fbc26941a08a2829a473e13fb5052f5a8471a # Parent 963884afd8f38dbcc8335ff1d3a39385e317d6d4 x86: AVX512 interp_4tap_vert_ss_32xN i444 Size | AVX2 performance | AVX512 performance -- 32x8 | 15.51x | 34.64x 32x16 | 17.04x | 37.82x 32x24 | 15.81x | 35.75x 32x32 | 16.64x | 40.20x 32x64 | 16.85x | 35.51x diff -r 963884afd8f3 -r 635fbc26941a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Nov 16 14:46:53 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 22 10:51:33 2017 +0530 @@ -4824,6 +4824,11 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512); + p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); @@ -4834,6 +4839,11 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512); 
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512); + p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); @@ -4848,6 +4858,12 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vpp = PFX(interp_4tap_vert_pp_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vss = PFX(interp_4tap_vert_ss_32x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512); + p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); diff -r 963884afd8f3 -r 635fbc26941a source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Thu Nov 16 14:46:53 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Nov 22 10:51:33 2017 +0530 @@ -167,6 +167,31 @@ times 32 db -2, 10 times 32 db 58, -2 +ALIGN 64 +const pw_ChromaCoeffVer_32_avx512, times 16 dw 0, 64 +times 16 dw 0, 0 + +times 16 dw -2, 58 +times 16 dw 10, -2 + +times 16 dw -4, 54 +times 16 dw 16, -2 + +times 16 dw -6, 46 +times 16 dw 28, -4 + +times 16 dw -4, 36 +times 16 dw 36, -4 + +times 16 dw -4, 28 +times 16 dw 46, -6 + +times 16 dw -2, 16 +times 16 dw 54, -4 + +times 16 dw -2, 10 +times 16 dw 58, -2 + const
[x265] [PATCH 177 of 307] x86: AVX512 optimise interp_4tap_vert_pp_32xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1509968597 -19800 # Mon Nov 06 17:13:17 2017 +0530 # Node ID 2c24c0aadbe3e76eabde711a94c57aed077b7347 # Parent 67e149415f9f8be0d5b7832fde9e02cc592bbf28 x86: AVX512 optimise interp_4tap_vert_pp_32xN diff -r 67e149415f9f -r 2c24c0aadbe3 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Nov 20 15:07:31 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Nov 06 17:13:17 2017 +0530 @@ -142,6 +142,7 @@ times 16 db 58, -10 times 16 db 4, -1 +ALIGN 64 const tab_ChromaCoeffVer_32_avx512, times 32 db 0, 64 times 32 db 0, 0 @@ -10796,151 +10797,94 @@ IPFILTER_CHROMA_PS_48xN_AVX512 64 %endif -%macro PROCESS_CHROMA_VERT_PP_32x8_AVX512 0 -movu ym0, [r0]; m0 = row 0 -lea r6, [r0 + 4 * r1] -lea r7, [r2 + 4 * r3] -vinserti32x8 m0, [r6], 1; m0 = row 4 -movu ym1, [r0 + r1] ; m1 = row 1 -vinserti32x8 m1, [r6 + r1], 1; m1 = row 5 -punpcklbw m2, m0,m1 -punpckhbw m3, m0,m1 -pmaddubsw m2, m10 -pmaddubsw m3, m10 - -movu ym0, [r0 + r1 * 2] ; m0 = row 2 -vinserti32x8 m0, [r6 + r1 * 2], 1; m0 = row 6 -punpcklbw m4, m1,m0 -punpckhbw m5, m1,m0 -pmaddubsw m4, m10 -pmaddubsw m5, m10 - -movu ym1, [r0 + r4] ; m1 = row 3 -vinserti32x8 m1, [r6 + r4], 1; m1 = row 7 -punpcklbw m6, m0,m1 -punpckhbw m7, m0,m1 -pmaddubsw m8, m6,m11 -pmaddubsw m9, m7,m11 -pmaddubsw m6, m10 -pmaddubsw m7, m10 - -paddw m2, m8 -paddw m3, m9 - -pmulhrsw m2, m12 -pmulhrsw m3, m12 -packuswb m2, m3 -movu [r2],ym2 -vextracti32x8 [r7],m2,1 -lea r0, [r0 + r1 * 4] -lea r6, [r6 + r1 * 4] - -movu ym0, [r0]; m0 = row 4 -vinserti32x8 m0, [r6], 1; m0 = row 8 -punpcklbw m2, m1,m0 -punpckhbw m3, m1,m0 -pmaddubsw m8, m2,m11 -pmaddubsw m9, m3,m11 -pmaddubsw m2, m10 -pmaddubsw m3, m10 - -paddw m4, m8 -paddw m5, m9 -pmulhrsw m4, m12 -pmulhrsw m5, m12 -packuswb m4, m5 -movu [r2 + r3], ym4 -vextracti32x8 [r7 + r3], m4,1 - -movu ym1, [r0 + r1] ; m1 = row 5 -vinserti32x8 m1, [r6 + r1], 1; m1 = row 9 -punpcklbw m4, m0,m1 -punpckhbw m5, m0,m1 -pmaddubsw m4, m11 -pmaddubsw 
m5, m11 -paddw m6, m4 -paddw m7, m5 - -pmulhrsw m6, m12 -pmulhrsw m7, m12 -packuswb m6, m7 -movu [r2 + r3 * 2], ym6 -vextracti32x8 [r7 + r3 * 2], m6,1 - -movu ym0, [r0 + r1 * 2] ; m0 = row 6 -vinserti32x8 m0, [r6 + r1 * 2], 1; m0 = row 10 -punpcklbw m6, m1,m0 -punpckhbw m7, m1,m0 -pmaddubsw m6, m11 -pmaddubsw m7, m11 -paddw m2, m6 -paddw m3, m7 -pmulhrsw m2, m12 -pmulhrsw m3, m12 -packuswb m2, m3 -movu [r2 + r5], ym2 -
[x265] [PATCH 159 of 307] x86: dct8 PASS2 optimize for shuffle instructions
# HG changeset patch # User Praveen Tiwari# Date 1510583185 28800 # Mon Nov 13 06:26:25 2017 -0800 # Node ID 8bfedd92563a0e1da365c4d64a0e565e35f6025a # Parent a7ce91c5db95ac0eb3f58b5c993ace3bfe0bbe2f x86: dct8 PASS2 optimize for shuffle instructions diff -r a7ce91c5db95 -r 8bfedd92563a source/common/x86/dct8.asm --- a/source/common/x86/dct8.asmMon Nov 13 04:08:07 2017 -0800 +++ b/source/common/x86/dct8.asmMon Nov 13 06:26:25 2017 -0800 @@ -35,9 +35,11 @@ dct8_shuf8_AVX512: dd 0, 2, 8, 10, 4, 6, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 dct8_shuf4_AVX512: times 2 dd 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 dct8_shuf7_AVX512: dw 0, 2, 16, 18, 8, 10, 24, 26, 4, 6, 20, 22, 12, 14, 28, 30 +dct8_shuf9_AVX512: times 2 dw 0, 8, 16, 24, 4, 12, 20, 28 dct8_shuf: times 2 db 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 dct8_shuf_AVX512: times 2 db 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11 + tab_dct8: dw 64, 64, 64, 64, 64, 64, 64, 64 dw 89, 75, 50, 18, -18, -50, -75, -89 dw 83, 36, -36, -83, -83, -36, 36, 83 @@ -2325,77 +2327,83 @@ %macro DCT8_AVX512_PASS_2 4 vpmaddwd m0, m9, m%1 vpmaddwd m1, m10, m%1 -vpshufb m2, m0, m6 -vpshufb m3, m1, m6 +vpsrldq m2, m0, 8 +vpsrldq m3, m1, 8 vpaddd m0, m2 vpaddd m1, m3 -vpermd m0, m18, m0 -vpermd m1, m18, m1 -vinserti64x4 m0, m0, ym1, 1 -vpshufb m1, m0, m6 -vpaddd m0, m1 -vpermd m0, m18, m0 - -vpmaddwd m1, m9, m%2 +vpsrlq m2, m0, 32 +vpsrlq m3, m1, 32 +vpaddd m0, m2 +vpaddd m1, m3 +vpaddd m0, m5 +vpsrad m0, DCT8_SHIFT2 +vpaddd m1, m5 +vpsrad m1, DCT8_SHIFT2 +vpackssdwm0, m1 +vpermw m0, m19, m0 + +vpmaddwd m1, m9, m%2 vpmaddwd m2, m10, m%2 -vpshufb m3, m1, m6 -vpshufb m4, m2, m6 +vpsrldq m3, m1, 8 +vpsrldq m4, m2, 8 vpaddd m1, m3 vpaddd m2, m4 -vpermd m1, m18, m1 -vpermd m2, m18, m2 -vinserti64x4 m1, m1, ym2, 1 -vpshufb m2, m1, m6 -vpaddd m1, m2 -vpermd m1, m18, m1 - -vinserti64x4 m0, m0, ym1, 1 -vpaddd m0, m5 -vpsrad m0, DCT8_SHIFT2 +vpsrlq m3, m1, 32 +vpsrlq m4, m2, 32 +vpaddd m1, m3 +vpaddd m2, m4 +vpaddd m1, m5 
+vpsrad m1, DCT8_SHIFT2 +vpaddd m2, m5 +vpsrad m2, DCT8_SHIFT2 +vpackssdwm1, m2 +vpermw m1, m19, m1 +vinserti128 ym0, ym0, xm1, 1 vpmaddwd m1, m9, m%3 vpmaddwd m2, m10, m%3 -vpshufb m3, m1, m6 -vpshufb m4, m2, m6 +vpsrldq m3, m1, 8 +vpsrldq m4, m2, 8 vpaddd m1, m3 vpaddd m2, m4 -vpermd m1, m18, m1 -vpermd m2, m18, m2 -vinserti64x4 m1, m1, ym2, 1 -vpshufb m2, m1, m6 -vpaddd m1, m2 -vpermd m1, m18, m1 - -vpmaddwd m2, m9, m%4 -vpmaddwd m3, m10, m%4 -vpshufb m4, m2, m6 -vpshufb m7, m3, m6 +vpsrlq m3, m1, 32 +vpsrlq m4, m2, 32 +vpaddd m1, m3 vpaddd m2, m4 -vpaddd m3, m7 -vpermd m2, m18, m2 -vpermd m3, m18, m3 -vinserti64x4 m2, m2, ym3, 1 -vpshufb m3, m2, m6 -vpaddd m2,
[x265] [PATCH 174 of 307] x86: AVX512 interp_4tap_horiz_ps_24xN for high bit depth
# HG changeset patch # User Jayashri Murugan# Date 1510569160 -19800 # Mon Nov 13 16:02:40 2017 +0530 # Node ID df3c576cd32c50b0412ad3d70eeebfe8fb511da1 # Parent ab41c6957bc2f359e5df82f9936c3fd00a5d2ea5 x86: AVX512 interp_4tap_horiz_ps_24xN for high bit depth Color Space i420 Size | AVX2 performance | AVX512 performance -- 24x32 | 24.21x| 34.11x Color Space i422 Size | AVX2 performance | AVX512 performance -- 24x64 | 24.99x| 35.13x Color Space i444 Size | AVX2 performance | AVX512 performance -- 24x32 | 24.40x| 34.42x diff -r ab41c6957bc2 -r df3c576cd32c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Apr 05 18:09:10 2018 -0700 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 13 16:02:40 2017 +0530 @@ -2897,6 +2897,10 @@ p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_hps = PFX(interp_4tap_horiz_ps_8x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_hps = PFX(interp_4tap_horiz_ps_8x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_hps = PFX(interp_4tap_horiz_ps_24x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_24x32].filter_hps = PFX(interp_4tap_horiz_ps_24x32_avx512); + } #endif } diff -r ab41c6957bc2 -r df3c576cd32c source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Thu Apr 05 18:09:10 2018 -0700 +++ b/source/common/x86/ipfilter16.asm Mon Nov 13 16:02:40 2017 +0530 @@ -7479,6 +7479,228 @@ IPFILTER_CHROMA_PS_AVX512_8xN 32 IPFILTER_CHROMA_PS_AVX512_8xN 64 %endif + +%macro PROCESS_IPFILTER_CHROMA_PS_24x4_AVX512 0 +; register map +; m0 , m1 - interpolate coeff +; m2 , m3 - shuffle order table +; m4 - INTERP_OFFSET_PS +; m5 - shuffle store order table + +movuym6, [r0] +vinserti32x8m6,[r0 + r1], 1 +movuym7, [r0 + 8] +vinserti32x8m7,[r0 + r1 + 8], 1 + +pshufb m8,m6,m3 +pshufb m6,m2 +pmaddwd m6,m0 +pmaddwd m8,m1 +paddd m6,m8 +paddd m6,m4 +psrad m6,INTERP_SHIFT_PS + +pshufb m8,m7,m3 +pshufb m7,m2 
+pmaddwd m7,m0 +pmaddwd m8,m1 +paddd m7,m8 +paddd m7,m4 +psrad m7,INTERP_SHIFT_PS + +packssdwm6,m7 +pshufb m6,m5 +movu[r2], ym6 +vextracti32x8 [r2 + r3], m6,1 + +movuym6, [r0 + 2 * r1] +vinserti32x8m6,[r0 + r6], 1 +movuym7, [r0 + 2 * r1 + 8] +vinserti32x8m7,[r0 + r6 + 8], 1 + +pshufb m8,m6,m3 +pshufb m6,m2 +pmaddwd m6,m0 +pmaddwd m8,m1 +paddd m6,m8 +paddd m6,m4 +psrad m6,INTERP_SHIFT_PS + +pshufb m8,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m8,m1 +paddd m7,m8 +paddd m7,m4 +psrad m7,INTERP_SHIFT_PS + +packssdwm6,m7 +pshufb m6,m5 +movu[r2 + 2 * r3],ym6 +vextracti32x8 [r2 + r7], m6,1 + +movuxm6, [r0 + mmsize/2] +vinserti32x4m6,[r0 + r1 + mmsize/2], 1 +vinserti32x4m6,[r0 + 2 * r1 + mmsize/2], 2 +vinserti32x4m6,[r0 + r6 + mmsize/2], 3 + +pshufb m8,m6,m3 +pshufb m6,m2 +pmaddwd m6,m0 +pmaddwd m8,m1 +paddd m6,m8 +paddd m6,m4 +psrad m6,INTERP_SHIFT_PS + +movuxm7, [r0 + mmsize/2 + 8] +vinserti32x4m7,[r0 + r1 + mmsize/2 + 8], 1 +vinserti32x4m7,[r0 + 2 * r1 + mmsize/2 + 8], 2 +vinserti32x4m7,[r0 + r6 + mmsize/2 + 8], 3 + +pshufb m8,m7,m3 +pshufb m7,m2 +pmaddwd m7,m0 +pmaddwd m8,m1 +paddd m7,m8 +paddd m7,m4 +psrad m7,INTERP_SHIFT_PS + +packssdwm6,m7 +pshufb m6,m5 +movu[r2 +
[x265] [PATCH 182 of 307] x86: AVX512 interp_4tap_vert_ss_48x64
# HG changeset patch # User Vignesh Vijayakumar# Date 1511337892 -19800 # Wed Nov 22 13:34:52 2017 +0530 # Node ID 3d6605772d179c329fffc669cbecc64afd8c8dff # Parent ad1814e2ff60904208508512af07472dee380c51 x86: AVX512 interp_4tap_vert_ss_48x64 AVX2 performance : 16.34x AVX512 performance : 35.69x diff -r ad1814e2ff60 -r 3d6605772d17 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Nov 22 12:27:48 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 22 13:34:52 2017 +0530 @@ -4883,6 +4883,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vss = PFX(interp_4tap_vert_ss_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx512); p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); diff -r ad1814e2ff60 -r 3d6605772d17 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed Nov 22 12:27:48 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Nov 22 13:34:52 2017 +0530 @@ -11373,6 +11373,155 @@ FILTER_VER_SS_CHROMA_32xN_AVX512 64 %endif +%macro PROCESS_CHROMA_VERT_SS_48x4_AVX512 0 +movu m1, [r0] +lea r6, [r0 + 2 * r1] +movu m10,[r6] +movu m3, [r0 + r1] +movu m12,[r6 + r1] +punpcklwd m0, m1, m3 +punpcklwd m9, m10, m12 +pmaddwd m0, m16 +pmaddwd m9, m16 +punpckhwd m1, m3 +punpckhwd m10,m12 +pmaddwd m1, m16 +pmaddwd m10,m16 + +movu m4, [r0 + 2 * r1] +movu m13,[r6 + 2 * r1] +punpcklwd m2, m3, m4 +punpcklwd m11,m12, m13 +pmaddwd m2, m16 +pmaddwd m11,m16 +punpckhwd m3, m4 +punpckhwd m12,m13 +pmaddwd m3, m16 +pmaddwd m12,m16 + +movu m5, [r0 + r7] +movu m14,[r6 + r7] +punpcklwd m6, m4, m5 +punpcklwd m15,m13, m14 +pmaddwd m6, m17 +pmaddwd m15,m17 +paddd m0, m6 +paddd m9, m15 +punpckhwd m4, m5 +punpckhwd m13,m14 +pmaddwd m4, m17 +pmaddwd m13,m17 +paddd m1, m4 +paddd 
m10,m13 + +movu m4, [r0 + 4 * r1] +movu m13,[r6 + 4 * r1] +punpcklwd m6, m5, m4 +punpcklwd m15,m14, m13 +pmaddwd m6, m17 +pmaddwd m15,m17 +paddd m2, m6 +paddd m11,m15 +punpckhwd m5, m4 +punpckhwd m14,m13 +pmaddwd m5, m17 +pmaddwd m14,m17 +paddd m3, m5 +paddd m12,m14 + +psrad m0, 6 +psrad m1, 6 +psrad m2, 6 +psrad m3, 6 +psrad m9, 6 +psrad m10,6 +psrad m11,6 +psrad m12,6 +packssdw m0, m1 +packssdw m2, m3 +packssdw m9, m10 +packssdw m11,m12 + +movu [r2], m0 +movu [r2 + r3], m2 +movu
[x265] [PATCH 195 of 307] x86: AVX512 interp_8tap_vert_sp_24x32 and interp_8tap_vert_ss_24x32
# HG changeset patch # User Vignesh Vijayakumar# Date 1511505545 -19800 # Fri Nov 24 12:09:05 2017 +0530 # Node ID 47b99c09008b1921881b0dfa00d80cce1f8d15eb # Parent ecaf36f641dd1428d556f172e83cf7078f0287fb x86: AVX512 interp_8tap_vert_sp_24x32 and interp_8tap_vert_ss_24x32 luma_vss AVX2 performance : 10.98x AVX512 performance : 16.36x luma_vsp AVX2 performance : 12.19x AVX512 performance : 17.20x diff -r ecaf36f641dd -r 47b99c09008b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Nov 24 11:34:33 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Nov 24 12:09:05 2017 +0530 @@ -2844,6 +2844,7 @@ p.pu[LUMA_16x16].luma_vss = PFX(interp_8tap_vert_ss_16x16_avx512); p.pu[LUMA_16x32].luma_vss = PFX(interp_8tap_vert_ss_16x32_avx512); p.pu[LUMA_16x64].luma_vss = PFX(interp_8tap_vert_ss_16x64_avx512); +p.pu[LUMA_24x32].luma_vss = PFX(interp_8tap_vert_ss_24x32_avx512); p.pu[LUMA_32x8].luma_vss = PFX(interp_8tap_vert_ss_32x8_avx512); p.pu[LUMA_32x16].luma_vss = PFX(interp_8tap_vert_ss_32x16_avx512); p.pu[LUMA_32x32].luma_vss = PFX(interp_8tap_vert_ss_32x32_avx512); @@ -2864,6 +2865,7 @@ p.pu[LUMA_16x16].luma_vsp = PFX(interp_8tap_vert_sp_16x16_avx512); p.pu[LUMA_16x32].luma_vsp = PFX(interp_8tap_vert_sp_16x32_avx512); p.pu[LUMA_16x64].luma_vsp = PFX(interp_8tap_vert_sp_16x64_avx512); +p.pu[LUMA_24x32].luma_vsp = PFX(interp_8tap_vert_sp_24x32_avx512); p.pu[LUMA_32x8].luma_vsp = PFX(interp_8tap_vert_sp_32x8_avx512); p.pu[LUMA_32x16].luma_vsp = PFX(interp_8tap_vert_sp_32x16_avx512); p.pu[LUMA_32x32].luma_vsp = PFX(interp_8tap_vert_sp_32x32_avx512); diff -r ecaf36f641dd -r 47b99c09008b source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Fri Nov 24 11:34:33 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Fri Nov 24 12:09:05 2017 +0530 @@ -11134,6 +11134,284 @@ FILTER_VER_S_LUMA_16xN_AVX512 sp, 64 %endif +%macro PROCESS_LUMA_VERT_S_24x8_AVX512 1 +PROCESS_LUMA_VERT_S_16x4_AVX512 %1 +lea r4, [r6 + 4 * r1] +lea r8, [r4 + 4 
* r1] +movu ym1, [r6] +movu ym3, [r6 + r1] +vinserti32x8 m1, [r6 + 2 * r1], 1 +vinserti32x8 m3, [r6 + r7], 1 +punpcklwdm0, m1, m3 +pmaddwd m0, m15 +punpckhwdm1, m3 +pmaddwd m1, m15 + +movu ym4, [r6 + 2 * r1] +vinserti32x8 m4, [r4], 1 +punpcklwdm2, m3, m4 +pmaddwd m2, m15 +punpckhwdm3, m4 +pmaddwd m3, m15 + +movu ym5, [r6 + r7] +vinserti32x8 m5, [r4 + r1], 1 +punpcklwdm6, m4, m5 +pmaddwd m6, m16 +punpckhwdm4, m5 +pmaddwd m4, m16 + +padddm0, m6 +padddm1, m4 + +movu ym4, [r4] +vinserti32x8 m4, [r4 + 2 * r1], 1 +punpcklwdm6, m5, m4 +pmaddwd m6, m16 +punpckhwdm5, m4 +pmaddwd m5, m16 + +padddm2, m6 +padddm3, m5 + +movu ym11,[r4 + r1] +vinserti32x8 m11, [r4 + r7], 1 +punpcklwdm8, m4, m11 +pmaddwd m8, m17 +punpckhwdm4, m11 +pmaddwd m4, m17 + +movu ym12,[r4 + 2 * r1] +vinserti32x8 m12, [r4 + 4 * r1], 1 +punpcklwdm10, m11,m12 +pmaddwd m10, m17 +punpckhwdm11, m12 +pmaddwd m11, m17 + +movu ym13,[r4 + r7] +vinserti32x8 m13, [r8 + r1], 1 +punpcklwdm14, m12,m13 +pmaddwd m14,
[x265] [PATCH 228 of 307] x86: AVX512 interp_4tap_vert_sp_48x64
# HG changeset patch # User Vignesh Vijayakumar# Date 1512041776 -19800 # Thu Nov 30 17:06:16 2017 +0530 # Node ID e77ef4964dd04de6a8b84378f7a46219f34bf1b5 # Parent 9c652d9062d29607cdb3392567817e4e2ab7f6bb x86: AVX512 interp_4tap_vert_sp_48x64 AVX2 performance : 11.93x AVX512 performance : 23.59x diff -r 9c652d9062d2 -r e77ef4964dd0 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Nov 30 17:01:28 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Nov 30 17:06:16 2017 +0530 @@ -4998,6 +4998,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vsp = PFX(interp_4tap_vert_sp_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vsp = PFX(interp_4tap_vert_sp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vsp = PFX(interp_4tap_vert_sp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vsp = PFX(interp_4tap_vert_sp_64x32_avx512); diff -r 9c652d9062d2 -r e77ef4964dd0 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Thu Nov 30 17:01:28 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Nov 30 17:06:16 2017 +0530 @@ -11728,114 +11728,122 @@ FILTER_VER_S_CHROMA_32xN_AVX512 sp, 48 FILTER_VER_S_CHROMA_32xN_AVX512 sp, 64 %endif -%macro PROCESS_CHROMA_VERT_SS_48x4_AVX512 0 -movu m1, [r0] + +%macro PROCESS_CHROMA_VERT_S_48x4_AVX512 1 +PROCESS_CHROMA_VERT_S_32x2_AVX512 %1 lea r6, [r0 + 2 * r1] -movu m10,[r6] -movu m3, [r0 + r1] -movu m12,[r6 + r1] + +movu m1, [r6] +movu m3, [r6 + r1] punpcklwd m0, m1, m3 -punpcklwd m9, m10, m12 -pmaddwd m0, m16 -pmaddwd m9, m16 +pmaddwd m0, m7 punpckhwd m1, m3 -punpckhwd m10,m12 -pmaddwd m1, m16 -pmaddwd m10,m16 - -movu m4, [r0 + 2 * r1] -movu m13,[r6 + 2 * r1] +pmaddwd m1, m7 +movu m4, 
[r6 + 2 * r1] punpcklwd m2, m3, m4 -punpcklwd m11,m12, m13 -pmaddwd m2, m16 -pmaddwd m11,m16 +pmaddwd m2, m7 punpckhwd m3, m4 -punpckhwd m12,m13 -pmaddwd m3, m16 -pmaddwd m12,m16 - -movu m5, [r0 + r7] -movu m14,[r6 + r7] +pmaddwd m3, m7 + +movu m5, [r6 + r4] punpcklwd m6, m4, m5 -punpcklwd m15,m13, m14 -pmaddwd m6, m17 -pmaddwd m15,m17 +pmaddwd m6, m8 paddd m0, m6 -paddd m9, m15 punpckhwd m4, m5 -punpckhwd m13,m14 -pmaddwd m4, m17 -pmaddwd m13,m17 +pmaddwd m4, m8 paddd m1, m4 -paddd m10,m13 - -movu m4, [r0 + 4 * r1] -movu m13,[r6 + 4 * r1] + +movu m4, [r6 + 4 * r1] punpcklwd m6, m5, m4 -punpcklwd m15,m14, m13 -pmaddwd m6, m17 -pmaddwd m15,m17 +pmaddwd m6, m8 paddd m2, m6 -paddd m11,m15 punpckhwd m5, m4 -
[x265] [PATCH 229 of 307] [x265-avx512]x86: AVX512 denoise DCT
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1512036841 -19800 # Thu Nov 30 15:44:01 2017 +0530 # Node ID f86b11b8c629b0e4bf8342d42a0e9c475d7c3a7d # Parent e77ef4964dd04de6a8b84378f7a46219f34bf1b5 [x265-avx512]x86: AVX512 denoise DCT diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Nov 30 17:06:16 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Nov 30 15:44:01 2017 +0530 @@ -2888,6 +2888,7 @@ p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); p.quant = PFX(quant_avx512); p.nquant = PFX(nquant_avx512); +p.denoiseDct = PFX(denoise_dct_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); @@ -5068,6 +5069,7 @@ p.cu[BLOCK_32x32].idct = PFX(idct32_avx512); p.quant = PFX(quant_avx512); p.nquant = PFX(nquant_avx512); +p.denoiseDct = PFX(denoise_dct_avx512); } #endif } diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.asm --- a/source/common/x86/dct8.asmThu Nov 30 17:06:16 2017 +0530 +++ b/source/common/x86/dct8.asmThu Nov 30 15:44:01 2017 +0530 @@ -2357,6 +2357,67 @@ dec r3d jnz .loop RET +%if ARCH_X86_64 == 1 +INIT_ZMM avx512 +cglobal denoise_dct, 4, 4, 22 +pxor m16, m16 +sub r3d, 16 +je .coeff16 +add r3d, 16 +shr r3d,5 +jmp .loop + +.coeff16: +movu ym19, [r0] +pabsw ym17, ym19 +movum2, [r1] +pmovsxwd m18, ym17 +paddd m2, m18 +movu [r1], m2 +movu ym3, [r2] +psubusw ym17, ym3 +pcmpgtw ym18, ym17, ym16 +pand ym17, ym18 +psignwym17, ym19 +movu [r0], ym17 +RET + +.loop: +movu m21, [r0] +pabsw m17, m21 +movu m2, [r1] +pmovsxwd m4, ym17 +paddd m2, m4 +movu [r1], m2 +vextracti64x4 ym4, m17, 1 + +movu m2, [r1 + mmsize] +pmovsxwd m3, ym4 +paddd m2, m3 +movu [r1 + mmsize], m2 +movu m3, [r2] +psubusw m17, m3 + +vextracti64x4 ym20, m17,1 
+pcmpgtw ym18, ym17, ym16 +pcmpgtw ym19, ym20, ym16 +vinserti64x4 m18, m18, ym19, 1 + +pand m17, m18 +vextracti64x4 ym19, m17, 1 +vextracti64x4 ym20, m21, 1 +psignwym17, ym21 +psignwym19, ym20 +vinserti64x4 m17, m17, ym19, 1 + +movu [r0], m17 +add r0, mmsize +add r1, mmsize * 2 +add r2, mmsize +dec r3d +jnz .loop +RET +%endif ; ARCH_X86_64 == 1 %if ARCH_X86_64 == 1 %macro DCT8_PASS_1 4 diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.h --- a/source/common/x86/dct8.h Thu Nov 30 17:06:16 2017 +0530 +++ b/source/common/x86/dct8.h Thu Nov 30 15:44:01 2017 +0530 @@ -42,7 +42,7 @@ void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size); void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size); - +void PFX(denoise_dct_avx512)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size); void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride); void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride); void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 245 of 307] x86: AVX512 interp_4tap_vert_ps_32xN
# HG changeset patch # User Jayashri Murugan# Date 1512455478 -19800 # Tue Dec 05 12:01:18 2017 +0530 # Node ID c335a7ca4304001e245dea7977cde1c2e0c0a8ee # Parent 81a870948ac446b36c248325e0c7264cf8f3f09e x86: AVX512 interp_4tap_vert_ps_32xN i420 Size | AVX2 performance | AVX512 performance -- 32x8 | 36.28x | 47.86x 32x16 | 40.43x | 51.57x 32x24 | 40.96x | 54.05x 32x32 | 40.12x | 54.27x i422 Size | AVX2 performance | AVX512 performance -- 32x16 | 39.84x | 51.35x 32x32 | 39.86x | 54.17x 32x48 | 41.14x | 54.85x 32x64 | 42.00x | 56.50x i444 Size | AVX2 performance | AVX512 performance -- 32x8 | 36.08x | 47.61x 32x16 | 39.96x | 51.41x 32x24 | 40.38x | 54.51x 32x32 | 40.07x | 54.56x 32x64 | 41.94x | 56.59x diff -r 81a870948ac4 -r c335a7ca4304 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Dec 07 15:31:54 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 05 12:01:18 2017 +0530 @@ -5158,6 +5158,23 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512); + +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512); + +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = 
PFX(interp_4tap_vert_ps_32x48_avx512); + +p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512); + } #endif } diff -r 81a870948ac4 -r c335a7ca4304 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Thu Dec 07 15:31:54 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Tue Dec 05 12:01:18 2017 +0530 @@ -10951,7 +10951,7 @@ FILTER_VER_PP_CHROMA_16xN_AVX512 64 %endif -%macro PROCESS_CHROMA_VERT_PP_32x4_AVX512 0 +%macro PROCESS_CHROMA_VERT_32x4_AVX512 1 movu ym1,[r0] movu ym3,[r0 + r1] vinserti32x8 m1, [r0 + 2 * r1], 1 @@ -10988,25 +10988,45 @@ pmaddubsw m5, m9 paddw m3, m5 +%ifidn %1,pp pmulhrsw m0, m7 pmulhrsw m1, m7 pmulhrsw m2, m7 pmulhrsw m3, m7 - packuswb m0, m1 packuswb m2, m3 movu [r2], ym0 movu [r2 + r3], ym2 vextracti32x8 [r2 + 2 * r3], m0, 1 vextracti32x8 [r2 + r7], m2, 1 +%else +psubw m0, m7 +psubw m1, m7 +psubw m2, m7 +psubw m3, m7 + +mova m4, m10 +mova m5, m11 +vpermi2q m4, m0,m1 +vpermi2q m5, m0,m1 +mova m6, m10 +mova m12,m11 +vpermi2q m6, m2,m3 +vpermi2q m12, m2,m3 + +movu
[x265] [PATCH 232 of 307] x86: AVX512 optimise interp_4tap_vert_pp_16xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1512377610 -19800 # Mon Dec 04 14:23:30 2017 +0530 # Node ID 3e8615bc86537e07754a1c023ade702a837042a8 # Parent 465682e66d91ecf207feae78c33e32f0eaaf45c4 x86: AVX512 optimise interp_4tap_vert_pp_16xN i444 Size | AVX2 performance | AVX512 performance -- 16x4 | 26.22x | 32.07x 16x12 | 30.95x | 40.01x diff -r 465682e66d91 -r 3e8615bc8653 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Dec 04 12:33:32 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Dec 04 14:23:30 2017 +0530 @@ -4893,9 +4893,11 @@ p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_hps = PFX(interp_4tap_horiz_ps_48x64_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); -p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); -p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); @@ -4927,6 +4929,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = 
PFX(interp_4tap_vert_pp_16x24_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512); @@ -4960,7 +4963,9 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512); diff -r 465682e66d91 -r 3e8615bc8653 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Dec 04 12:33:32 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Dec 04 14:23:30 2017 +0530 @@ -10866,96 +10866,50 @@ ;- ;avx512 chroma_vpp code start ;- -%macro PROCESS_CHROMA_VERT_PP_16x8_AVX512 0 +%macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0 +lea r5, [r0 + 4 * r1] movu xm1,[r0] -lea r8, [r0 + 4 * r1] -lea r9, [r8 + 2 * r1] -vinserti32x4 m1, [r0 + 2 * r1],1 -vinserti32x4 m1, [r8],2 -vinserti32x4 m1, [r9],3 movu xm3,[r0 + r1] -vinserti32x4 m3, [r0 + r6], 1 -vinserti32x4 m3, [r8 + r1], 2 -vinserti32x4 m3, [r9 + r1], 3 +vinserti32x4 m1, [r0 + r1], 1 +vinserti32x4 m3, [r0 + 2 * r1], 1 +vinserti32x4 m1, [r0 + 2 * r1], 2 +
[x265] [PATCH 243 of 307] [x265-avx512]x86: AVX512 sad_x3_16xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1512637265 -19800 # Thu Dec 07 14:31:05 2017 +0530 # Node ID 0ffc9c56a0a7361e98e6388e3067e4a78e8cd252 # Parent 931dd781dc0c6de76bb31d0215db7a7af885f9bf [x265-avx512]x86: AVX512 sad_x3_16xN for high bit depth Size | AVX2 performance | AVX512 performance -- 16x8 | 16.34x | 17.91x 16x12 | 17.38x | 18.82x 16x16 | 17.90x | 20.07x 16x32 | 18.39x | 21.77x 16x64 | 18.00x | 22.43x diff -r 931dd781dc0c -r 0ffc9c56a0a7 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Dec 07 11:07:35 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Dec 07 14:31:05 2017 +0530 @@ -2495,6 +2495,11 @@ p.pu[LUMA_64x64].pixelavg_pp = PFX(pixel_avg_64x64_avx512); p.pu[LUMA_48x64].pixelavg_pp = PFX(pixel_avg_48x64_avx512); +p.pu[LUMA_16x8].sad_x3 = PFX(pixel_sad_x3_16x8_avx512); +p.pu[LUMA_16x12].sad_x3 = PFX(pixel_sad_x3_16x12_avx512); +p.pu[LUMA_16x16].sad_x3 = PFX(pixel_sad_x3_16x16_avx512); +p.pu[LUMA_16x32].sad_x3 = PFX(pixel_sad_x3_16x32_avx512); +p.pu[LUMA_16x64].sad_x3 = PFX(pixel_sad_x3_16x64_avx512); p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512); p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512); p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); diff -r 931dd781dc0c -r 0ffc9c56a0a7 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Dec 07 11:07:35 2017 +0530 +++ b/source/common/x86/sad16-a.asm Thu Dec 07 14:31:05 2017 +0530 @@ -2443,6 +2443,54 @@ %endmacro +%macro PROCESS_SAD_X3_16x4_AVX512 0 +movuym6, [r0] +vinserti64x4 m6, [r0 + 2 * FENC_STRIDE], 1 +movuym3, [r1] +vinserti64x4 m3, [r1 + r4], 1 +movuym4, [r2] +vinserti64x4 m4, [r2 + r4], 1 +movuym5, [r3] +vinserti64x4 m5, [r3 + r4], 1 + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movuym6, [r0 + 4 * FENC_STRIDE] +vinserti64x4 m6, [r0 + 6 * FENC_STRIDE], 1 +movuym3, 
[r1 + 2 * r4] +vinserti64x4 m3, [r1 + r6], 1 +movuym4, [r2 + 2 * r4] +vinserti64x4 m4, [r2 + r6], 1 +movuym5, [r3 + 2 * r4] +vinserti64x4 m5, [r3 + r6], 1 + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 +%endmacro + %macro PROCESS_SAD_X3_32x4_AVX512 0 movum6, [r0] @@ -2700,6 +2748,118 @@ ;-- +; void pixel_sad_x3_16x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res ) +;-- +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_sad_x3_16x8, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 + +vbroadcasti32x8 m7, [pw_1] + +add r4d, r4d +lea r6d, [r4 * 3] + +PROCESS_SAD_X3_16x4_AVX512 +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] +PROCESS_SAD_X3_16x4_AVX512 +PROCESS_SAD_X3_END_AVX512 +RET + +INIT_ZMM avx512 +cglobal pixel_sad_x3_16x12, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 + +vbroadcasti32x8 m7, [pw_1] + +add r4d, r4d +lea r6d, [r4 * 3] +%rep 2 +PROCESS_SAD_X3_16x4_AVX512 +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] +%endrep +PROCESS_SAD_X3_16x4_AVX512 +PROCESS_SAD_X3_END_AVX512 +RET + +INIT_ZMM avx512 +cglobal pixel_sad_x3_16x16, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 + +vbroadcasti32x8 m7, [pw_1] + +add r4d, r4d +lea r6d, [r4 * 3] + +%rep 3 +PROCESS_SAD_X3_16x4_AVX512 +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] +%endrep +PROCESS_SAD_X3_16x4_AVX512 +PROCESS_SAD_X3_END_AVX512 +RET + +INIT_ZMM avx512
[x265] [PATCH 221 of 307] x86: AVX512 interp_8tap_vert_sp_64xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1511953084 -19800 # Wed Nov 29 16:28:04 2017 +0530 # Node ID 834a8f52a976a6c5da294267392bcd6da1aa6d6e # Parent 9f2c4a0d09f3405f9c28cd3ebf229617c2278681 x86: AVX512 interp_8tap_vert_sp_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 12.23x | 21.04x 64x32 | 12.24x | 22.10x 64x48 | 12.28x | 22.19x 64x64 | 12.26x | 22.23x diff -r 9f2c4a0d09f3 -r 834a8f52a976 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Nov 29 15:08:25 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Nov 29 16:28:04 2017 +0530 @@ -2886,7 +2886,10 @@ p.pu[LUMA_32x24].luma_vsp = PFX(interp_8tap_vert_sp_32x24_avx512); p.pu[LUMA_32x16].luma_vsp = PFX(interp_8tap_vert_sp_32x16_avx512); p.pu[LUMA_32x8].luma_vsp = PFX(interp_8tap_vert_sp_32x8_avx512); - +p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512); +p.pu[LUMA_64x48].luma_vsp = PFX(interp_8tap_vert_sp_64x48_avx512); +p.pu[LUMA_64x32].luma_vsp = PFX(interp_8tap_vert_sp_64x32_avx512); +p.pu[LUMA_64x16].luma_vsp = PFX(interp_8tap_vert_sp_64x16_avx512); p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); p.cu[BLOCK_8x8].idct = PFX(idct8_avx512); p.cu[BLOCK_16x16].idct = PFX(idct16_avx512); diff -r 9f2c4a0d09f3 -r 834a8f52a976 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed Nov 29 15:08:25 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Nov 29 16:28:04 2017 +0530 @@ -13702,86 +13702,8 @@ PROCESS_LUMA_VERT_SS_48x4_AVX512 RET %endif - -%macro PROCESS_LUMA_VERT_SS_64x2_AVX512 0 -movu m1, [r0] ;0 row -movu m3, [r0 + r1] ;1 row -punpcklwdm0, m1, m3 -pmaddwd m0, m15 -punpckhwdm1, m3 -pmaddwd m1, m15 - -movu m4, [r0 + 2 * r1] ;2 row -punpcklwdm2, m3, m4 -pmaddwd m2, m15 -punpckhwdm3, m4 -pmaddwd m3, m15 - -movu m5, [r0 + r7] ;3 row -punpcklwdm6, m4, m5 -pmaddwd m6, m16 -punpckhwdm4, m5 -pmaddwd m4, m16 - -padddm0, m6 -padddm1, m4 - -movu m4, [r0 + 4 * r1] ;4 row -punpcklwdm6, m5, m4 -pmaddwd m6, m16 -punpckhwdm5, m4 
-pmaddwd m5, m16 - -padddm2, m6 -padddm3, m5 - -lea r6, [r0 + 4 * r1] - -movu m11, [r6 + r1] ;5 row -punpcklwdm8, m4, m11 -pmaddwd m8, m17 -punpckhwdm4, m11 -pmaddwd m4, m17 - -movu m12, [r6 + 2 * r1] ;6 row -punpcklwdm10, m11,m12 -pmaddwd m10, m17 -punpckhwdm11, m12 -pmaddwd m11, m17 - -movu m13, [r6 + r7] ;7 row -punpcklwdm14, m12,m13 -pmaddwd m14, m18 -punpckhwdm12, m13 -pmaddwd m12, m18 - -padddm8, m14 -padddm4, m12 -padddm0, m8 -padddm1, m4 - -movu m12, [r6 + 4 * r1] ; 8 row -punpcklwdm14, m13,m12 -pmaddwd m14, m18 -punpckhwdm13, m12 -pmaddwd m13, m18 - -padddm10, m14 -padddm11, m13 -
[x265] [PATCH 238 of 307] x86: AVX512 interp_8tap_vert_pp_16xN and interp_8tap_vert_ps_16xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1512465067 -19800 # Tue Dec 05 14:41:07 2017 +0530 # Node ID f92128e41ac3c1da210c1c665d97061539821aaf # Parent ca6bb5919227672e0cf98b785acf099531c32945 x86: AVX512 interp_8tap_vert_pp_16xN and interp_8tap_vert_ps_16xN for high bit depth luma_vpp Size | AVX2 performance | AVX512 performance -- 16x4 | 8.32x | 13.14x 16x8 | 10.69x | 15.14x 16x12 | 11.62x | 15.94x 16x16 | 12.19x | 15.97x 16x32 | 12.24x | 16.59x 16x64 | 12.57x | 16.50x luma_vps Size | AVX2 performance | AVX512 performance -- 16x4 | 8.04x | 15.37x 16x8 | 9.72x | 14.97x 16x12 | 10.47x | 14.71x 16x16 | 9.79x | 15.21x 16x32 | 9.66x | 15.60x 16x64 | 11.16x | 15.67x diff -r ca6bb5919227 -r f92128e41ac3 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 05 13:28:42 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 05 14:41:07 2017 +0530 @@ -2882,6 +2882,12 @@ p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512); p.pu[LUMA_48x64].luma_vsp = PFX(interp_8tap_vert_sp_48x64_avx512); +p.pu[LUMA_16x4].luma_vpp = PFX(interp_8tap_vert_pp_16x4_avx512); +p.pu[LUMA_16x8].luma_vpp = PFX(interp_8tap_vert_pp_16x8_avx512); +p.pu[LUMA_16x12].luma_vpp = PFX(interp_8tap_vert_pp_16x12_avx512); +p.pu[LUMA_16x16].luma_vpp = PFX(interp_8tap_vert_pp_16x16_avx512); +p.pu[LUMA_16x32].luma_vpp = PFX(interp_8tap_vert_pp_16x32_avx512); +p.pu[LUMA_16x64].luma_vpp = PFX(interp_8tap_vert_pp_16x64_avx512); p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512); p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512); p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512); @@ -2892,6 +2898,12 @@ p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512); p.pu[LUMA_64x64].luma_vpp = PFX(interp_8tap_vert_pp_64x64_avx512); +p.pu[LUMA_16x4].luma_vps = PFX(interp_8tap_vert_ps_16x4_avx512); +p.pu[LUMA_16x8].luma_vps = PFX(interp_8tap_vert_ps_16x8_avx512); +p.pu[LUMA_16x12].luma_vps = 
PFX(interp_8tap_vert_ps_16x12_avx512); +p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512); +p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512); +p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512); p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512); p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512); p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512); diff -r ca6bb5919227 -r f92128e41ac3 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Dec 05 13:28:42 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Dec 05 14:41:07 2017 +0530 @@ -12930,6 +12930,169 @@ ;- ;avx512 luma_vpp and luma_vps code start ;- +%macro PROCESS_LUMA_VERT_P_16x4_AVX512 1 +lea r6, [r0 + 4 * r1] +movu ym1, [r0] +movu ym3, [r0 + r1] +vinserti32x8 m1, [r0 + 2 * r1], 1 +vinserti32x8 m3, [r0 + r7], 1 +punpcklwdm0, m1, m3 +pmaddwd m0, m15 +punpckhwdm1, m3 +pmaddwd m1, m15 + +movu ym4, [r0 + 2 * r1] +vinserti32x8 m4, [r0 + 4 * r1], 1 +punpcklwdm2, m3, m4 +pmaddwd m2, m15 +punpckhwdm3, m4 +pmaddwd m3, m15 + +movu ym5, [r0 + r7] +vinserti32x8 m5, [r6 + r1], 1 +punpcklwdm6, m4, m5 +pmaddwd m6, m16 +punpckhwdm4, m5 +pmaddwd m4, m16 + +padddm0, m6 +padddm1, m4 + +movu ym4, [r6] +vinserti32x8 m4, [r6 + 2 * r1],
[x265] [PATCH 240 of 307] x86: AVX512 interp_8tap_vert_pp_24xN and interp_8tap_vert_ps_24xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1512537795 -19800 # Wed Dec 06 10:53:15 2017 +0530 # Node ID 2d298099a8d6b266a32b975de4b6a369988d3887 # Parent 8b1c9d9c5bd8135dc11b6d031b990bfe47e3bcd8 x86: AVX512 interp_8tap_vert_pp_24xN and interp_8tap_vert_ps_24xN for high bit depth luma_vpp AVX2 performance : 11.91x AVX512 performance : 15.77x luma_vps AVX2 performance : 10.36x AVX512 performance : 14.20x diff -r 8b1c9d9c5bd8 -r 2d298099a8d6 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 05 17:30:30 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Dec 06 10:53:15 2017 +0530 @@ -2888,6 +2888,7 @@ p.pu[LUMA_16x16].luma_vpp = PFX(interp_8tap_vert_pp_16x16_avx512); p.pu[LUMA_16x32].luma_vpp = PFX(interp_8tap_vert_pp_16x32_avx512); p.pu[LUMA_16x64].luma_vpp = PFX(interp_8tap_vert_pp_16x64_avx512); +p.pu[LUMA_24x32].luma_vpp = PFX(interp_8tap_vert_pp_24x32_avx512); p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512); p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512); p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512); @@ -2905,6 +2906,7 @@ p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512); p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512); p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512); +p.pu[LUMA_24x32].luma_vps = PFX(interp_8tap_vert_ps_24x32_avx512); p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512); p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512); p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512); diff -r 8b1c9d9c5bd8 -r 2d298099a8d6 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Dec 05 17:30:30 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Wed Dec 06 10:53:15 2017 +0530 @@ -12931,7 +12931,7 @@ ;avx512 luma_vpp and luma_vps code start ;- %macro PROCESS_LUMA_VERT_P_16x4_AVX512 1 -lea r6, [r0 + 4 * r1] +lea r5, [r0 + 4 * r1] movu ym1, [r0] movu 
ym3, [r0 + r1] vinserti32x8 m1, [r0 + 2 * r1], 1 @@ -12949,7 +12949,7 @@ pmaddwd m3, m15 movu ym5, [r0 + r7] -vinserti32x8 m5, [r6 + r1], 1 +vinserti32x8 m5, [r5 + r1], 1 punpcklwdm6, m4, m5 pmaddwd m6, m16 punpckhwdm4, m5 @@ -12958,8 +12958,8 @@ padddm0, m6 padddm1, m4 -movu ym4, [r6] -vinserti32x8 m4, [r6 + 2 * r1], 1 +movu ym4, [r5] +vinserti32x8 m4, [r5 + 2 * r1], 1 punpcklwdm6, m5, m4 pmaddwd m6, m16 punpckhwdm5, m4 @@ -12968,22 +12968,22 @@ padddm2, m6 padddm3, m5 -lea r4, [r6 + 4 * r1] -movu ym11,[r6 + r1] -vinserti32x8 m11, [r6 + r7], 1 +lea r4, [r5 + 4 * r1] +movu ym11,[r5 + r1] +vinserti32x8 m11, [r5 + r7], 1 punpcklwdm8, m4, m11 pmaddwd m8, m17 punpckhwdm4, m11 pmaddwd m4, m17 -movu ym12,[r6 + 2 * r1] +movu ym12,[r5 + 2 * r1] vinserti32x8 m12, [r4], 1 punpcklwdm10, m11,m12 pmaddwd m10, m17 punpckhwdm11, m12 pmaddwd m11, m17 -movu ym13,[r6 + r7] +movu ym13,[r5 + r7] vinserti32x8 m13, [r4 + r1], 1 punpcklwdm14, m12,m13 pmaddwd m14,
[x265] [PATCH 234 of 307] x86: AVX512 interp_4tap_vert_pp_48x64
# HG changeset patch # User Vignesh Vijayakumar# Date 1512389309 -19800 # Mon Dec 04 17:38:29 2017 +0530 # Node ID 283aa4d77cef296699167c041763d7115e7a88aa # Parent ae75b2d09d10f28391d573507c13512360593386 x86: AVX512 interp_4tap_vert_pp_48x64 AVX2 performance : 43.04x AVX512 performance : 51.46x diff -r ae75b2d09d10 -r 283aa4d77cef source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Dec 04 15:05:04 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Dec 04 17:38:29 2017 +0530 @@ -4977,6 +4977,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512); diff -r ae75b2d09d10 -r 283aa4d77cef source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Dec 04 15:05:04 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Dec 04 17:38:29 2017 +0530 @@ -11038,6 +11038,125 @@ FILTER_VER_PP_CHROMA_32xN_AVX512 64 %endif +%macro PROCESS_CHROMA_VERT_PP_48x4_AVX512 0 +movu ym1,[r0] +movu ym3,[r0 + r1] +vinserti32x8 m1, [r0 + 2 * r1], 1 +vinserti32x8 m3, [r0 + r6], 1 +punpcklbw m0, m1, m3 +pmaddubsw m0, m8 +punpckhbw m1, m3 +pmaddubsw m1, m8 + +movu ym4,[r0 + 2 * r1] +vinserti32x8 m4, [r0 + 4 * r1], 1 +punpcklbw m2, m3, m4 +pmaddubsw m2, m8 +punpckhbw m3, m4 +pmaddubsw m3, m8 + +lea r5, [r0 + 4 * r1] + +movu ym5,[r0 + r6] +vinserti32x8 m5, [r5 + r1], 1 +punpcklbw m6, m4, m5 +pmaddubsw m6, m9 +paddw m0, m6 +punpckhbw m4, m5 +pmaddubsw m4, m9 +paddw m1, m4 + 
+movu ym4,[r0 + 4 * r1] +vinserti32x8 m4, [r5 + 2 * r1], 1 +punpcklbw m6, m5, m4 +pmaddubsw m6, m9 +paddw m2, m6 +punpckhbw m5, m4 +pmaddubsw m5, m9 +paddw m3, m5 + +pmulhrsw m0, m7 +pmulhrsw m1, m7 +pmulhrsw m2, m7 +pmulhrsw m3, m7 + +packuswb m0, m1 +packuswb m2, m3 +movu [r2], ym0 +movu [r2 + r3], ym2 +vextracti32x8 [r2 + 2 * r3], m0, 1 +vextracti32x8 [r2 + r7], m2, 1 + +movu xm1,[r0 + mmsize/2] +movu xm3,[r0 + r1 + mmsize/2] +vinserti32x4 m1, [r0 + r1 + mmsize/2], 1 +vinserti32x4 m3, [r0 + 2 * r1 + mmsize/2], 1 +vinserti32x4 m1, [r0 + 2 * r1 + mmsize/2], 2 +vinserti32x4 m3, [r0 + r6 + mmsize/2], 2 +vinserti32x4 m1, [r0 + r6 + mmsize/2], 3 +vinserti32x4 m3, [r0 + 4 * r1 + mmsize/2], 3 + +punpcklbw m0, m1, m3 +pmaddubsw m0, m8 +punpckhbw m1, m3 +pmaddubsw m1, m8 + +movu xm4,[r0 + 2 * r1 + mmsize/2] +movu xm5,[r0 + r6 + mmsize/2] +vinserti32x4 m4,
[x265] [PATCH 244 of 307] [x265-avx512]x86: AVX512 sad_x4_16xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1512640914 -19800 # Thu Dec 07 15:31:54 2017 +0530 # Node ID 81a870948ac446b36c248325e0c7264cf8f3f09e # Parent 0ffc9c56a0a7361e98e6388e3067e4a78e8cd252 [x265-avx512]x86: AVX512 sad_x4_16xN for high bit depth Size | AVX2 performance | AVX512 performance -- 16x8 | 16.33x | 18.34x 16x12 | 15.79x | 19.91x 16x16 | 15.73x | 18.82x 16x32 | 17.13x | 20.72x 16x64 | 17.72x | 23.04x diff -r 0ffc9c56a0a7 -r 81a870948ac4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Dec 07 14:31:05 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Dec 07 15:31:54 2017 +0530 @@ -2511,6 +2511,11 @@ p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512); +p.pu[LUMA_16x8].sad_x4 = PFX(pixel_sad_x4_16x8_avx512); +p.pu[LUMA_16x12].sad_x4 = PFX(pixel_sad_x4_16x12_avx512); +p.pu[LUMA_16x16].sad_x4 = PFX(pixel_sad_x4_16x16_avx512); +p.pu[LUMA_16x32].sad_x4 = PFX(pixel_sad_x4_16x32_avx512); +p.pu[LUMA_16x64].sad_x4 = PFX(pixel_sad_x4_16x64_avx512); p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); diff -r 0ffc9c56a0a7 -r 81a870948ac4 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Dec 07 14:31:05 2017 +0530 +++ b/source/common/x86/sad16-a.asm Thu Dec 07 15:31:54 2017 +0530 @@ -2124,6 +2124,67 @@ ; SAD x3/x4 avx512 code start ; +%macro PROCESS_SAD_X4_16x4_AVX512 0 +movuym8, [r0] +vinserti64x4 m8, [r0 + 2 * FENC_STRIDE], 1 +movuym4, [r1] +vinserti64x4 m4, [r1 + r5], 1 +movuym5, [r2] +vinserti64x4 m5, [r2 + r5], 1 +movuym6, [r3] +vinserti64x4 m6, [r3 + r5], 1 +movuym7, [r4] +vinserti64x4 m7, [r4 + r5], 1 + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd 
m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movuym8, [r0 + 4 * FENC_STRIDE] +vinserti64x4 m8, [r0 + 6 * FENC_STRIDE], 1 +movuym4, [r1 + 2 * r5] +vinserti64x4 m4, [r1 + r7], 1 +movuym5, [r2 + 2 * r5] +vinserti64x4 m5, [r2 + r7], 1 +movuym6, [r3 + 2 * r5] +vinserti64x4 m6, [r3 + r7], 1 +movuym7, [r4 + 2 * r5] +vinserti64x4 m7, [r4 + r7], 1 + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 +%endmacro + %macro PROCESS_SAD_X4_32x4_AVX512 0 movum8, [r0] movum4, [r1] @@ -3467,6 +3528,130 @@ PROCESS_SAD_X3_END_AVX512 RET %endif + +; +; void pixel_sad_x4_16x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res ) +; +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_sad_x4_16x8, 6,8,10 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +pxorm3, m3 + +vbroadcasti32x8 m9, [pw_1] + +add r5d, r5d +lea r7d, [r5 * 3] + +PROCESS_SAD_X4_16x4_AVX512 +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r5 * 4] +lea r2, [r2 + r5 * 4] +lea r3, [r3 + r5 * 4] +lea r4, [r4 + r5 * 4] +PROCESS_SAD_X4_16x4_AVX512 +PROCESS_SAD_X4_END_AVX512 +RET + +INIT_ZMM avx512 +cglobal pixel_sad_x4_16x12, 6,8,10 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +pxorm3, m3 + +vbroadcasti32x8 m9, [pw_1] + +add r5d, r5d +lea r7d, [r5 * 3] + +%rep 2 +PROCESS_SAD_X4_16x4_AVX512 +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r5 * 4] +lea r2, [r2 + r5 * 4] +lea r3, [r3 + r5 * 4] +lea r4, [r4 + r5 * 4] +
[x265] [PATCH 237 of 307] x86: AVX512 interp_8tap_vert_pp_64xN and interp_8tap_vert_ps_64xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1512460722 -19800 # Tue Dec 05 13:28:42 2017 +0530 # Node ID ca6bb5919227672e0cf98b785acf099531c32945 # Parent c3a341391f0c777665e191a4cd172f08a5a313f9 x86: AVX512 interp_8tap_vert_pp_64xN and interp_8tap_vert_ps_64xN for high bit depth luma_vpp Size | AVX2 performance | AVX512 performance -- 64x16 | 11.51x | 19.67x 64x32 | 11.51x | 19.42x 64x48 | 11.54x | 19.42x 64x64 | 11.55x | 19.72x luma_vps Size | AVX2 performance | AVX512 performance -- 64x16 |9.92x | 18.23x 64x32 |9.71x | 18.13x 64x48 |9.81x | 18.04x 64x64 |9.86x | 18.14x diff -r c3a341391f0c -r ca6bb5919227 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Nov 24 16:44:56 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 05 13:28:42 2017 +0530 @@ -2887,12 +2887,20 @@ p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512); p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512); p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512); +p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512); +p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512); +p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512); +p.pu[LUMA_64x64].luma_vpp = PFX(interp_8tap_vert_pp_64x64_avx512); p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512); p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512); p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512); p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512); p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512); +p.pu[LUMA_64x16].luma_vps = PFX(interp_8tap_vert_ps_64x16_avx512); +p.pu[LUMA_64x32].luma_vps = PFX(interp_8tap_vert_ps_64x32_avx512); +p.pu[LUMA_64x48].luma_vps = PFX(interp_8tap_vert_ps_64x48_avx512); +p.pu[LUMA_64x64].luma_vps = PFX(interp_8tap_vert_ps_64x64_avx512); p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); /* TODO: Currently these kernels performance 
are similar to AVX2 version, we need a to improve them further to ebable diff -r c3a341391f0c -r ca6bb5919227 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Fri Nov 24 16:44:56 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Dec 05 13:28:42 2017 +0530 @@ -13078,6 +13078,152 @@ FILTER_VER_P_LUMA_32xN_AVX512 pp, 24 FILTER_VER_P_LUMA_32xN_AVX512 pp, 64 %endif + +%macro PROCESS_LUMA_VERT_P_64x2_AVX512 1 +PROCESS_LUMA_VERT_P_32x2_AVX512 %1 +movu m1, [r0 + mmsize] +movu m3, [r0 + r1 + mmsize] +punpcklwdm0, m1, m3 +pmaddwd m0, m15 +punpckhwdm1, m3 +pmaddwd m1, m15 + +movu m4, [r0 + 2 * r1 + mmsize] +punpcklwdm2, m3, m4 +pmaddwd m2, m15 +punpckhwdm3, m4 +pmaddwd m3, m15 + +movu m5, [r0 + r7 + mmsize] +punpcklwdm6, m4, m5 +pmaddwd m6, m16 +punpckhwdm4, m5 +pmaddwd m4, m16 + +padddm0, m6 +padddm1, m4 + +movu m4, [r0 + 4 * r1 + mmsize] +punpcklwdm6, m5, m4 +pmaddwd m6, m16 +punpckhwdm5, m4 +pmaddwd m5, m16 + +padddm2, m6 +padddm3, m5 + +movu m11, [r6 + r1 + mmsize] +punpcklwdm8, m4, m11 +pmaddwd m8, m17 +punpckhwdm4, m11 +pmaddwd m4, m17 + +movu m12, [r6 + 2 * r1 + mmsize] +punpcklwdm10, m11,m12 +pmaddwd m10, m17 +punpckhwdm11, m12 +pmaddwd m11, m17 + +movu
[x265] [PATCH 236 of 307] x86: AVX512 interp_8tap_vert_pp_32xN and interp_8tap_vert_ps_32xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1511522096 -19800 # Fri Nov 24 16:44:56 2017 +0530 # Node ID c3a341391f0c777665e191a4cd172f08a5a313f9 # Parent 1cd123613bbb28fd00da36a3cfe3765f8e07d00e x86: AVX512 interp_8tap_vert_pp_32xN and interp_8tap_vert_ps_32xN for high bit depth luma_vpp Size | AVX2 performance | AVX512 performance -- 32x8 | 10.54x | 18.96x 32x16 | 11.70x | 20.71x 32x24 | 11.34x | 20.47x 32x32 | 11.76x | 19.45x 32x64 | 11.87x | 21.04x luma_vps Size | AVX2 performance | AVX512 performance -- 32x8 |9.01x | 17.10x 32x16 | 10.15x | 18.05x 32x24 |9.78x | 17.90x 32x32 | 10.19x | 17.79x 32x64 | 10.14x | 18.50x diff -r 1cd123613bbb -r c3a341391f0c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 27 16:45:08 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Nov 24 16:44:56 2017 +0530 @@ -2882,6 +2882,18 @@ p.pu[LUMA_64x64].luma_vsp = PFX(interp_8tap_vert_sp_64x64_avx512); p.pu[LUMA_48x64].luma_vsp = PFX(interp_8tap_vert_sp_48x64_avx512); +p.pu[LUMA_32x8].luma_vpp = PFX(interp_8tap_vert_pp_32x8_avx512); +p.pu[LUMA_32x16].luma_vpp = PFX(interp_8tap_vert_pp_32x16_avx512); +p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512); +p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512); +p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512); + +p.pu[LUMA_32x8].luma_vps = PFX(interp_8tap_vert_ps_32x8_avx512); +p.pu[LUMA_32x16].luma_vps = PFX(interp_8tap_vert_ps_32x16_avx512); +p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512); +p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512); +p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512); + p.cu[BLOCK_8x8].dct = PFX(dct8_avx512); /* TODO: Currently these kernels performance are similar to AVX2 version, we need a to improve them further to ebable * it. Probably a Vtune analysis will help here. 
diff -r 1cd123613bbb -r c3a341391f0c source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Mon Nov 27 16:45:08 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Fri Nov 24 16:44:56 2017 +0530 @@ -12928,5 +12928,159 @@ ;avx512 luma_vss and luma_vsp code end ;- ;- +;avx512 luma_vpp and luma_vps code start +;- +%macro PROCESS_LUMA_VERT_P_32x2_AVX512 1 +movu m1, [r0] ;0 row +movu m3, [r0 + r1] ;1 row +punpcklwdm0, m1, m3 +pmaddwd m0, m15 +punpckhwdm1, m3 +pmaddwd m1, m15 + +movu m4, [r0 + 2 * r1] ;2 row +punpcklwdm2, m3, m4 +pmaddwd m2, m15 +punpckhwdm3, m4 +pmaddwd m3, m15 + +movu m5, [r0 + r7] ;3 row +punpcklwdm6, m4, m5 +pmaddwd m6, m16 +punpckhwdm4, m5 +pmaddwd m4, m16 + +padddm0, m6 +padddm1, m4 + +movu m4, [r0 + 4 * r1] ;4 row +punpcklwdm6, m5, m4 +pmaddwd m6, m16 +punpckhwdm5, m4 +pmaddwd m5, m16 + +padddm2, m6 +padddm3, m5 + +lea r6, [r0 + 4 * r1] + +movu m11, [r6 + r1] ;5 row +punpcklwdm8, m4, m11 +pmaddwd m8, m17 +punpckhwdm4, m11 +pmaddwd m4, m17 + +movu
[x265] [PATCH 231 of 307] x86: AVX512 ssd_ss_16x16
# HG changeset patch # User Vignesh Vijayakumar# Date 1512371012 -19800 # Mon Dec 04 12:33:32 2017 +0530 # Node ID 465682e66d91ecf207feae78c33e32f0eaaf45c4 # Parent 4f690222337dbc1757665729ea15f2380a11c329 x86: AVX512 ssd_ss_16x16 AVX2 performance : 43.55x AVX512 performance : 48.11x This patch also cleanup already existing ssd_ss AVX512 code diff -r 4f690222337d -r 465682e66d91 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Dec 01 10:30:38 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Dec 04 12:33:32 2017 +0530 @@ -4743,6 +4743,7 @@ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); +p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx512); p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx512); diff -r 4f690222337d -r 465682e66d91 source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Fri Dec 01 10:30:38 2017 +0530 +++ b/source/common/x86/ssd-a.asm Mon Dec 04 12:33:32 2017 +0530 @@ -1390,183 +1390,120 @@ ;- ; ssd_ss avx512 code start ;- -%macro PROCESS_SSD_SS_64x8_AVX512 0 +%if ARCH_X86_64 +%macro PROCESS_SSD_SS_64x4_AVX512 0 movum0, [r0] movum1, [r0 + mmsize] movum2, [r0 + r1] movum3, [r0 + r1 + mmsize] - -psubw m0, [r2] -psubw m1, [r2 + mmsize] -psubw m2, [r2 + r3] -psubw m3, [r2 + r3 + mmsize] +movum4, [r2] +movum5, [r2 + mmsize] +movum6, [r2 + r3] +movum7, [r2 + r3 + mmsize] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 -paddd m4, m0 -paddd m5, m1 -paddd m4, m2 -paddd m5, m3 +paddd m8, m0 +paddd m8, m1 +paddd m8, m2 +paddd m8, m3 movum0, [r0 + 2 * r1] movum1, [r0 + 2 * r1 + mmsize] movum2, [r0 + r5] movum3, [r0 + r5 + mmsize] - -psubw m0, [r2 + 2 * r3] -psubw m1, [r2 + 2 * r3 + mmsize] -psubw m2, [r2 + r6] -psubw m3, [r2 + r6 + mmsize] +movum4, [r2 + 2 * r3] +movum5, 
[r2 + 2 * r3 + mmsize] +movum6, [r2 + r6] +movum7, [r2 + r6 + mmsize] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 -paddd m4, m0 -paddd m5, m1 -paddd m4, m2 -paddd m5, m3 - -lea r0, [r0 + 4 * r1] -lea r2, [r2 + 4 * r3] - +paddd m8, m0 +paddd m8, m1 +paddd m8, m2 +paddd m8, m3 +%endmacro + +%macro PROCESS_SSD_SS_32x4_AVX512 0 movum0, [r0] -movum1, [r0 + mmsize] -movum2, [r0 + r1] -movum3, [r0 + r1 + mmsize] - -psubw m0, [r2] -psubw m1, [r2 + mmsize] -psubw m2, [r2 + r3] -psubw m3, [r2 + r3 + mmsize] +movum1, [r0 + r1] +movum2, [r0 + 2 * r1] +movum3, [r0 + r5] +movum4, [r2] +movum5, [r2 + r3] +movum6, [r2 + 2 * r3] +movum7, [r2 + r6] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 pmaddwd m0, m0 pmaddwd m1, m1 pmaddwd m2, m2 pmaddwd m3, m3 -paddd m4, m0 -paddd m5, m1 -paddd m4, m2 -paddd m5, m3 - -movum0, [r0 + 2 * r1] -movum1, [r0 + 2 * r1 + mmsize] -movum2, [r0 + r5] -movum3, [r0 + r5 + mmsize] - -psubw m0, [r2 + 2 * r3] -psubw m1, [r2 + 2 * r3 + mmsize] -psubw m2, [r2 + r6] -psubw m3, [r2 + r6 + mmsize] +paddd m8, m0 +paddd m8, m1 +paddd m8, m2 +paddd m8, m3 +%endmacro + +%macro PROCESS_SSD_SS_16x4_AVX512 0 +movu ym0, [r0] +vinserti32x8m0, [r0 + r1],1 +movu ym1, [r0 + 2 * r1] +vinserti32x8m1, [r0 + r5],1 +movu ym4, [r2] +vinserti32x8m4, [r2 + r3],1 +movu ym5, [r2 + 2 * r3] +vinserti32x8m5, [r2 + r6],1 + +psubw m0, m4 +psubw m1, m5 pmaddwd m0, m0 pmaddwd m1, m1 -pmaddwd m2, m2 -pmaddwd m3, m3 -paddd m4, m0 -paddd
[x265] [PATCH 235 of 307] x86: AVX512 interp_4tap_vert_ps_64xN
# HG changeset patch # User Jayashri Murugan# Date 1511781308 -19800 # Mon Nov 27 16:45:08 2017 +0530 # Node ID 1cd123613bbb28fd00da36a3cfe3765f8e07d00e # Parent 283aa4d77cef296699167c041763d7115e7a88aa x86: AVX512 interp_4tap_vert_ps_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 39.17x | 64.63x 64x32 | 40.14x | 64.98x 64x48 | 39.97x | 64.52x 64x64 | 40.32x | 64.93x diff -r 283aa4d77cef -r 1cd123613bbb source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Dec 04 17:38:29 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Nov 27 16:45:08 2017 +0530 @@ -5087,6 +5087,11 @@ p.quant = PFX(quant_avx512); p.nquant = PFX(nquant_avx512); p.denoiseDct = PFX(denoise_dct_avx512); + +p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512); } #endif } diff -r 283aa4d77cef -r 1cd123613bbb source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Dec 04 17:38:29 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Mon Nov 27 16:45:08 2017 +0530 @@ -243,10 +243,13 @@ const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 ALIGN 64 +interp4_vps_store1_avx512: dq 0, 1, 8, 9, 2, 3, 10, 11 +interp4_vps_store2_avx512: dq 4, 5, 12, 13, 6, 7, 14, 15 const interp4_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7 const interp4_hps_store_16xN_avx512, dq 0, 2, 1, 3, 4, 6, 5, 7 const interp8_hps_store_avx512, dq 0, 1, 4, 5, 2, 3, 6, 7 const interp8_vsp_store_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7 + SECTION .text cextern pb_128 cextern pw_1 @@ -10864,7 +10867,7 @@ %endif ;- -;avx512 chroma_vpp code start +;avx512 chroma_vpp and chroma_vps code start ;- %macro PROCESS_CHROMA_VERT_PP_16x4_AVX512 0 
lea r5, [r0 + 4 * r1] @@ -11157,7 +11160,7 @@ RET %endif -%macro PROCESS_CHROMA_VERT_PP_64x4_AVX512 0 +%macro PROCESS_CHROMA_VERT_64x4_AVX512 1 movu m0, [r0]; m0 = row 0 movu m1, [r0 + r1] ; m1 = row 1 punpcklbw m2, m0,m1 @@ -11179,10 +11182,21 @@ paddw m2, m8 paddw m3, m9 +%ifidn %1,pp pmulhrsw m2, m12 pmulhrsw m3, m12 packuswb m2, m3 movu [r2],m2 +%else +psubw m2, m12 +psubw m3, m12 +movu m8, m13 +movu m9, m14 +vpermi2q m8, m2, m3 +vpermi2q m9, m2, m3 +movu [r2], m8 +movu [r2 + mmsize], m9 +%endif lea r0, [r0 + r1 * 4] movu m0, [r0]; m0 = row 4 @@ -11194,10 +11208,22 @@ pmaddubsw m3, m10 paddw m4, m8 paddw m5, m9 + +%ifidn %1,pp pmulhrsw m4, m12 pmulhrsw m5, m12 packuswb m4, m5 movu [r2 + r3], m4 +%else +psubw m4, m12 +psubw m5, m12 +movu m8, m13 +movu m9, m14 +vpermi2q m8, m4, m5 +vpermi2q m9, m4, m5 +movu [r2 + r3], m8 +movu [r2 + r3 + mmsize], m9 +%endif movu m1, [r0 + r1] ; m1 = row 5 punpcklbw m4, m0,m1 @@ -11207,11 +11233,21 @@ paddw m6, m4 paddw m7, m5 +%ifidn %1,pp pmulhrsw m6, m12 pmulhrsw m7, m12 packuswb m6, m7 movu [r2 + r3 * 2], m6 - +%else +psubw m6, m12 +psubw m7, m12 +movu m8, m13 +movu m9, m14 +vpermi2q m8, m6, m7 +vpermi2q m9, m6, m7 +movu [r2 + 2 * r3], m8 +movu [r2 + 2 * r3 + mmsize], m9 +%endif
[x265] [PATCH 242 of 307] [x265-avx512]x86: AVX512 sad_16x32 and sad_16x64 for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1512625055 -19800 # Thu Dec 07 11:07:35 2017 +0530 # Node ID 931dd781dc0c6de76bb31d0215db7a7af885f9bf # Parent 9bd38bd06850914d1cbf617063ea0e1e60f66219 [x265-avx512]x86: AVX512 sad_16x32 and sad_16x64 for high bit depth Size | AVX2 performance | AVX512 performance -- 16x32 | 15.49x | 16.89x 16x64 | 16.46x | 17.84x diff -r 9bd38bd06850 -r 931dd781dc0c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Dec 07 10:25:21 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Dec 07 11:07:35 2017 +0530 @@ -2434,6 +2434,8 @@ p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); +p.pu[LUMA_16x32].sad = PFX(pixel_sad_16x32_avx512); +p.pu[LUMA_16x64].sad = PFX(pixel_sad_16x64_avx512); p.pu[LUMA_32x8].sad = PFX(pixel_sad_32x8_avx512); p.pu[LUMA_32x16].sad = PFX(pixel_sad_32x16_avx512); p.pu[LUMA_32x24].sad = PFX(pixel_sad_32x24_avx512); diff -r 9bd38bd06850 -r 931dd781dc0c source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Dec 07 10:25:21 2017 +0530 +++ b/source/common/x86/sad16-a.asm Thu Dec 07 11:07:35 2017 +0530 @@ -1277,6 +1277,46 @@ paddd m0, m1 %endmacro +%macro PROCESS_SAD_16x8_AVX512 0 +movuym1, [r2] +vinserti64x4 m1, [r2 + r3], 1 +movuym2, [r2 + 2 * r3] +vinserti64x4 m2, [r2 + r5], 1 +movuym3, [r0] +vinserti64x4 m3, [r0 + r1], 1 +movuym4, [r0 + 2 * r1] +vinserti64x4 m4, [r0 + r4], 1 + +psubw m1, m3 +psubw m2, m4 +pabsw m1, m1 +pabsw m2, m2 +paddw m5, m1, m2 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movuym1, [r2] +vinserti64x4 m1, [r2 + r3], 1 +movuym2, [r2 + 2 * r3] +vinserti64x4 m2, [r2 + r5], 1 +movuym3, [r0] +vinserti64x4 m3, [r0 + r1], 1 +movuym4, [r0 + 2 * r1] +vinserti64x4 m4, [r0 + r4], 1 + +psubw m1, m3 +psubw m2, m4 +pabsw m1, m1 +pabsw m2, m2 +paddw m1, m2 + +pmaddwd m5, m6 +paddd m0, m5 +pmaddwd m1, m6 +paddd m0, m1 +%endmacro + %macro PROCESS_SAD_AVX512_END 0 vextracti32x8 ym1, m0, 1 paddd ym0, ym1 @@ -1523,6 +1563,51 @@ %endif ;- +; 
int pixel_sad_16x%1( uint16_t *, intptr_t, uint16_t *, intptr_t ) +;- +%if ARCH_X86_64 +INIT_ZMM avx512 +cglobal pixel_sad_16x32, 4,6,7 +pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +%rep 3 +PROCESS_SAD_16x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +%endrep +PROCESS_SAD_16x8_AVX512 +PROCESS_SAD_AVX512_END +RET + +INIT_ZMM avx512 +cglobal pixel_sad_16x64, 4,6,7 + pxorm0, m0 + +vbroadcasti32x8 m6, [pw_1] + +add r3d, r3d +add r1d, r1d +lea r4d, [r1 * 3] +lea r5d, [r3 * 3] + +%rep 7 +PROCESS_SAD_16x8_AVX512 +lear2, [r2 + 4 * r3] +lear0, [r0 + 4 * r1] +%endrep +PROCESS_SAD_16x8_AVX512 +PROCESS_SAD_AVX512_END +RET +%endif + +;- ; int pixel_sad_48x64( uint16_t *, intptr_t, uint16_t *, intptr_t ) ;- %if ARCH_X86_64 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 225 of 307] x86: AVX512 interp_4tap_vert_sp_32xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1512021637 -19800 # Thu Nov 30 11:30:37 2017 +0530 # Node ID 6137bed68dac85ab475b8be28fdba7f5787ac551 # Parent a78e09e144582bd52c52d3475aa1922fc2ae8893 x86: AVX512 interp_4tap_vert_sp_32xN i444 Size | AVX2 performance | AVX512 performance -- 32x8 | 10.49x | 24.74x 32x16 | 11.57x | 25.34x 32x24 | 11.59x | 25.55x 32x32 | 12.01x | 25.72x 32x64 | 12.29x | 26.02x diff -r a78e09e14458 -r 6137bed68dac source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Nov 30 10:35:20 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Nov 30 11:30:37 2017 +0530 @@ -4906,6 +4906,10 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = PFX(interp_4tap_vert_ss_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vss = PFX(interp_4tap_vert_ss_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); @@ -4931,6 +4935,10 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vss = PFX(interp_4tap_vert_ss_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vss = PFX(interp_4tap_vert_ss_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vss = PFX(interp_4tap_vert_ss_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512); 
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vsp = PFX(interp_4tap_vert_sp_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); @@ -4966,6 +4974,11 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vss = PFX(interp_4tap_vert_ss_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vss = PFX(interp_4tap_vert_ss_64x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vss = PFX(interp_4tap_vert_ss_48x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vsp = PFX(interp_4tap_vert_sp_32x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vsp = PFX(interp_4tap_vert_sp_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vsp = PFX(interp_4tap_vert_sp_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vsp = PFX(interp_4tap_vert_sp_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vsp = PFX(interp_4tap_vert_sp_32x64_avx512); p.pu[LUMA_8x8].luma_vss = PFX(interp_8tap_vert_ss_8x8_avx512); p.pu[LUMA_8x16].luma_vss = PFX(interp_8tap_vert_ss_8x16_avx512); diff -r a78e09e14458 -r 6137bed68dac source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Thu Nov 30 10:35:20 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Nov 30 11:30:37 2017 +0530 @@ -11614,120 +11614,109 @@ FILTER_VER_SS_CHROMA_24xN_AVX512 32 FILTER_VER_SS_CHROMA_24xN_AVX512 64 %endif - -%macro PROCESS_CHROMA_VERT_SS_32x4_AVX512 0 +%macro PROCESS_CHROMA_VERT_S_32x2_AVX512 1 movu m1, [r0] -lea r6, [r0 + 2 * r1] -movu m10,[r6] movu m3, [r0 + r1] -movu m12,[r0 + r4] punpcklwd m0, m1, m3 -punpcklwd m9, m10, m12 -pmaddwd m0, m16 -pmaddwd m9, m16 +pmaddwd m0, m7 punpckhwd m1, m3 -punpckhwd m10,m12 -pmaddwd m1, m16 
-pmaddwd m10,m16 +pmaddwd m1, m7
[x265] [PATCH 224 of 307] x86: AVX512 interp_8tap_vert_sp_16xN
# HG changeset patch # User Vignesh Vijayakumar# Date 1512018320 -19800 # Thu Nov 30 10:35:20 2017 +0530 # Node ID a78e09e144582bd52c52d3475aa1922fc2ae8893 # Parent 3e14c3f607d0f9ec6dd3735d21fc2e698217fe71 x86: AVX512 interp_8tap_vert_sp_16xN Size | AVX2 performance | AVX512 performance -- 16x4 | 9.68x | 16.45x 16x8 | 11.69x | 16.93x 16x16 | 13.26x | 18.58x 16x32 | 12.96x | 19.23x 16x64 | 13.12x | 16.84x diff -r 3e14c3f607d0 -r a78e09e14458 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Nov 30 16:00:14 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Nov 30 10:35:20 2017 +0530 @@ -5002,7 +5002,12 @@ p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512); p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512); p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512); - +p.pu[LUMA_16x4].luma_vsp = PFX(interp_8tap_vert_sp_16x4_avx512); +p.pu[LUMA_16x8].luma_vsp = PFX(interp_8tap_vert_sp_16x8_avx512); +p.pu[LUMA_16x12].luma_vsp = PFX(interp_8tap_vert_sp_16x12_avx512); +p.pu[LUMA_16x16].luma_vsp = PFX(interp_8tap_vert_sp_16x16_avx512); +p.pu[LUMA_16x32].luma_vsp = PFX(interp_8tap_vert_sp_16x32_avx512); +p.pu[LUMA_16x64].luma_vsp = PFX(interp_8tap_vert_sp_16x64_avx512); p.pu[LUMA_32x64].luma_vsp = PFX(interp_8tap_vert_sp_32x64_avx512); p.pu[LUMA_32x32].luma_vsp = PFX(interp_8tap_vert_sp_32x32_avx512); p.pu[LUMA_32x24].luma_vsp = PFX(interp_8tap_vert_sp_32x24_avx512); diff -r 3e14c3f607d0 -r a78e09e14458 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Thu Nov 30 16:00:14 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Nov 30 10:35:20 2017 +0530 @@ -12985,8 +12985,7 @@ FILTER_VER_SS_LUMA_8xN_AVX512 16 FILTER_VER_SS_LUMA_8xN_AVX512 32 %endif - -%macro PROCESS_LUMA_VERT_SS_16x4_AVX512 0 +%macro PROCESS_LUMA_VERT_S_16x4_AVX512 1 movu ym1, [r0] movu ym3, [r0 + r1] vinserti32x8 m1, [r0 + 2 * r1], 1 @@ -13062,7 +13061,26 @@ padddm11, m13 padddm2, m10 padddm3, m11 - +%ifidn %1, 
sp +padddm0, m19 +padddm1, m19 +padddm2, m19 +padddm3, m19 + +psradm0, 12 +psradm1, 12 +psradm2, 12 +psradm3, 12 + +packssdw m0, m1 +packssdw m2, m3 +packuswb m0, m2 +vpermq m0, m20, m0 +movu [r2],xm0 +vextracti32x4[r2 + r3], m0,2 +vextracti32x4[r2 + 2 * r3], m0,1 +vextracti32x4[r2 + r5], m0,3 +%else psradm0, 6 psradm1, 6 psradm2, 6 @@ -13075,15 +13093,15 @@ movu [r2 + r3], ym2 vextracti32x8[r2 + 2 * r3], m0,1 vextracti32x8[r2 + r5], m2,1 +%endif %endmacro ;- ; void interp_8tap_vert(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx) ;- -%macro FILTER_VER_SS_LUMA_16xN_AVX512 1 +%macro FILTER_VER_S_LUMA_16xN_AVX512 2 INIT_ZMM avx512 -cglobal interp_8tap_vert_ss_16x%1, 5, 8, 19 +cglobal interp_8tap_vert_%1_16x%2, 5, 8, 21 add r1d,r1d -add r3d,r3d lea r7, [3 * r1] sub r0, r7 shl r4d,8 @@ -13100,28 +13118,39 @@ mova m17,[r5 + 2 * mmsize] mova m18,[r5 + 3 * mmsize] %endif +%ifidn %1, sp +vbroadcasti32x4 m19,[pd_526336] +mova m20,[interp8_vsp_store_avx512] +%else +add r3d,r3d +%endif lea
[x265] [PATCH 239 of 307] x86: AVX512 interp_8tap_vert_pp_48x64 and interp_8tap_vert_ps_48x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1512475230 -19800 # Tue Dec 05 17:30:30 2017 +0530 # Node ID 8b1c9d9c5bd8135dc11b6d031b990bfe47e3bcd8 # Parent f92128e41ac3c1da210c1c665d97061539821aaf x86: AVX512 interp_8tap_vert_pp_48x64 and interp_8tap_vert_ps_48x64 for high bit depth luma_vpp_48x64 AVX2 performance : 11.60x AVX512 performance : 18.57x luma_vps_48x64 AVX2 performance : 9.97x AVX512 performance : 17.28x diff -r f92128e41ac3 -r 8b1c9d9c5bd8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 05 14:41:07 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 05 17:30:30 2017 +0530 @@ -2893,6 +2893,7 @@ p.pu[LUMA_32x32].luma_vpp = PFX(interp_8tap_vert_pp_32x32_avx512); p.pu[LUMA_32x24].luma_vpp = PFX(interp_8tap_vert_pp_32x24_avx512); p.pu[LUMA_32x64].luma_vpp = PFX(interp_8tap_vert_pp_32x64_avx512); +p.pu[LUMA_48x64].luma_vpp = PFX(interp_8tap_vert_pp_48x64_avx512); p.pu[LUMA_64x16].luma_vpp = PFX(interp_8tap_vert_pp_64x16_avx512); p.pu[LUMA_64x32].luma_vpp = PFX(interp_8tap_vert_pp_64x32_avx512); p.pu[LUMA_64x48].luma_vpp = PFX(interp_8tap_vert_pp_64x48_avx512); @@ -2909,6 +2910,7 @@ p.pu[LUMA_32x32].luma_vps = PFX(interp_8tap_vert_ps_32x32_avx512); p.pu[LUMA_32x24].luma_vps = PFX(interp_8tap_vert_ps_32x24_avx512); p.pu[LUMA_32x64].luma_vps = PFX(interp_8tap_vert_ps_32x64_avx512); +p.pu[LUMA_48x64].luma_vps = PFX(interp_8tap_vert_ps_48x64_avx512); p.pu[LUMA_64x16].luma_vps = PFX(interp_8tap_vert_ps_64x16_avx512); p.pu[LUMA_64x32].luma_vps = PFX(interp_8tap_vert_ps_64x32_avx512); p.pu[LUMA_64x48].luma_vps = PFX(interp_8tap_vert_ps_64x48_avx512); diff -r f92128e41ac3 -r 8b1c9d9c5bd8 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Dec 05 14:41:07 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Dec 05 17:30:30 2017 +0530 @@ -13242,6 +13242,251 @@ FILTER_VER_P_LUMA_32xN_AVX512 pp, 64 %endif +%macro PROCESS_LUMA_VERT_P_48x4_AVX512 1 +PROCESS_LUMA_VERT_P_32x2_AVX512 %1 +movu 
m1, [r0 + 2 * r1] +movu m3, [r0 + r7] +punpcklwdm0, m1, m3 +pmaddwd m0, m15 +punpckhwdm1, m3 +pmaddwd m1, m15 + +movu m4, [r0 + 4 * r1] +punpcklwdm2, m3, m4 +pmaddwd m2, m15 +punpckhwdm3, m4 +pmaddwd m3, m15 + +movu m5, [r6 + r1] +punpcklwdm6, m4, m5 +pmaddwd m6, m16 +punpckhwdm4, m5 +pmaddwd m4, m16 + +padddm0, m6 +padddm1, m4 + +movu m4, [r6 + 2 * r1] +punpcklwdm6, m5, m4 +pmaddwd m6, m16 +punpckhwdm5, m4 +pmaddwd m5, m16 + +padddm2, m6 +padddm3, m5 + +lea r4, [r6 + 4 * r1] + +movu m11, [r6 + r7] +punpcklwdm8, m4, m11 +pmaddwd m8, m17 +punpckhwdm4, m11 +pmaddwd m4, m17 + +movu m12, [r6 + 4 * r1] +punpcklwdm10, m11,m12 +pmaddwd m10, m17 +punpckhwdm11, m12 +pmaddwd m11, m17 + +movu m13, [r4 + r1] +punpcklwdm14, m12,m13 +pmaddwd m14, m18 +punpckhwdm12, m13 +pmaddwd m12, m18 + +padddm8, m14 +padddm4, m12 +padddm0, m8 +padddm1, m4 + +movu m12, [r4 + 2 * r1] +punpcklwdm14, m13,m12 +pmaddwd m14, m18 +punpckhwdm13, m12 +pmaddwd
[x265] [PATCH 241 of 307] x86: AVX512 pixel_satd_64xN and 32xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1512622521 -19800 # Thu Dec 07 10:25:21 2017 +0530 # Node ID 9bd38bd06850914d1cbf617063ea0e1e60f66219 # Parent 2d298099a8d6b266a32b975de4b6a369988d3887 x86: AVX512 pixel_satd_64xN and 32xN for high bit depth Size | AVX2 performance | AVX512 performance -- 32x8 | 10.99x | 17.98x 32x16 | 12.18x | 17.05x 32x24 | 13.11x | 19.70x 32x32 | 13.21x | 18.36x 32x64 | 13.27x | 19.04x 64x16 | 12.36x | 17.15x 64x32 | 11.63x | 17.78x 64x48 | 12.00x | 19.23x 64x64 | 12.12x | 19.20x diff -r 2d298099a8d6 -r 9bd38bd06850 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Dec 06 10:53:15 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Dec 07 10:25:21 2017 +0530 @@ -3015,6 +3015,24 @@ //Luma_hps_48x64 p.pu[LUMA_48x64].luma_hps = PFX(interp_8tap_horiz_ps_48x64_avx512); +p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_avx512); +p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_avx512); +p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_avx512); +p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_avx512); +p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_avx512); +p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_avx512); +p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_avx512); +p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_avx512); +p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_avx512); + +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512); + } #endif } diff -r 2d298099a8d6 -r 9bd38bd06850 
source/common/x86/pixel-a.asm --- a/source/common/x86/pixel-a.asm Wed Dec 06 10:53:15 2017 +0530 +++ b/source/common/x86/pixel-a.asm Thu Dec 07 10:25:21 2017 +0530 @@ -13958,6 +13958,192 @@ paddd xm6, xm7 movdeax, xm6 RET + +%macro SATD_HBD_AVX512_END 0 +vextracti32x8 ym7, m6, 1 +paddd ym6, ym7 +vextracti128xm7, ym6, 1 +paddd xm6, xm7 +pxorxm7, xm7 +movhlps xm7, xm6 +paddd xm6, xm7 +pshufd xm7, xm6, 1 +paddd xm6, xm7 +movdeax, xm6 +%endmacro + +%macro PROCESS_SATD_32x8_HBD_AVX512 0; function to compute satd cost for 32 columns, 8 rows +; rows 0-3 +movum0, [r0] +movum4, [r2] +psubw m0, m4 +movum1, [r0 + r1] +movum5, [r2 + r3] +psubw m1, m5 +movum2, [r0 + r1 * 2] +movum4, [r2 + r3 * 2] +psubw m2, m4 +movum3, [r0 + r4] +movum5, [r2 + r5] +psubw m3, m5 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +paddw m4, m0, m1 +psubw m1, m0 +paddw m0, m2, m3 +psubw m3, m2 +punpckhwd m2, m4, m1 +punpcklwd m4, m1 +punpckhwd m1, m0, m3 +punpcklwd m0, m3 +paddw m3, m4, m0 +psubw m0, m4 +paddw m4, m2, m1 +psubw m1, m2 +punpckhdq m2, m3, m0 +punpckldq m3, m0 +paddw m0, m3, m2 +psubw m2, m3 +punpckhdq m3, m4, m1 +punpckldq m4, m1 +paddw m1, m4, m3 +psubw m3, m4 +punpckhqdq m4, m0, m1 +punpcklqdq m0, m1 +pabsw m0, m0 +pabsw m4, m4 +pmaxsw m0, m0, m4 +punpckhqdq m1, m2, m3 +punpcklqdq m2, m3 +pabsw m2, m2 +pabsw m1, m1 +pmaxsw m2, m1 +pxorm7, m7 +movam1, m0 +punpcklwd m1, m7 +paddd m6, m1 +movam1, m0 +punpckhwd m1, m7 +paddd m6, m1 +pxorm7, m7 +movam1, m2 +punpcklwd m1, m7 +paddd m6, m1 +movam1, m2 +punpckhwd m1, m7 +paddd m6, m1 +; rows 4-7 +movum0, [r0] +movum4, [r2] +psubw m0, m4 +movu
[x265] [PATCH 252 of 307] x86: AVX512 intra_pred_dc32 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1512723573 -19800 # Fri Dec 08 14:29:33 2017 +0530 # Node ID ddd64f4b2ff382d05e86708750b20332ed93f3c9 # Parent fa954ed4a1e7ce2741f3cac14006f78c3199191b x86: AVX512 intra_pred_dc32 for high bit depth AVX2 performance : 15.53x AVX512 performance : 23.96x diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Dec 08 12:12:43 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Dec 08 14:29:33 2017 +0530 @@ -3053,6 +3053,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512); +p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512); } #endif diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred.h --- a/source/common/x86/intrapred.h Fri Dec 08 12:12:43 2017 +0530 +++ b/source/common/x86/intrapred.h Fri Dec 08 14:29:33 2017 +0530 @@ -76,7 +76,7 @@ FUNCDEF_TU_S2(void, intra_pred_dc, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); FUNCDEF_TU_S2(void, intra_pred_dc, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); FUNCDEF_TU_S2(void, intra_pred_dc, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); - +FUNCDEF_TU_S2(void, intra_pred_dc, avx512, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); FUNCDEF_TU_S2(void, intra_pred_planar, sse2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); FUNCDEF_TU_S2(void, intra_pred_planar, sse4, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); FUNCDEF_TU_S2(void, intra_pred_planar, avx2, pixel* dst, intptr_t dstStride, const pixel*srcPix, int, int filter); diff -r fa954ed4a1e7 -r ddd64f4b2ff3 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm 
Fri Dec 08 12:12:43 2017 +0530 +++ b/source/common/x86/intrapred16.asm Fri Dec 08 14:29:33 2017 +0530 @@ -688,6 +688,68 @@ movu[r0 + r2 * 1 + 0], m0 movu[r0 + r2 * 1 + mmsize], m0 RET + +INIT_ZMM avx512 +cglobal intra_pred_dc32, 3,3,17 +add r2, 2 +add r1d, r1d +movu m16, [r2] +movu m1, [r2 + 2 * mmsize] +paddwm16, m1 +vextracti32x8 ym1, m16, 1 +paddw ym16, ym1 +vextracti32x4 xm1, m16, 1 +paddw xm16, xm1 +pmaddwd xm16, [pw_1] +movhlps xm1, xm16 +paddd xm16, xm1 +phaddd xm16, xm16 +paddd xm16, [pd_32]; sum = sum + 32 +psrld xm16, 6 ; sum = sum / 64 +vpbroadcastw m0, xm16 + +lea r2, [r1 * 3] +; store DC 32x32 +movu[r0 + r1 * 0 + 0], m0 +movu[r0 + r1 * 1 + 0], m0 +movu[r0 + r1 * 2 + 0], m0 +movu[r0 + r2 * 1 + 0], m0 +lea r0, [r0 + r1 * 4] +movu[r0 + r1 * 0 + 0], m0 +movu[r0 + r1 * 1 + 0], m0 +movu[r0 + r1 * 2 + 0], m0 +movu[r0 + r2 * 1 + 0], m0 +lea r0, [r0 + r1 * 4] +movu[r0 + r1 * 0 + 0], m0 +movu[r0 + r1 * 1 + 0], m0 +movu[r0 + r1 * 2 + 0], m0 +movu[r0 + r2 * 1 + 0], m0 +lea r0, [r0 + r1 * 4] +movu[r0 + r1 * 0 + 0], m0 +movu[r0 + r1 * 1 + 0], m0 +movu[r0 + r1 * 2 + 0], m0 +movu[r0 + r2 * 1 + 0], m0 +lea r0, [r0 + r1 * 4] +movu[r0 + r1 * 0 + 0], m0 +movu[r0 + r1 * 1 + 0], m0 +movu[r0 + r1 * 2 + 0], m0 +movu[r0 + r2 * 1 + 0], m0 +lea r0, [r0 + r1 * 4] +movu[r0 + r1 * 0 + 0], m0 +movu[r0 + r1 * 1 + 0], m0 +movu[r0 + r1 * 2 + 0], m0 +movu[r0 + r2 * 1 + 0], m0 +lea r0, [r0 + r1 * 4] +movu[r0 + r1 * 0 + 0], m0 +movu[r0 + r1 * 1 + 0], m0 +movu[r0 + r1 * 2 + 0], m0 +movu[r0 + r2 * 1 + 0], m0 +lea r0, [r0 + r1 * 4] +movu[r0 + r1 * 0 + 0], m0 +movu[r0 + r1 * 1 + 0], m0 +movu[r0 + r1 * 2 + 0], m0 +movu[r0 + r2 * 1 + 0], m0 +RET %endif ;--- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 255 of 307] x86: AVX512 luma_hvpp
# HG changeset patch # User Vignesh Vijayakumar# Date 1513072665 -19800 # Tue Dec 12 15:27:45 2017 +0530 # Node ID 9ca6f6a66eabf5bfdecc3a8472c1137d16b1c722 # Parent b858f80e3ff03118abb1ef3e4ea56059f9ec5af4 x86: AVX512 luma_hvpp. Calls the interp_8tap_hv_pp_cpu C function (which calls the luma_hps and luma_vsp asm functions individually). Includes ALL_LUMA_PU_T for luma_hvpp, which calls the interp_8tap_hv_pp_cpu C function. diff -r b858f80e3ff0 -r 9ca6f6a66eab source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Dec 12 15:44:55 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Dec 12 15:27:45 2017 +0530 @@ -3056,6 +3056,23 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512); p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx512); +p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu; + } #endif } @@ -5220,6 +5237,24 @@ p.pu[LUMA_16x16].luma_vps = PFX(interp_8tap_vert_ps_16x16_avx512); p.pu[LUMA_16x32].luma_vps = PFX(interp_8tap_vert_ps_16x32_avx512); //p.pu[LUMA_16x64].luma_vps = PFX(interp_8tap_vert_ps_16x64_avx512); + +p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_64x48].luma_hvpp =
interp_8tap_hv_pp_cpu; +p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_64x16].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x8].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x16].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x32].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x24].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_32x64].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x4].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x8].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x12].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x16].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x32].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_16x64].luma_hvpp = interp_8tap_hv_pp_cpu; +p.pu[LUMA_48x64].luma_hvpp = interp_8tap_hv_pp_cpu; + } #endif } ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 293 of 307] x86: AVX512 intra_pred_ang16 mode 5 and 31 high bit depth
# HG changeset patch # User Jayashree # Date 1515992814 -19800 # Mon Jan 15 10:36:54 2018 +0530 # Node ID 3a310b157fdf345023ff4e96e7de316cee79b954 # Parent c1daa99a8c14edbe5e9e5a59a74a6b0936c27a82 x86 : AVX512 intra_pred_ang16 mode 5 and 31 high bit depth Mode | AVX2 performance | AVX512 performance --- 5 |10.5x | 16.61x 31 |12.26x | 20.3x diff -r c1daa99a8c14 -r 3a310b157fdf source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jan 15 09:53:46 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jan 15 10:36:54 2018 +0530 @@ -3111,7 +3111,8 @@ p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512); p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512); p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512); - +p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx512); +p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx512); p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu; p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu; p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu; diff -r c1daa99a8c14 -r 3a310b157fdf source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Mon Jan 15 09:53:46 2018 +0530 +++ b/source/common/x86/intrapred16.asm Mon Jan 15 10:36:54 2018 +0530 @@ -19283,10 +19283,29 @@ callang16_mode_5_31 add r2,18 - callang32_mode_5_31 RET - +cglobal intra_pred_ang16_5, 3,7,13 +add r2,64 +xor r6d, r6d +vbroadcasti32x8 m15, [pd_16] +lea r3,[ang_table_avx2 + 16 * 32] +add r1d, r1d +lea r4,[r1 * 3] + +callang16_mode_5_31 +RET + +cglobal intra_pred_ang16_31, 3,7,13 +xor r6d, r6d +inc r6d +vbroadcasti32x8 m15, [pd_16] +lea r3,[ang_table_avx2 + 16 * 32] +add r1d, r1d +lea r4,[r1 * 3] + +callang16_mode_5_31 +RET ;--- ; avx512 code for intra_pred_ang32 mode 2 to 34 end ;--- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 304 of 307] Fix SIMD register count for intra_pred_ang modes
# HG changeset patch # User Jayashree # Date 1516361209 -19800 # Fri Jan 19 16:56:49 2018 +0530 # Node ID e82bfd58acb99cd4c2e4767b1afdd3750881a68e # Parent f56354b2b542aaafa389a226f0fb3b41e4d33803 Fix SIMD register count for intra_pred_ang modes diff -r f56354b2b542 -r e82bfd58acb9 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Wed Jan 17 10:11:09 2018 -0800 +++ b/source/common/x86/intrapred16.asm Fri Jan 19 16:56:49 2018 +0530 @@ -19236,8 +19236,7 @@ packusdwm2, m1 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 2, 0, 1, 16 ret - -cglobal intra_pred_ang32_5, 3,8,13 +cglobal intra_pred_ang32_5, 3,8,17 add r2,128 xor r6d, r6d lea r3,[ang_table_avx2 + 16 * 32] @@ -19259,11 +19258,9 @@ vbroadcasti32x8 m15, [pd_16] add r2,18 lea r0,[r0 + 32] - callang32_mode_5_31 RET - -cglobal intra_pred_ang32_31, 3,7,13 +cglobal intra_pred_ang32_31, 3,7,17 xor r6d, r6d inc r6d lea r3,[ang_table_avx2 + 16 * 32] @@ -19285,18 +19282,16 @@ add r2,18 callang32_mode_5_31 RET -cglobal intra_pred_ang16_5, 3,7,13 +cglobal intra_pred_ang16_5, 3,7,17 add r2,64 xor r6d, r6d vbroadcasti32x8 m15, [pd_16] lea r3,[ang_table_avx2 + 16 * 32] add r1d, r1d lea r4,[r1 * 3] - callang16_mode_5_31 RET - -cglobal intra_pred_ang16_31, 3,7,13 +cglobal intra_pred_ang16_31, 3,7,17 xor r6d, r6d inc r6d vbroadcasti32x8 m15, [pd_16] @@ -19609,8 +19604,7 @@ packusdwm3, m0 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 3, 0, 1, 16 ret - -cglobal intra_pred_ang32_4, 3,8,13 +cglobal intra_pred_ang32_4, 3,8,17 add r2,128 xor r6d, r6d lea r3,[ang_table_avx2 + 18 * 32] @@ -19632,11 +19626,9 @@ add r2,22 lea r0,[r0 + 32] - callang32_mode_4_32 RET - -cglobal intra_pred_ang32_32, 3,7,13 +cglobal intra_pred_ang32_32, 3,7,17 xor r6d, r6d inc r6d lea r3,[ang_table_avx2 + 18 * 32] @@ -19654,23 +19646,19 @@ mov r0,r5 callang16_mode_4_32 - add r2,22 - callang32_mode_4_32 RET -cglobal intra_pred_ang16_4, 3,7,13 +cglobal intra_pred_ang16_4, 3,7,17 add r2,64 xor r6d, r6d vbroadcasti32x8 m15, [pd_16] lea 
r3,[ang_table_avx2 + 18 * 32] add r1d, r1d lea r4,[r1 * 3] - callang16_mode_4_32 RET - -cglobal intra_pred_ang16_32, 3,7,13 +cglobal intra_pred_ang16_32, 3,7,17 xor r6d, r6d inc r6d vbroadcasti32x8 m15, [pd_16] @@ -19949,8 +19937,7 @@ packusdwm11, m3 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16 ret - -cglobal intra_pred_ang32_6, 3,8,14 +cglobal intra_pred_ang32_6, 3,8,17 add r2,128 xor r6d, r6d lea r3,[ang_table_avx2 + 15 * 32] @@ -19972,11 +19959,9 @@ add r2,12 lea r0,[r0 + 32] - callang32_mode_6_30 RET - -cglobal intra_pred_ang32_30, 3,7,14 +cglobal intra_pred_ang32_30, 3,7,17 xor r6d, r6d inc r6d lea r3,[ang_table_avx2 + 15 * 32] @@ -19998,18 +19983,16 @@ add r2,12 callang32_mode_6_30 RET -cglobal intra_pred_ang16_6, 3,7,14 +cglobal intra_pred_ang16_6, 3,7,17 add r2,64 xor r6d, r6d vbroadcasti32x8 m15, [pd_16] lea r3,[ang_table_avx2 + 15 * 32] shl r1d, 1 lea r4,[r1 * 3] - callang16_mode_6_30 RET - -cglobal intra_pred_ang16_30, 3,7,14 +cglobal intra_pred_ang16_30, 3,7,17 xor r6d, r6d inc r6d vbroadcasti32x8 m15, [pd_16] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 292 of 307] x86: AVX512 intra_pred_ang32 mode 5 and 31 high bit depth
# HG changeset patch # User Jayashree # Date 1515990226 -19800 # Mon Jan 15 09:53:46 2018 +0530 # Node ID c1daa99a8c14edbe5e9e5a59a74a6b0936c27a82 # Parent e4983d90f403d968d6760ae044f86a7a2e1865a2 x86:AVX512 intra_pred_ang32 mode 5 and 31 high bit depth Mode | AVX2 performance | AVX512 performance --- 5 |9.5x| 17.11x 31 |11.5x | 24.1x diff -r e4983d90f403 -r c1daa99a8c14 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jan 12 15:17:56 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jan 15 09:53:46 2018 +0530 @@ -3105,11 +3105,13 @@ p.cu[BLOCK_32x32].intra_pred[25]= PFX(intra_pred_ang32_25_avx512); p.cu[BLOCK_32x32].intra_pred[26]= PFX(intra_pred_ang32_26_avx512); p.cu[BLOCK_32x32].intra_pred[27]= PFX(intra_pred_ang32_27_avx512); - +p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx512); +p.cu[BLOCK_32x32].intra_pred[31] = PFX(intra_pred_ang32_31_avx512); p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512); p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512); p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512); p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512); + p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu; p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu; p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu; diff -r e4983d90f403 -r c1daa99a8c14 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Fri Jan 12 15:17:56 2018 +0530 +++ b/source/common/x86/intrapred16.asm Mon Jan 15 09:53:46 2018 +0530 @@ -18977,6 +18977,316 @@ lea r4,[r1 * 3] callang16_mode_11_25 RET +cglobal ang16_mode_5_31 +testr6d, r6d + +vbroadcasti32x8m0, [r2 + 2]; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] +vbroadcasti32x8m1, [r2 + 4]; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + +punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] +punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + +vbroadcasti32x8m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 
16 15 14 13 12 11 10 9] +vbroadcasti32x8m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] +punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] +punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + +pmaddwd m4, m3, [r3 + 1 * 32] ; [17] +paddd m4, m15 +psrld m4, 5 +pmaddwd m5, m0, [r3 + 1 * 32] +paddd m5, m15 +psrld m5, 5 +packusdwm4, m5 + +movuym16, [r3 - 14 * 32] ; [2] +vinserti32x8m16, [r3 + 3 * 32] ,1 ; [19] +palignr m6, m0, m3, 4 +pmaddwd m5, m6, m16 +paddd m5, m15 +psrld m5, 5 +palignr m7, m2, m0, 4 +pmaddwd m8, m7, m16 +paddd m8, m15 +psrld m8, 5 +packusdwm5, m8 +vextracti32x8 ym6, m5, 1 + +palignr m8, m0, m3, 8 +palignr m9, m2, m0, 8 +movuym16, [r3 - 12 * 32] ; [4] +vinserti32x8m16, [r3 + 5 * 32],1 ; [21] +pmaddwd m7, m8, m16 +paddd m7, m15 +psrld m7, 5 +pmaddwd m10, m9,m16 +paddd m10, m15 +psrld m10, 5 +packusdwm7, m10 +vextracti32x8 ym8, m7, 1 + +palignr m10, m0, m3, 12 +palignr m11, m2, m0, 12 +movuym16,[r3 - 10 * 32] ; [6] +vinserti32x8m16, [r3 + 7 * 32] ,1 ; [23] +pmaddwd m9, m10, m16 +paddd m9, m15 +psrld m9, 5 +pmaddwd m3, m11, m16 +paddd m3, m15 +psrld m3, 5 +packusdwm9, m3 +vextracti32x8 ym10, m9, 1 + +pmaddwd m11, m0, [r3 - 8 * 32] ; [8] +paddd m11, m15 +psrld m11, 5 +pmaddwd m3, m2, [r3 - 8 * 32] +paddd m3, m15 +psrld m3, 5 +packusdwm11, m3 + +TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 3, 0 + +pmaddwd m4, m0, [r3 + 9 * 32] ; [25] +paddd m4, m15 +psrld m4, 5 +pmaddwd m5, m2, [r3 + 9 * 32] +paddd m5, m15 +psrld m5, 5 +packusdwm4, m5 + +palignr m6, m2, m0, 4 +movuym16, [r3
[x265] [PATCH 305 of 307] X86: AVX512 intra_pred_ang16 mode 8 and 28 high bit depth
# HG changeset patch # User Jayashri Murugan# Date 1517294626 -19800 # Tue Jan 30 12:13:46 2018 +0530 # Node ID b80e844209ecd0abc896df94306a5ef96b27b918 # Parent e82bfd58acb99cd4c2e4767b1afdd3750881a68e X86: AVX512 intra_pred_ang16 mode 8 and 28 high bit depth Mode | AVX2 performance | AVX512 performance --- 8 |9.31x |10.78x 28 |12.80x|15.21x diff -r e82bfd58acb9 -r b80e844209ec source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jan 19 16:56:49 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jan 30 12:13:46 2018 +0530 @@ -3113,14 +3113,14 @@ p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx512); p.cu[BLOCK_32x32].intra_pred[29] = PFX(intra_pred_ang32_29_avx512); p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx512); - +p.cu[BLOCK_32x32].intra_pred[8]= PFX(intra_pred_ang32_8_avx512); +p.cu[BLOCK_32x32].intra_pred[28]= PFX(intra_pred_ang32_28_avx512); p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512); p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512); p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512); p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512); -p.cu[BLOCK_32x32].intra_pred[8]= PFX(intra_pred_ang32_8_avx512); -p.cu[BLOCK_32x32].intra_pred[28]= PFX(intra_pred_ang32_28_avx512); - +p.cu[BLOCK_16x16].intra_pred[8] = PFX(intra_pred_ang16_8_avx512); +p.cu[BLOCK_16x16].intra_pred[28]= PFX(intra_pred_ang16_28_avx512); p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx512); p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx512); p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx512); diff -r e82bfd58acb9 -r b80e844209ec source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Fri Jan 19 16:56:49 2018 +0530 +++ b/source/common/x86/intrapred16.asm Tue Jan 30 12:13:46 2018 +0530 @@ -11843,6 +11843,27 @@ packusdwm11, m3 TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 16 ret +cglobal 
intra_pred_ang16_8, 3,7,16 +add r2,64 +xor r6d, r6d +lea r3,[ang_table_avx2 + 15 * 32] +add r1d, r1d +lea r4,[r1 * 3] +vbroadcasti32x8 m15, [pd_16] + +callang16_mode_8_28 +RET + +cglobal intra_pred_ang16_28, 3,7,16 +xor r6d, r6d +inc r6d +lea r3,[ang_table_avx2 + 15 * 32] +add r1d, r1d +lea r4,[r1 * 3] +vbroadcasti32x8 m15, [pd_16] + +callang16_mode_8_28 +RET ;; angle 16, modes 7 and 29 cglobal ang16_mode_7_29 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 307 of 307] x86: AVX512 Set run-time flag to enable/disable avx512
# HG changeset patch # User Jayashree# Date 1522928767 -19800 # Thu Apr 05 17:16:07 2018 +0530 # Node ID f6ad2fa637fd3c8f9e2811982b89aa28228e9f6b # Parent 876b6e006f2080072c0684dbf75e7cfde974ba79 x86:AVX512 Set run time flag to enable/disable avx512 diff -r 876b6e006f20 -r f6ad2fa637fd source/common/cpu.cpp --- a/source/common/cpu.cpp Mon Feb 05 10:39:00 2018 -0800 +++ b/source/common/cpu.cpp Thu Apr 05 17:16:07 2018 +0530 @@ -122,7 +122,7 @@ #pragma warning(disable: 4309) // truncation of constant value #endif -uint32_t cpu_detect(void) +uint32_t cpu_detect(bool benableavx512 ) { uint32_t cpu = 0; @@ -184,11 +184,13 @@ { if (ebx & 0x0020) cpu |= X265_CPU_AVX2; - -if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */ +if (benableavx512) { -if ((ebx & 0xD003) == 0xD003) -cpu |= X265_CPU_AVX512; +if ((xcr0 & 0xE0) == 0xE0) /* OPMASK/ZMM state */ +{ +if ((ebx & 0xD003) == 0xD003) +cpu |= X265_CPU_AVX512; +} } } } @@ -327,7 +329,7 @@ int PFX(cpu_fast_neon_mrc_test)(void); } -uint32_t cpu_detect(void) +uint32_t cpu_detect(bool benableavx512) { int flags = 0; @@ -370,7 +372,7 @@ #elif X265_ARCH_POWER8 -uint32_t cpu_detect(void) +uint32_t cpu_detect(bool benableavx512) { #if HAVE_ALTIVEC return X265_CPU_ALTIVEC; @@ -381,7 +383,7 @@ #else // if X265_ARCH_POWER8 -uint32_t cpu_detect(void) +uint32_t cpu_detect(bool benableavx512) { return 0; } diff -r 876b6e006f20 -r f6ad2fa637fd source/common/cpu.h --- a/source/common/cpu.h Mon Feb 05 10:39:00 2018 -0800 +++ b/source/common/cpu.h Thu Apr 05 17:16:07 2018 +0530 @@ -50,7 +50,7 @@ #endif namespace X265_NS { -uint32_t cpu_detect(void); +uint32_t cpu_detect(bool); struct cpu_name_t { diff -r 876b6e006f20 -r f6ad2fa637fd source/common/param.cpp --- a/source/common/param.cpp Mon Feb 05 10:39:00 2018 -0800 +++ b/source/common/param.cpp Thu Apr 05 17:16:07 2018 +0530 @@ -99,13 +99,13 @@ { x265_free(p); } - +bool benableavx512 = false; void x265_param_default(x265_param* param) { memset(param, 0, sizeof(x265_param)); /* Applying default 
values to all elements in the param structure */ -param->cpuid = X265_NS::cpu_detect(); +param->cpuid = X265_NS::cpu_detect(benableavx512); param->bEnableWavefront = 1; param->frameNumThreads = 0; @@ -609,6 +609,17 @@ if (0) ; OPT("asm") { +sscanf(value, "%s", p->asmname); +if (strcmp(value, "avx512")==0) +{ +p->bEnableavx512 = 1; +benableavx512 = true; +} +else +{ +p->bEnableavx512 = 0; +benableavx512 = false; +} if (bValueWasNull) p->cpuid = atobool(value); else @@ -1072,7 +1083,7 @@ if (isdigit(value[0])) cpu = x265_atoi(value, bError); else -cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? X265_NS::cpu_detect() : 0; +cpu = !strcmp(value, "auto") || x265_atobool(value, bError) ? X265_NS::cpu_detect(benableavx512) : 0; if (bError) { diff -r 876b6e006f20 -r f6ad2fa637fd source/test/pixelharness.cpp --- a/source/test/pixelharness.cpp Mon Feb 05 10:39:00 2018 -0800 +++ b/source/test/pixelharness.cpp Thu Apr 05 17:16:07 2018 +0530 @@ -332,8 +332,9 @@ memset(ref_dest, 0, 64 * 64 * sizeof(pixel)); memset(opt_dest, 0, 64 * 64 * sizeof(pixel)); int j = 0; +bool enableavx512 = true; int width = 16 * (rand() % 4 + 1); -int cpuid = X265_NS::cpu_detect(); +int cpuid = X265_NS::cpu_detect(enableavx512); if (cpuid & X265_CPU_AVX512) width = 32 * (rand() % 2 + 1); int height = 8; diff -r 876b6e006f20 -r f6ad2fa637fd source/test/testbench.cpp --- a/source/test/testbench.cpp Mon Feb 05 10:39:00 2018 -0800 +++ b/source/test/testbench.cpp Thu Apr 05 17:16:07 2018 +0530 @@ -96,7 +96,8 @@ int main(int argc, char *argv[]) { -int cpuid = X265_NS::cpu_detect(); +bool enableavx512 = true; +int cpuid = X265_NS::cpu_detect(enableavx512); const char *testname = 0; if (!(argc & 1)) diff -r 876b6e006f20 -r f6ad2fa637fd source/x265.h --- a/source/x265.h Mon Feb 05 10:39:00 2018 -0800 +++ b/source/x265.h Thu Apr 05 17:16:07 2018 +0530 @@ -585,7 +585,14 @@ * somehow flawed on your target hardware. 
The asm function tables are * process global, the first encoder configures them for all encoders */ int cpuid; - + /*==Assembly features ==*/ + /* x265_param_parse() will detect whether avx512 is enabled (on the CLI) and set + * bEnableavx512 to 1 to use avx512 SIMD. By default this flag will not be set, + * hence the encoding
[x265] [PATCH 295 of 307] x86: AVX512 intra_pred_ang32 mode 4 and 32 high bit depth
# HG changeset patch # User Jayashree # Date 1516018946 -19800 # Mon Jan 15 17:52:26 2018 +0530 # Node ID a2b347ed81f90ac82f59d891deba7fa876df7f62 # Parent 1107c2def5f9dbee9947a2c9c41f50961fa31bc6 x86:AVX512 intra_pred-ang32 mode 4 and 32 high bit depth Mode | AVX2 performance | AVX512 performance --- 4 |9.1x | 14.6x 32 |11.35x | 20.85x diff -r 1107c2def5f9 -r a2b347ed81f9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jan 15 12:22:40 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jan 15 17:52:26 2018 +0530 @@ -3107,6 +3107,9 @@ p.cu[BLOCK_32x32].intra_pred[27]= PFX(intra_pred_ang32_27_avx512); p.cu[BLOCK_32x32].intra_pred[5] = PFX(intra_pred_ang32_5_avx512); p.cu[BLOCK_32x32].intra_pred[31] = PFX(intra_pred_ang32_31_avx512); +p.cu[BLOCK_32x32].intra_pred[32] = PFX(intra_pred_ang32_32_avx512); +p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx512); + p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512); p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512); p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512); diff -r 1107c2def5f9 -r a2b347ed81f9 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Mon Jan 15 12:22:40 2018 +0530 +++ b/source/common/x86/intrapred16.asm Mon Jan 15 17:52:26 2018 +0530 @@ -19303,9 +19303,363 @@ lea r3,[ang_table_avx2 + 16 * 32] add r1d, r1d lea r4,[r1 * 3] - callang16_mode_5_31 RET +;; angle 16, modes 4 and 32 +cglobal ang16_mode_4_32 +testr6d, r6d + +vbroadcasti32x8m0, [r2 + 2]; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] +vbroadcasti32x8m1, [r2 + 4]; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + +punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] +punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + +vbroadcasti32x8m1, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] +vbroadcasti32x8m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] +punpcklwd m2, m1, m4 ; [21 20 20 19 19 18 
18 17 13 12 12 11 11 10 10 9] +punpckhwd m1, m4 ; [25 24 24 23 23 22 22 21 17 16 16 15 15 14 14 13] + +pmaddwd m4, m3, [r3 + 3 * 32] ; [21] +paddd m4, m15 +psrld m4, 5 +pmaddwd m5, m0, [r3 + 3 * 32] +paddd m5, m15 +psrld m5, 5 +packusdwm4, m5 + +palignr m6, m0, m3, 4 ; [14 13 13 12 12 11 11 10 6 5 5 4 4 3 3 2] +palignr m7, m2, m0, 4 ; [18 17 17 16 16 15 15 14 10 9 9 8 8 7 7 6] +movuym16,[r3 - 8 * 32] ; [10] +vinserti32x8m16, [r3 + 13 * 32] ,1 ; [31] +pmaddwd m5, m6, m16 +paddd m5, m15 +psrld m5, 5 +pmaddwd m8, m7,m16 +paddd m8, m15 +psrld m8, 5 +packusdwm5, m8 +vextracti32x8 ym6, m5, 1 + + +palignr m7, m0, m3, 8 ; [15 14 14 13 13 12 12 11 7 6 6 5 5 4 4 3] +pmaddwd m7, [r3 + 2 * 32] ; [20] +paddd m7, m15 +psrld m7, 5 +palignr m8, m2, m0, 8 ; [19 18 18 17 17 16 16 15 11 10 10 9 9 8 8 7] +pmaddwd m8, [r3 + 2 * 32] +paddd m8, m15 +psrld m8, 5 +packusdwm7, m8 + +palignr m9, m0, m3, 12 +palignr m3, m2, m0, 12 +movuym16,[r3 - 9 * 32] ; [9] +vinserti32x8m16, [r3 + 12 * 32] ,1 ; [30] +pmaddwd m8, m9, m16 +paddd m8, m15 +psrld m8, 5 +pmaddwd m10, m3,m16 +paddd m10,m15 +psrld m10, 5 +packusdwm8, m10 +vextracti32x8 ym9, m8, 1 + + +pmaddwd m10, m0, [r3 + 1 * 32] ; [19] +paddd m10,m15 +psrld m10, 5 +pmaddwd m3, m2, [r3 + 1 * 32] +paddd m3, m15 +psrld m3, 5 +packusdwm10, m3 + +palignr m11, m2, m0, 4 +pmaddwd m11, [r3 - 10 * 32] ; [8] +paddd m11, m15 +psrld m11, 5 +palignr m3, m1, m2, 4 +pmaddwd m3, [r3 - 10 * 32] +paddd m3, m15 +psrld m3, 5 +packusdwm11, m3 + +TRANSPOSE_STORE_AVX2 4, 5, 6,
[x265] [PATCH 302 of 307] x86: AVX512 intra_pred_ang32 mode 7 and 29 high bit depth
# HG changeset patch # User Jayashree # Date 1516212669 28800 # Wed Jan 17 10:11:09 2018 -0800 # Node ID ae3f7bd65b45df716f1cd56b6b15d91643772621 # Parent 3a08a957d4cd2bf0eb57524651a824513378e0a3 X86:AVX512 intra_pred_ang32 mode 7 and 29 high bit depth Mode | AVX2 performance | AVX512 performance --- 7 |9.2x | 11.45x 29 |16.2x | 24.0x diff -r 3a08a957d4cd -r ae3f7bd65b45 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jan 29 20:05:49 2018 -0800 +++ b/source/common/x86/asm-primitives.cpp Wed Jan 17 10:11:09 2018 -0800 @@ -3111,6 +3111,9 @@ p.cu[BLOCK_32x32].intra_pred[4] = PFX(intra_pred_ang32_4_avx512); p.cu[BLOCK_32x32].intra_pred[30] = PFX(intra_pred_ang32_30_avx512); p.cu[BLOCK_32x32].intra_pred[6] = PFX(intra_pred_ang32_6_avx512); +p.cu[BLOCK_32x32].intra_pred[29] = PFX(intra_pred_ang32_29_avx512); +p.cu[BLOCK_32x32].intra_pred[7] = PFX(intra_pred_ang32_7_avx512); + p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512); p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512); p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512); diff -r 3a08a957d4cd -r ae3f7bd65b45 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Mon Jan 29 20:05:49 2018 -0800 +++ b/source/common/x86/intrapred16.asm Wed Jan 17 10:11:09 2018 -0800 @@ -20311,7 +20311,318 @@ callang32_mode_8_28 RET - +;; angle 16, modes 7 and 29 +cglobal ang16_mode_7_29 +testr6d, r6d + +vbroadcasti32x8 m0, [r2 + 2]; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] +vbroadcasti32x8m1, [r2 + 4]; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + +punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] +punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + +vbroadcasti32x8 m2, [r2 + 18] ; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] +vbroadcasti32x8m4, [r2 + 20] ; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] +punpcklwd m2, m4 ; [21 20 20 19 19 18 18 17 13 12 12 11 11 10 10 9] + +movuym16, [r3 - 8 * 32] ; [9] 
+vinserti32x8m16, [r3 + 1 * 32] ,1 ; [18] +pmaddwd m4, m3,m16 +paddd m4, m15 +psrld m4, 5 +pmaddwd m5, m0, m16 +paddd m5, m15 +psrld m5, 5 +packusdwm4, m5 +vextracti32x8 ym5, m4, 1 + +pmaddwd m6, m3, [r3 + 10 * 32] ; [27] +paddd m6, m15 +psrld m6, 5 +pmaddwd m9, m0, [r3 + 10 * 32] +paddd m9, m15 +psrld m9, 5 +packusdwm6, m9 + +palignr m10, m0, m3, 4 +pmaddwd m7, m10, [r3 - 13 * 32] ; [4] +paddd m7, m15 +psrld m7, 5 +palignr m11, m2, m0, 4 +pmaddwd m8, m11, [r3 - 13 * 32] +paddd m8, m15 +psrld m8, 5 +packusdwm7, m8 + +movuym16, [r3 - 4 * 32] ; [13] +vinserti32x8m16, [r3 + 5 * 32],1 ; [22] +pmaddwd m8, m10, m16 +paddd m8, m15 +psrld m8, 5 +pmaddwd m9, m11, m16 +paddd m9, m15 +psrld m9, 5 +packusdwm8, m9 +vextracti32x8 ym9, m8, 1 + +pmaddwd m10, [r3 + 14 * 32] ; [31] +paddd m10, m15 +psrld m10, 5 +pmaddwd m11, [r3 + 14 * 32] +paddd m11, m15 +psrld m11, 5 +packusdwm10, m11 + +palignr m11, m0, m3, 8 +pmaddwd m11, [r3 - 9 * 32] ; [8] +paddd m11, m15 +psrld m11, 5 +palignr m12, m2, m0, 8 +pmaddwd m12, [r3 - 9 * 32] +paddd m12, m15 +psrld m12, 5 +packusdwm11, m12 + +TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0 + +palignr m5, m0, m3, 8 +palignr m6, m2, m0, 8 +movuym16, [r3]; [17] +vinserti32x8m16, [r3 + 9 * 32] ,1 ; [26] +pmaddwd m4, m5, m16 +paddd m4, m15 +psrld m4, 5 +pmaddwd m7, m6, m16 +paddd m7, m15 +psrld m7, 5 +packusdwm4, m7 +vextracti32x8 ym5, m4, 1 + + +palignr m9, m0, m3, 12 +palignr m3, m2, m0, 12 +movuym16, [r3 - 14 * 32] ; [3] +vinserti32x8m16, [r3 - 5 * 32] ,1 ;
[x265] [PATCH 299 of 307] X86: AVX512 intra_pred_ang32 mode 8 and 28 high bit depth
# HG changeset patch # User Jayashri Murugan# Date 1515789616 28800 # Fri Jan 12 12:40:16 2018 -0800 # Node ID 624c83571d1df840e1206c46e589044fbf87ff32 # Parent b0d00ca83af0cb2053d6eda82b6d4081236a0f5f X86: AVX512 intra_pred_ang32 mode 8 and 28 high bit depth Mode | AVX2 performance | AVX512 performance --- 8 |9.15x |9.60x 28 |11.30x|12.13x diff -r b0d00ca83af0 -r 624c83571d1d source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jan 16 15:38:58 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jan 12 12:40:16 2018 -0800 @@ -3115,6 +3115,9 @@ p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512); p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512); p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512); +p.cu[BLOCK_32x32].intra_pred[8]= PFX(intra_pred_ang32_8_avx512); +p.cu[BLOCK_32x32].intra_pred[28]= PFX(intra_pred_ang32_28_avx512); + p.cu[BLOCK_16x16].intra_pred[5] = PFX(intra_pred_ang16_5_avx512); p.cu[BLOCK_16x16].intra_pred[31] = PFX(intra_pred_ang16_31_avx512); p.cu[BLOCK_16x16].intra_pred[4] = PFX(intra_pred_ang16_4_avx512); diff -r b0d00ca83af0 -r 624c83571d1d source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Tue Jan 16 15:38:58 2018 +0530 +++ b/source/common/x86/intrapred16.asm Fri Jan 12 12:40:16 2018 -0800 @@ -20016,9 +20016,302 @@ lea r3,[ang_table_avx2 + 15 * 32] shl r1d, 1 lea r4,[r1 * 3] - callang16_mode_6_30 RET + +;; angle 16, modes 8 and 28 +cglobal ang16_mode_8_28 +testr6d, r6d + +vbroadcasti32x8m0, [r2 + 2] ; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] +vbroadcasti32x8m1, [r2 + 4] ; [17 16 15 14 13 12 11 10 9 8 7 6 5 4 3 2] + +punpcklwd m3, m0, m1 ; [13 12 12 11 11 10 10 9 5 4 4 3 3 2 2 1] +punpckhwd m0, m1 ; [17 16 16 15 15 14 14 13 9 8 8 7 7 6 6 5] + +vbroadcasti32x8m2, [r2 + 18]; [24 23 22 21 20 19 18 17 16 15 14 13 12 11 10 9] +vbroadcasti32x8m4, [r2 + 20]; [25 24 23 22 21 20 19 18 17 16 15 14 13 12 11 10] +punpcklwd m2, m4 ; [21 20 20 19 19 18 
18 17 13 12 12 11 11 10 10 9] + +movuym14, [r3 - 10 * 32] +vinserti32x8m14, [r3 - 5 * 32], 1 +pmaddwd m4, m3, m14; [5], [10] +paddd m4, m15 +psrld m4, 5 +pmaddwd m5, m0, m14 +paddd m5, m15 +psrld m5, 5 +packusdwm4, m5 +vextracti32x8 ym5, m4, 1 + +movuym14, [r3] +vinserti32x8m14, [r3 + 5 * 32], 1 +pmaddwd m6, m3, m14; [15], [20] +paddd m6, m15 +psrld m6, 5 +pmaddwd m9, m0, m14 +paddd m9, m15 +psrld m9, 5 +packusdwm6, m9 +vextracti32x8 ym7, m6, 1 + +movuym14, [r3 + 10 * 32] +vinserti32x8m14, [r3 + 15 * 32], 1 +pmaddwd m8, m3, m14 ; [25], [30] +paddd m8, m15 +psrld m8, 5 +pmaddwd m9, m0, m14 +paddd m9, m15 +psrld m9, 5 +packusdwm8, m9 +vextracti32x8 ym9, m8, 1 + +palignr m11, m0, m3, 4 +movuym14, [r3 - 12 * 32] +vinserti32x8m14, [r3 - 7 * 32], 1 +pmaddwd m10, m11, m14 ; [3], [8] +paddd m10, m15 +psrld m10, 5 +palignr m1, m2, m0, 4 +pmaddwd m12, m1, m14 +paddd m12, m15 +psrld m12, 5 +packusdwm10, m12 +vextracti32x8 ym11, m10, 1 + +TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 0 + +palignr m7, m0, m3, 4 +movuym14, [r3 - 2 * 32] +vinserti32x8m14, [r3 + 3 * 32], 1 +pmaddwd m4, m7, m14 ; [13], [18] +paddd m4, m15 +psrld m4, 5 +palignr m1, m2, m0, 4 +pmaddwd m5, m1, m14 +paddd m5, m15 +psrld m5, 5 +packusdwm4, m5 +vextracti32x8 ym5, m4, 1 + +movuym14, [r3 + 8 * 32] +vinserti32x8m14, [r3 + 13 * 32], 1 +pmaddwd m6, m7, m14 ; [23], [28] +paddd m6, m15 +psrld m6, 5 +pmaddwd m8, m1, m14 +paddd m8, m15 +psrld m8, 5 +packusdwm6, m8 +vextracti32x8 ym7, m6, 1 + +movuym14, [r3 - 14 * 32] +
[x265] [PATCH 289 of 307] x86: AVX512 intra_pred_ang32 mode 11 and 25, intra_pred_ang16 mode 11 and 25
# HG changeset patch # User Vignesh Vijayakumar# Date 1515661273 -19800 # Thu Jan 11 14:31:13 2018 +0530 # Node ID d43237051962eab3cd761cf24f3971de09c07aa5 # Parent 5a90661c7fbf2fbacbd6b8afde64368147c29674 x86: AVX512 intra_pred_ang32 mode 11 and 25, intra_pred_ang16 mode 11 and 25 Size | Mode | AVX2 performance | AVX512 performance --- 16 | 11 | 8.68x| 9.27x 16 | 25 | 11.11x | 14.26x 32 | 11 | 6.54x| 11.19x 32 | 25 | 12.40x | 14.86x diff -r 5a90661c7fbf -r d43237051962 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jan 11 09:13:56 2018 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jan 11 14:31:13 2018 +0530 @@ -3100,12 +3100,16 @@ p.cu[BLOCK_32x32].intra_pred[34] = PFX(intra_pred_ang32_2_avx512); p.cu[BLOCK_32x32].intra_pred[9] = PFX(intra_pred_ang32_9_avx512); p.cu[BLOCK_32x32].intra_pred[10] = PFX(intra_pred_ang32_10_avx512); +p.cu[BLOCK_32x32].intra_pred[11]= PFX(intra_pred_ang32_11_avx512); p.cu[BLOCK_32x32].intra_pred[18]= PFX(intra_pred_ang32_18_avx512); +p.cu[BLOCK_32x32].intra_pred[25]= PFX(intra_pred_ang32_25_avx512); p.cu[BLOCK_32x32].intra_pred[26]= PFX(intra_pred_ang32_26_avx512); p.cu[BLOCK_32x32].intra_pred[27]= PFX(intra_pred_ang32_27_avx512); + p.cu[BLOCK_16x16].intra_pred[9] = PFX(intra_pred_ang16_9_avx512); +p.cu[BLOCK_16x16].intra_pred[11]= PFX(intra_pred_ang16_11_avx512); +p.cu[BLOCK_16x16].intra_pred[25]= PFX(intra_pred_ang16_25_avx512); p.cu[BLOCK_16x16].intra_pred[27]= PFX(intra_pred_ang16_27_avx512); - p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu; p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu; p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu; diff -r 5a90661c7fbf -r d43237051962 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Thu Jan 11 09:13:56 2018 +0530 +++ b/source/common/x86/intrapred16.asm Thu Jan 11 14:31:13 2018 +0530 @@ -18779,6 +18779,158 @@ add r2,2 callang16_mode_9_27 RET +;; angle 16, modes 11 and 25 +cglobal ang16_mode_11_25 +testr6d, r6d + 
+vbroadcasti32x8 m0, [r2]; [15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0] +vbroadcasti32x8 m1, [r2 + 2]; [16 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1] + +punpcklwd m3, m0, m1 ; [12 11 11 10 10 9 9 8 4 3 3 2 2 1 1 0] +punpckhwd m0, m1 ; [16 15 15 14 14 13 13 12 8 7 7 6 6 5 5 4] + +movuym16, [r3 + 14 * 32] ; [30] +vinserti32x8m16, [r3 + 12 * 32], 1 ; [28] +pmaddwd m4, m3, m16 +paddd m4, m15 +psrld m4, 5 +pmaddwd m5, m0, m16 +paddd m5, m15 +psrld m5, 5 +packusdwm4, m5 +vextracti32x8 ym5, m4, 1 +movuym16, [r3 + 10 * 32] ; [26] +vinserti32x8m16, [r3 + 8 * 32], 1; [24] +pmaddwd m6, m3, m16 +paddd m6, m15 +psrld m6, 5 +pmaddwd m9, m0, m16 +paddd m9, m15 +psrld m9, 5 +packusdwm6, m9 +vextracti32x8 ym7, m6, 1 +movuym16, [r3 + 6 * 32] ; [22] +vinserti32x8m16, [r3 + 4 * 32], 1; [20] +pmaddwd m8, m3, m16 +paddd m8, m15 +psrld m8, 5 +pmaddwd m9, m0, m16 +paddd m9, m15 +psrld m9, 5 +packusdwm8, m9 +vextracti32x8 ym9, m8, 1 +movuym16, [r3 + 2 * 32] ; [18] +vinserti32x8m16, [r3], 1 ; [16] +pmaddwd m10, m3, m16 +paddd m10, m15 +psrld m10, 5 +pmaddwd m1, m0, m16 +paddd m1, m15 +psrld m1, 5 +packusdwm10, m1 +vextracti32x8 ym11, m10, 1 +TRANSPOSE_STORE_AVX2 4, 5, 6, 7, 8, 9, 10, 11, 2, 1, 0 + +movuym16, [r3 - 2 * 32] ; [14] +vinserti32x8m16, [r3 - 4 * 32], 1 ; [12] +pmaddwd m4, m3, m16 +paddd m4, m15 +psrld m4, 5 +pmaddwd m5, m0, m16 +paddd m5, m15 +psrld m5, 5 +packusdwm4, m5 +vextracti32x8 ym5, m4, 1 +movuym16, [r3 - 6 * 32] ; [10] +vinserti32x8m16, [r3 - 8 * 32], 1 ; [8] +pmaddwd m6, m3, m16 +paddd m6, m15 +psrld m6, 5 +pmaddwd m8, m0, m16 +paddd m8, m15 +psrld m8, 5 +
[x265] [PATCH 303 of 307] x86: AVX512 intra_pred_ang16 mode 7 and 29 high bit depth
# HG changeset patch # User Jayashree # Date 1516212669 28800 # Wed Jan 17 10:11:09 2018 -0800 # Node ID f56354b2b542aaafa389a226f0fb3b41e4d33803 # Parent ae3f7bd65b45df716f1cd56b6b15d91643772621 X86:AVX512 intra_pred_ang16 mode 7 and 29 high bit depth Mode | AVX2 performance | AVX512 performance --- 7 |9.2x | 11.9x 29 |17.3x | 24.30x diff -r ae3f7bd65b45 -r f56354b2b542 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jan 17 10:11:09 2018 -0800 +++ b/source/common/x86/asm-primitives.cpp Wed Jan 17 10:11:09 2018 -0800 @@ -3127,6 +3127,8 @@ p.cu[BLOCK_16x16].intra_pred[32] = PFX(intra_pred_ang16_32_avx512); p.cu[BLOCK_16x16].intra_pred[6] = PFX(intra_pred_ang16_6_avx512); p.cu[BLOCK_16x16].intra_pred[30] = PFX(intra_pred_ang16_30_avx512); +p.cu[BLOCK_16x16].intra_pred[7] = PFX(intra_pred_ang16_7_avx512); +p.cu[BLOCK_16x16].intra_pred[29] = PFX(intra_pred_ang16_29_avx512); p.pu[LUMA_64x64].luma_hvpp = interp_8tap_hv_pp_cpu; p.pu[LUMA_64x48].luma_hvpp = interp_8tap_hv_pp_cpu; p.pu[LUMA_64x32].luma_hvpp = interp_8tap_hv_pp_cpu; diff -r ae3f7bd65b45 -r f56354b2b542 source/common/x86/intrapred16.asm --- a/source/common/x86/intrapred16.asm Wed Jan 17 10:11:09 2018 -0800 +++ b/source/common/x86/intrapred16.asm Wed Jan 17 10:11:09 2018 -0800 @@ -20618,11 +20618,30 @@ mov r0,r5 callang16_mode_7_29 - add r2,8 - callang32_mode_7_29 RET +cglobal intra_pred_ang16_7, 3,7,17 +add r2,64 +xor r6d, r6d +vbroadcasti32x8 m15, [pd_16] +lea r3,[ang_table_avx2 + 17 * 32] +add r1d, r1d +lea r4,[r1 * 3] + +callang16_mode_7_29 +RET + +cglobal intra_pred_ang16_29, 3,7,17 +xor r6d, r6d +inc r6d +vbroadcasti32x8 m15, [pd_16] +lea r3,[ang_table_avx2 + 17 * 32] +add r1d, r1d +lea r4,[r1 * 3] + +callang16_mode_7_29 +RET ;--- ; avx512 code for intra_pred_ang32 mode 2 to 34 end ;--- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 029 of 307] x86: AVX512 pixel_sad_x4_48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500263597 -19800 # Mon Jul 17 09:23:17 2017 +0530 # Node ID 576a93cba7d189fddba3466a21188f0ece3ed278 # Parent 229c13a0d7e4a1dafad7b0a2e9eef041ecccdb77 x86: AVX512 pixel_sad_x4_48x64 AVX2 performance : 59.49x AVX512 performance: 62.29x diff -r 229c13a0d7e4 -r 576a93cba7d1 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jul 17 08:27:14 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jul 17 09:23:17 2017 +0530 @@ -3756,6 +3756,7 @@ p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512); +p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512); p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_avx512); p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_avx512); diff -r 229c13a0d7e4 -r 576a93cba7d1 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Mon Jul 17 08:27:14 2017 +0530 +++ b/source/common/x86/sad-a.asm Mon Jul 17 09:23:17 2017 +0530 @@ -4348,6 +4348,154 @@ paddd m3, m4 %endmacro +%macro SAD_X4_48x8_AVX512 0 +movuym4, [r0] +vinserti32x8m4, [r0 + FENC_STRIDE], 1 +movuym5, [r1] +vinserti32x8m5, [r1 + r5], 1 +movuym6, [r2] +vinserti32x8m6, [r2 + r5], 1 +movuym7, [r3] +vinserti32x8m7, [r3 + r5], 1 +movuym8, [r4] +vinserti32x8m8, [r4 + r5], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movuym4, [r0 + FENC_STRIDE * 2] +vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1 +movuym5, [r1 + r5 * 2] +vinserti32x8 m5, [r1 + r7], 1 +movuym6, [r2 + r5 * 2] +vinserti32x8 m6, [r2 + r7], 1 +movuym7, [r3 + r5 * 2] +vinserti32x8 m7, [r3 + r7], 1 +movuym8, [r4 + r5 * 2] +vinserti32x8 m8, [r4 + r7], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movu xm4, [r0 + 32] +vinserti32x4m4, [r0 + FENC_STRIDE + 32], 1 
+vinserti32x4m4, [r0 + FENC_STRIDE * 2 + 32], 2 +vinserti32x4m4, [r0 + FENC_STRIDE * 3 + 32], 3 +movu xm5, [r1 + 32] +vinserti32x4m5, [r1 + r5 + 32], 1 +vinserti32x4m5, [r1 + r5 * 2 + 32], 2 +vinserti32x4m5, [r1 + r7 + 32], 3 +movu xm6, [r2 + 32] +vinserti32x4m6, [r2 + r5 + 32], 1 +vinserti32x4m6, [r2 + r5 * 2 + 32], 2 +vinserti32x4m6, [r2 + r7 + 32], 3 +movu xm7, [r3 + 32] +vinserti32x4m7, [r3 + r5 + 32], 1 +vinserti32x4m7, [r3 + r5 * 2 + 32], 2 +vinserti32x4m7, [r3 + r7 + 32], 3 +movu xm8, [r4 + 32] +vinserti32x4m8, [r4 + r5 + 32], 1 +vinserti32x4m8, [r4 + r5 * 2 + 32], 2 +vinserti32x4m8, [r4 + r7 + 32], 3 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r5 * 4] +lea r2, [r2 + r5 * 4] +lea r3, [r3 + r5 * 4] +lea r4, [r4 + r5 * 4] + +movuym4, [r0] +vinserti32x8m4, [r0 + FENC_STRIDE], 1 +movuym5, [r1] +vinserti32x8m5, [r1 + r5], 1 +movuym6, [r2] +vinserti32x8m6, [r2 + r5], 1 +movuym7, [r3] +vinserti32x8m7, [r3 + r5], 1 +movuym8, [r4] +vinserti32x8m8, [r4 + r5], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +psadbw m4, m8 +paddd m3, m4 + +movuym4, [r0 + FENC_STRIDE * 2] +vinserti32x8 m4, [r0 + FENC_STRIDE * 3], 1 +movuym5, [r1 + r5 * 2] +vinserti32x8 m5, [r1 + r7], 1 +movuym6, [r2 + r5 * 2] +vinserti32x8 m6, [r2 + r7], 1 +movuym7, [r3 + r5 * 2] +vinserti32x8 m7, [r3 + r7], 1 +movuym8, [r4 + r5 * 2] +vinserti32x8 m8, [r4 + r7], 1 + +psadbw m9, m4, m5 +paddd m0, m9 +psadbw m5, m4, m6 +paddd m1, m5 +psadbw m6, m4, m7 +paddd m2, m6 +
[x265] [PATCH 026 of 307] x86: AVX512 pixel_sad_x3_W64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500013190 -19800 # Fri Jul 14 11:49:50 2017 +0530 # Node ID 20ca79c2c6a803e2c6caf0c1dc87fb211ea9f708 # Parent 3183189cf8a0f1b95c31ecc39dd07b220ec53cea x86: AVX512 pixel_sad_x3_W64 Size | AVX2 performance | AVX512 performance - 64x16 | 64.76x | 95.17x 64x32 | 71.08x | 106.10x 64x48 | 71.45x | 108.12x 64x64 | 75.57x | 110.06x diff -r 3183189cf8a0 -r 20ca79c2c6a8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Jul 14 11:21:54 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 14 11:49:50 2017 +0530 @@ -3736,6 +3736,11 @@ p.pu[LUMA_64x48].sad = PFX(pixel_sad_64x48_avx512); p.pu[LUMA_64x64].sad = PFX(pixel_sad_64x64_avx512); +p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512); +p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); +p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); +p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512); + p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); diff -r 3183189cf8a0 -r 20ca79c2c6a8 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Fri Jul 14 11:21:54 2017 +0530 +++ b/source/common/x86/sad-a.asm Fri Jul 14 11:49:50 2017 +0530 @@ -6129,6 +6129,263 @@ RET %endif +; +;sad_x3 avx512 code start +; +%macro SAD_X3_64x8_AVX512 0 +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + 
FENC_STRIDE * 3] +movum4, [r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +movum3, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE] +movum4, [r1 + r4] +movum5, [r2 + r4] +movum6, [r3 + r4] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 2] +movum4, [r1 + r4 * 2] +movum5, [r2 + r4 * 2] +movum6, [r3 + r4 * 2] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movum3, [r0 + FENC_STRIDE * 3] +movum4, [r1 + r6] +movum5, [r2 + r6] +movum6, [r3 + r6] + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 +%endmacro + +%macro PIXEL_SAD_X3_END_AVX512 0 +vextracti32x8 ym3, m0, 1 +vextracti32x8 ym4, m1, 1 +vextracti32x8 ym5, m2, 1 +paddd ym0, ym3 +paddd ym1, ym4 +paddd ym2, ym5 +vextracti64x2 xm3, m0, 1 +vextracti64x2 xm4, m1, 1 +vextracti64x2 xm5, m2, 1 +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 +pshufd xm3, xm0, 2
[x265] [PATCH 030 of 307] x86: AVX512 convert_p2s 64xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1499858302 -19800 # Wed Jul 12 16:48:22 2017 +0530 # Node ID a77082ebfa67b40f3dbb8cd45b54c17e710a104c # Parent 576a93cba7d189fddba3466a21188f0ece3ed278 x86: AVX512 convert_p2s 64xN Size| AVX2 performance | AVX512 performance 64x16 | 2.05x | 3.77x 64x32 | 2.16x | 3.88x 64x48 | 2.13x | 3.91x 64x64 | 2.16x | 4.00x diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Jul 17 09:23:17 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 12 16:48:22 2017 +0530 @@ -3832,6 +3832,11 @@ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_avx512); +p.pu[LUMA_64x16].convert_p2s = PFX(filterPixelToShort_64x16_avx512); +p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512); +p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512); +p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512); + } #endif } diff -r 576a93cba7d1 -r a77082ebfa67 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Mon Jul 17 09:23:17 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Jul 12 16:48:22 2017 +0530 @@ -2269,6 +2269,186 @@ P2S_H_64xN_avx2 48 ;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- +%macro PROCESS_P2S_64x8_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + mmsize/2] +pmovzxbwm2, [r0 + r1] +pmovzxbwm3, [r0 + r1 + mmsize/2] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2], m0 +movu[r2 + mmsize], m1 +movu[r2 + r3], m2 +movu[r2 + r3 + mmsize], m3 + +pmovzxbwm0, [r0 + r1 * 2] +pmovzxbwm1, [r0 + r1 * 2 + mmsize/2] +pmovzxbwm2, [r0 + r5] +pmovzxbwm3, [r0 + r5 + mmsize/2] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2 
+ r3 * 2], m0 +movu[r2 + r3 * 2 + mmsize], m1 +movu[r2 + r6], m2 +movu[r2 + r6 + mmsize], m3 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] + +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + mmsize/2] +pmovzxbwm2, [r0 + r1] +pmovzxbwm3, [r0 + r1 + mmsize/2] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2], m0 +movu[r2 + mmsize], m1 +movu[r2 + r3], m2 +movu[r2 + r3 + mmsize], m3 + +pmovzxbwm0, [r0 + r1 * 2] +pmovzxbwm1, [r0 + r1 * 2 + mmsize/2] +pmovzxbwm2, [r0 + r5] +pmovzxbwm3, [r0 + r5 + mmsize/2] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2 + r3 * 2], m0 +movu[r2 + r3 * 2 + mmsize], m1 +movu[r2 + r6], m2 +movu[r2 + r6 + mmsize], m3 +%endmacro + +INIT_ZMM avx512 +cglobal filterPixelToShort_64x64, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m4, [pw_2000] + +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_64x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_64x48, 3, 7, 9 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd
[x265] [PATCH 031 of 307] x86: AVX512 convert_p2s_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1500445753 -19800 # Wed Jul 19 11:59:13 2017 +0530 # Node ID 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1 # Parent a77082ebfa67b40f3dbb8cd45b54c17e710a104c x86: AVX512 convert_p2s_32xN Size| AVX2 performance | AVX512 performance 32x8| 1.51x | 1.54x 32x16 | 2.18x | 3.62x 32x24 | 2.26x | 3.58x 32x32 | 2.28x | 3.94x 32x64 | 2.20x | 4.06x diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jul 12 16:48:22 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 19 11:59:13 2017 +0530 @@ -3836,6 +3836,19 @@ p.pu[LUMA_64x32].convert_p2s = PFX(filterPixelToShort_64x32_avx512); p.pu[LUMA_64x48].convert_p2s = PFX(filterPixelToShort_64x48_avx512); p.pu[LUMA_64x64].convert_p2s = PFX(filterPixelToShort_64x64_avx512); +p.pu[LUMA_32x8].convert_p2s = PFX(filterPixelToShort_32x8_avx2); +p.pu[LUMA_32x16].convert_p2s = PFX(filterPixelToShort_32x16_avx512); +p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512); +p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512); +p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); } #endif diff -r a77082ebfa67 -r 60c8ad7f3cad source/common/x86/ipfilter8.asm --- 
a/source/common/x86/ipfilter8.asm Wed Jul 12 16:48:22 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Jul 19 11:59:13 2017 +0530 @@ -1956,6 +1956,184 @@ ;- ; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) ;- +%macro PROCESS_P2S_32x8_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + r1 * 2] +pmovzxbwm3, [r0 + r5] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 + +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r6], m3 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] + +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + r1 * 2] +pmovzxbwm3, [r0 + r5] + +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 + +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r6], m3 +%endmacro + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x8, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m4, [pw_2000] + +PROCESS_P2S_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x16, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m4, [pw_2000] + +PROCESS_P2S_32x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x24, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m4, [pw_2000] + +PROCESS_P2S_32x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_32x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_32x8_AVX512 +RET + +INIT_ZMM avx512 +cglobal filterPixelToShort_32x32, 3, 7, 5 +mov r3d, r3m +add r3d, r3d +lea r5,
[x265] [PATCH 033 of 307] x86: AVX512 fix convert_p2s_64xN,48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500536572 -19800 # Thu Jul 20 13:12:52 2017 +0530 # Node ID bf9a9cd255216300408506d10d4ff8bc87a15845 # Parent 97d5ab44b6da2db69584875c2dde97aef5533d9b x86: AVX512 fix convert_p2s_64xN,48x64 diff -r 97d5ab44b6da -r bf9a9cd25521 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed Jul 19 12:25:43 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Jul 20 13:12:52 2017 +0530 @@ -1953,9 +1953,6 @@ P2S_H_32xN_avx2 64 P2S_H_32xN_avx2 48 -;- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;- %macro PROCESS_P2S_32x8_AVX512 0 pmovzxbwm0, [r0] pmovzxbwm1, [r0 + r1] @@ -1999,6 +1996,9 @@ movu[r2 + r6], m3 %endmacro +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- INIT_ZMM avx512 cglobal filterPixelToShort_32x8, 3, 7, 5 mov r3d, r3m @@ -2446,9 +2446,6 @@ P2S_H_64xN_avx2 32 P2S_H_64xN_avx2 48 -;- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;- %macro PROCESS_P2S_64x8_AVX512 0 pmovzxbwm0, [r0] pmovzxbwm1, [r0 + mmsize/2] @@ -2526,6 +2523,9 @@ movu[r2 + r6 + mmsize], m3 %endmacro +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- INIT_ZMM avx512 cglobal filterPixelToShort_64x64, 3, 7, 5 mov r3d, r3m @@ -2561,14 +2561,14 @@ RET INIT_ZMM avx512 -cglobal filterPixelToShort_64x48, 3, 7, 9 +cglobal filterPixelToShort_64x48, 3, 7, 5 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load constant -vpbroadcastd m8, [pw_2000] +vpbroadcastd m4, [pw_2000] PROCESS_P2S_64x8_AVX512 lea r0, [r0 + r1 * 4] @@ -2589,14 +2589,14 @@ RET INIT_ZMM avx512 -cglobal filterPixelToShort_64x32, 3, 7, 9 +cglobal filterPixelToShort_64x32, 3, 7, 5 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load constant -vpbroadcastd m8, [pw_2000] +vpbroadcastd m4, [pw_2000] PROCESS_P2S_64x8_AVX512 lea r0, [r0 + r1 * 4] @@ 
-2611,14 +2611,14 @@ RET INIT_ZMM avx512 -cglobal filterPixelToShort_64x16, 3, 7, 9 +cglobal filterPixelToShort_64x16, 3, 7, 5 mov r3d, r3m add r3d, r3d lea r5, [r1 * 3] lea r6, [r3 * 3] ; load constant -vpbroadcastd m8, [pw_2000] +vpbroadcastd m4, [pw_2000] PROCESS_P2S_64x8_AVX512 lea r0, [r0 + r1 * 4] @@ -3047,9 +3047,6 @@ jnz.loop RET -;- -; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) -;- %macro PROCESS_P2S_48x8_AVX512 0 pmovzxbwm0, [r0] pmovzxbwm1, [r0 + r1] @@ -3123,6 +3120,9 @@ movu[r2 + r6 + 64], ym3 %endmacro +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- INIT_ZMM avx512 cglobal filterPixelToShort_48x64, 3,7,5 mov r3d, r3m @@ -3131,7 +3131,7 @@ lea r6, [r3 * 3] ; load constant -vpbroadcastd m8, [pw_2000] +vpbroadcastd m4, [pw_2000] PROCESS_P2S_48x8_AVX512 lea r0, [r0 + r1 * 4] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 034 of 307] x86: AVX512 ssd_ss_64x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500528397 -19800 # Thu Jul 20 10:56:37 2017 +0530 # Node ID 0320e60b3323546eb6767508f1c39cd088e9f03e # Parent bf9a9cd255216300408506d10d4ff8bc87a15845 x86: AVX512 ssd_ss_64x64 AVX2 performance : 14.85x AVX512 performance : 21.35x diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 20 13:12:52 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 20 10:56:37 2017 +0530 @@ -3851,6 +3851,8 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); + } #endif } diff -r bf9a9cd25521 -r 0320e60b3323 source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Thu Jul 20 13:12:52 2017 +0530 +++ b/source/common/x86/ssd-a.asm Thu Jul 20 10:56:37 2017 +0530 @@ -1377,7 +1377,124 @@ HADDD m2, m0 movdeax, xm2 RET +;- +; ssd_ss avx512 code start +;- +%macro PROCESS_SSD_SS_64x8_AVX512 0 +movum0, [r0] +movum1, [r0 + mmsize] +movum2, [r0 + r1] +movum3, [r0 + r1 + mmsize] +psubw m0, [r2] +psubw m1, [r2 + mmsize] +psubw m2, [r2 + r3] +psubw m3, [r2 + r3 + mmsize] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 + +movum0, [r0 + 2 * r1] +movum1, [r0 + 2 * r1 + mmsize] +movum2, [r0 + r5] +movum3, [r0 + r5 + mmsize] + +psubw m0, [r2 + 2 * r3] +psubw m1, [r2 + 2 * r3 + mmsize] +psubw m2, [r2 + r6] +psubw m3, [r2 + r6 + mmsize] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movum0, [r0] +movum1, [r0 + mmsize] +movum2, [r0 + r1] +movum3, [r0 + r1 + mmsize] + +psubw m0, [r2] +psubw m1, [r2 + mmsize] +psubw m2, [r2 + r3] +psubw m3, [r2 + r3 + mmsize] +pmaddwd m0, m0 
+pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 + +movum0, [r0 + 2 * r1] +movum1, [r0 + 2 * r1 + mmsize] +movum2, [r0 + r5] +movum3, [r0 + r5 + mmsize] + +psubw m0, [r2 + 2 * r3] +psubw m1, [r2 + 2 * r3 + mmsize] +psubw m2, [r2 + r6] +psubw m3, [r2 + r6 + mmsize] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 +%endmacro + +INIT_ZMM avx512 +cglobal pixel_ssd_ss_64x64, 4,7,6 +add r1d, r1d +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] +pxorm4, m4 +pxorm5, m5 + +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_64x8_AVX512 +paddd m4, m5 +HADDD m4, m0 +movdeax, xm4 +RET +;- +; ssd_ss avx512 code end +;- %endif ; !HIGH_BIT_DEPTH %if HIGH_BIT_DEPTH == 0 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 032 of 307] x86: AVX512 convert_p2s 48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500447343 -19800 # Wed Jul 19 12:25:43 2017 +0530 # Node ID 97d5ab44b6da2db69584875c2dde97aef5533d9b # Parent 60c8ad7f3cadcfe7bb5242a89908546ce38bb5d1 x86: AVX512 convert_p2s 48x64 AVX2 performance : 2.22x AVX512 performance: 3.01x diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Jul 19 11:59:13 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Jul 19 12:25:43 2017 +0530 @@ -3841,6 +3841,7 @@ p.pu[LUMA_32x24].convert_p2s = PFX(filterPixelToShort_32x24_avx512); p.pu[LUMA_32x32].convert_p2s = PFX(filterPixelToShort_32x32_avx512); p.pu[LUMA_32x64].convert_p2s = PFX(filterPixelToShort_32x64_avx512); +p.pu[LUMA_48x64].convert_p2s = PFX(filterPixelToShort_48x64_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].p2s = PFX(filterPixelToShort_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); diff -r 60c8ad7f3cad -r 97d5ab44b6da source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed Jul 19 11:59:13 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Wed Jul 19 12:25:43 2017 +0530 @@ -3047,6 +3047,115 @@ jnz.loop RET +;- +; void filterPixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int16_t dstStride) +;- +%macro PROCESS_P2S_48x8_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + r1 * 2] +pmovzxbwm3, [r0 + r5] +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r6], m3 + +pmovzxbwym0, [r0 + 32] +pmovzxbwym1, [r0 + r1 + 32] +pmovzxbwym2, [r0 + r1 * 2 + 32] +pmovzxbwym3, [r0 + r5 + 32] +psllw ym0, 6 +psllw ym1, 6 +psllw ym2, 6 +psllw ym3, 6 +psubw ym0, ym4 +psubw ym1, ym4 +psubw ym2, ym4 +psubw ym3, ym4 +movu[r2 + 64], ym0 +movu[r2 + r3 + 
64], ym1 +movu[r2 + r3 * 2 + 64], ym2 +movu[r2 + r6 + 64], ym3 + +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] + +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + r1 * 2] +pmovzxbwm3, [r0 + r5] +psllw m0, 6 +psllw m1, 6 +psllw m2, 6 +psllw m3, 6 +psubw m0, m4 +psubw m1, m4 +psubw m2, m4 +psubw m3, m4 +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r6], m3 + +pmovzxbwym0, [r0 + 32] +pmovzxbwym1, [r0 + r1 + 32] +pmovzxbwym2, [r0 + r1 * 2 + 32] +pmovzxbwym3, [r0 + r5 + 32] +psllw ym0, 6 +psllw ym1, 6 +psllw ym2, 6 +psllw ym3, 6 +psubw ym0, ym4 +psubw ym1, ym4 +psubw ym2, ym4 +psubw ym3, ym4 +movu[r2 + 64], ym0 +movu[r2 + r3 + 64], ym1 +movu[r2 + r3 * 2 + 64], ym2 +movu[r2 + r6 + 64], ym3 +%endmacro + +INIT_ZMM avx512 +cglobal filterPixelToShort_48x64, 3,7,5 +mov r3d, r3m +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] + +; load constant +vpbroadcastd m8, [pw_2000] + +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +lea r0, [r0 + r1 * 4] +lea r2, [r2 + r3 * 4] +PROCESS_P2S_48x8_AVX512 +RET %macro PROCESS_LUMA_W4_4R 0 movdm0, [r0] ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 036 of 307] x86: AVX512 blockcopy_ss_64x64
# HG changeset patch # User Jayashri Murugan # Date 1499162011 -19800 # Tue Jul 04 15:23:31 2017 +0530 # Node ID 3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392 # Parent 2eda6628c75302a10d59918a58740d6e27434293 x86: AVX512 blockcopy_ss_64x64 AVX2 performance over C code : 1.32x AVX512 performance over C code : 3.00x diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 20 16:59:52 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 04 15:23:31 2017 +0530 @@ -3854,6 +3854,8 @@ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); +p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); + } #endif } diff -r 2eda6628c753 -r 3e3a44c6d77c source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Thu Jul 20 16:59:52 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Jul 04 15:23:31 2017 +0530 @@ -4462,6 +4462,154 @@ BLOCKCOPY_SS_W64_H4_avx 64, 48 BLOCKCOPY_SS_W64_H4_avx 64, 64 +%macro PROCESS_BLOCKCOPY_SS_W64_H8_avx512 0 +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r2 + r3] +movum3, [r2 + r3 + mmsize] + +movu[r0], m0 +movu[r0 + mmsize], m1 +movu[r0 + r1], m2 +movu[r0 + r1 + mmsize], m3 + +movum0, [r2 + 2 * r3] +movum1, [r2 + 2 * r3 + mmsize] +movum2, [r2 + r6] +movum3, [r2 + r6 + mmsize] +lea r2, [r2 + 4 * r3] + +movu[r0 + 2 * r1], m0 +movu[r0 + 2 * r1 + mmsize], m1 +movu[r0 + r5], m2 +movu[r0 + r5 + mmsize], m3 +lea r0, [r0 + 4 * r1] + +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r2 + r3] +movum3, [r2 + r3 + mmsize] + +movu[r0], m0 +movu[r0 + mmsize], m1 +movu[r0 + r1], m2 +movu[r0 + r1 + mmsize], m3 + +movum0, [r2 + 2 * r3] +movum1, [r2 + 2 * r3 + mmsize] +movum2, [r2 + r6] +movum3, [r2 + r6 + mmsize] +lea r2, [r2 + 4 * r3] + +movu[r0 + 2 * r1], m0 +movu[r0 + 2 * r1 + mmsize], m1 +movu[r0 + r5], m2 +movu[r0 + r5 + mmsize], m3 +lea r0, [r0 + 4 * r1] +%endmacro + +%macro 
PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 0 +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r2 + r3] +movum3, [r2 + r3 + mmsize] + +movu[r0], m0 +movu[r0 + mmsize], m1 +movu[r0 + r1], m2 +movu[r0 + r1 + mmsize], m3 + +movum0, [r2 + 2 * r3] +movum1, [r2 + 2 * r3 + mmsize] +movum2, [r2 + r6] +movum3, [r2 + r6 + mmsize] +lea r2, [r2 + 4 * r3] + +movu[r0 + 2 * r1], m0 +movu[r0 + 2 * r1 + mmsize], m1 +movu[r0 + r5], m2 +movu[r0 + r5 + mmsize], m3 +lea r0, [r0 + 4 * r1] + +movum0, [r2] +movum1, [r2 + mmsize] +movum2, [r2 + r3] +movum3, [r2 + r3 + mmsize] + +movu[r0], m0 +movu[r0 + mmsize], m1 +movu[r0 + r1], m2 +movu[r0 + r1 + mmsize], m3 + +movum0, [r2 + 2 * r3] +movum1, [r2 + 2 * r3 + mmsize] +movum2, [r2 + r6] +movum3, [r2 + r6 + mmsize] + +movu[r0 + 2 * r1], m0 +movu[r0 + 2 * r1 + mmsize], m1 +movu[r0 + r5], m2 +movu[r0 + r5 + mmsize], m3 +%endmacro + +;- +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) +;- +INIT_ZMM avx512 +cglobal blockcopy_ss_64x16, 4, 7, 4 +add r1, r1 +add r3, r3 +lea r5, [3 * r1] +lea r6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_64x32, 4, 7, 4 +add r1, r1 +add r3, r3 +lea r5, [3 * r1] +lea r6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_64x48, 4, 7, 4 +add r1, r1 +add r3, r3 +lea r5, [3 * r1] +lea r6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_avx512 +PROCESS_BLOCKCOPY_SS_W64_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal
[x265] [PATCH 028 of 307] x86: AVX512 pixel_sad_x3_48x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1500260234 -19800 # Mon Jul 17 08:27:14 2017 +0530 # Node ID 229c13a0d7e4a1dafad7b0a2e9eef041ecccdb77 # Parent 5a2d94db6fcaabf532f00848a72fa337bb5e65ac x86: AVX512 pixel_sad_x3_48x64 AVX2 performance : 59.91x AVX512 performance: 61.95x diff -r 5a2d94db6fca -r 229c13a0d7e4 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sun Jul 16 18:05:11 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Jul 17 08:27:14 2017 +0530 @@ -3745,6 +3745,7 @@ p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512); +p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512); p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); diff -r 5a2d94db6fca -r 229c13a0d7e4 source/common/x86/sad-a.asm --- a/source/common/x86/sad-a.asm Sun Jul 16 18:05:11 2017 +0530 +++ b/source/common/x86/sad-a.asm Mon Jul 17 08:27:14 2017 +0530 @@ -6306,6 +6306,125 @@ paddd m2, m3 %endmacro +%macro SAD_X3_48x8_AVX512 0 +movuym3, [r0] +vinserti32x8m3, [r0 + FENC_STRIDE], 1 +movuym4, [r1] +vinserti32x8m4, [r1 + r4], 1 +movuym5, [r2] +vinserti32x8m5, [r2 + r4], 1 +movuym6, [r3] +vinserti32x8m6, [r3 + r4], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movuym3, [r0 + FENC_STRIDE * 2] +vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1 +movuym4, [r1 + r4 * 2] +vinserti32x8 m4, [r1 + r6], 1 +movuym5, [r2 + r4 * 2] +vinserti32x8 m5, [r2 + r6], 1 +movuym6, [r3 + r4 * 2] +vinserti32x8 m6, [r3 + r6], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movu xm3, [r0 + 32] +vinserti32x4m3, [r0 + FENC_STRIDE + 32], 1 +vinserti32x4m3, [r0 + 2 * FENC_STRIDE + 32], 2 +vinserti32x4m3, [r0 + 3 * FENC_STRIDE + 32], 3 +movu xm4, [r1 + 32] +vinserti32x4m4, [r1 + r4 + 32], 
1 +vinserti32x4m4, [r1 + 2 * r4 + 32], 2 +vinserti32x4m4, [r1 + r6 + 32], 3 +movu xm5, [r2 + 32] +vinserti32x4m5, [r2 + r4 + 32], 1 +vinserti32x4m5, [r2 + 2 * r4 + 32], 2 +vinserti32x4m5, [r2 + r6 + 32], 3 +movu xm6, [r3 + 32] +vinserti32x4m6, [r3 + r4 + 32], 1 +vinserti32x4m6, [r3 + 2 * r4 + 32], 2 +vinserti32x4m6, [r3 + r6 + 32], 3 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +add r0, FENC_STRIDE * 4 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +movuym3, [r0] +vinserti32x8m3, [r0 + FENC_STRIDE], 1 +movuym4, [r1] +vinserti32x8m4, [r1 + r4], 1 +movuym5, [r2] +vinserti32x8m5, [r2 + r4], 1 +movuym6, [r3] +vinserti32x8m6, [r3 + r4], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movuym3, [r0 + FENC_STRIDE * 2] +vinserti32x8 m3, [r0 + FENC_STRIDE * 3], 1 +movuym4, [r1 + r4 * 2] +vinserti32x8 m4, [r1 + r6], 1 +movuym5, [r2 + r4 * 2] +vinserti32x8 m5, [r2 + r6], 1 +movuym6, [r3 + r4 * 2] +vinserti32x8 m6, [r3 + r6], 1 + +psadbw m7, m3, m4 +paddd m0, m7 +psadbw m4, m3, m5 +paddd m1, m4 +psadbw m3, m6 +paddd m2, m3 + +movu xm3, [r0 + 32] +vinserti32x4m3, [r0 + FENC_STRIDE + 32], 1 +vinserti32x4m3, [r0 + 2 * FENC_STRIDE + 32], 2 +vinserti32x4m3, [r0 + 3 * FENC_STRIDE + 32], 3 +movu xm4, [r1 + 32] +vinserti32x4m4, [r1 + r4 + 32], 1 +vinserti32x4m4, [r1 + 2 * r4 + 32], 2 +vinserti32x4m4, [r1 + r6 + 32], 3 +movu xm5, [r2 + 32] +vinserti32x4m5, [r2 + r4 + 32], 1 +vinserti32x4m5, [r2 + 2 * r4 + 32], 2 +vinserti32x4m5, [r2 + r6 + 32], 3 +movu xm6, [r3 + 32] +vinserti32x4m6, [r3 + r4 + 32], 1 +vinserti32x4m6, [r3 + 2 * r4 + 32], 2 +vinserti32x4m6, [r3 + r6 + 32], 3 + +psadbw m7,
[x265] [PATCH 035 of 307] x86: AVX512 ssd_ss_32x32
# HG changeset patch # User Vignesh Vijayakumar # Date 1500550192 -19800 # Thu Jul 20 16:59:52 2017 +0530 # Node ID 2eda6628c75302a10d59918a58740d6e27434293 # Parent 0320e60b3323546eb6767508f1c39cd088e9f03e x86: AVX512 ssd_ss_32x32 AVX2 performance : 12.73x AVX512 performance : 19.72x diff -r 0320e60b3323 -r 2eda6628c753 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Jul 20 10:56:37 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Jul 20 16:59:52 2017 +0530 @@ -3852,6 +3852,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); +p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); } #endif diff -r 0320e60b3323 -r 2eda6628c753 source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Thu Jul 20 10:56:37 2017 +0530 +++ b/source/common/x86/ssd-a.asm Thu Jul 20 16:59:52 2017 +0530 @@ -1457,6 +1457,47 @@ paddd m5, m3 %endmacro +%macro PROCESS_SSD_SS_32x8_AVX512 0 +movum0, [r0] +movum1, [r0 + r1] +movum2, [r0 + 2 * r1] +movum3, [r0 + r5] + +psubw m0, [r2] +psubw m1, [r2 + r3] +psubw m2, [r2 + 2 * r3] +psubw m3, [r2 + r6] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 + +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] + +movum0, [r0] +movum1, [r0 + r1] +movum2, [r0 + 2 * r1] +movum3, [r0 + r5] + +psubw m0, [r2] +psubw m1, [r2 + r3] +psubw m2, [r2 + 2 * r3] +psubw m3, [r2 + r6] +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m4, m0 +paddd m5, m1 +paddd m4, m2 +paddd m5, m3 +%endmacro + INIT_ZMM avx512 cglobal pixel_ssd_ss_64x64, 4,7,6 add r1d, r1d @@ -1492,6 +1533,30 @@ HADDD m4, m0 movdeax, xm4 RET + +INIT_ZMM avx512 +cglobal pixel_ssd_ss_32x32, 4,7,6 +add r1d, r1d +add r3d, r3d +lea r5, [r1 * 3] +lea r6, [r3 * 3] +pxorm4, m4 +pxorm5, m5 + +PROCESS_SSD_SS_32x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, 
[r2 + 4 * r3] +PROCESS_SSD_SS_32x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_32x8_AVX512 +lea r0, [r0 + 4 * r1] +lea r2, [r2 + 4 * r3] +PROCESS_SSD_SS_32x8_AVX512 +paddd m4, m5 +HADDD m4, m0 +movdeax, xm4 +RET ;- ; ssd_ss avx512 code end ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 038 of 307] x86: AVX512 getResidual32
# HG changeset patch # User Jayashri Murugan# Date 1500627732 -19800 # Fri Jul 21 14:32:12 2017 +0530 # Node ID 49123506b563fd44378e856e6833c77812d0349e # Parent ef8989f43083cd5195ff3ba360959fe3900399e5 x86: AVX512 getResidual32 BIT_DEPTH = 8 AVX2 performance over C code : 2.99x AVX512 performance over C code : 5.46x HIGH_BIT_DEPTH AVX2 performance over C code : 3.10x AVX512 performance over C code : 5.60x diff -r ef8989f43083 -r 49123506b563 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 04 18:02:59 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 21 14:32:12 2017 +0530 @@ -3723,6 +3723,7 @@ p.integral_inith[INTEGRAL_24] = PFX(integral24h_avx2); p.integral_inith[INTEGRAL_32] = PFX(integral32h_avx2); +p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); } if (cpuMask & X265_CPU_AVX512) { @@ -3859,6 +3860,8 @@ p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512); p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); +p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx512); + } #endif } diff -r ef8989f43083 -r 49123506b563 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Tue Jul 04 18:02:59 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Fri Jul 21 14:32:12 2017 +0530 @@ -554,6 +554,135 @@ %endrep RET %endif + +%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512 0 +movum0, [r0] +movum1, [r0 + r3] +movum2, [r0 + r3 * 2] +movum3, [r0 + r4] +lea r0, [r0 + r3 * 4] + +movum4, [r1] +movum5, [r1 + r3] +movum6, [r1 + r3 * 2] +movum7, [r1 + r4] +lea r1, [r1 + r3 * 4] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 + +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r4], m3 +lea r2, [r2 + r3 * 4] +%endmacro + +%macro PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END 0 +movum0, [r0] +movum1, [r0 + r3] +movum2, [r0 + r3 * 2] +movum3, [r0 + r4] + +movum4, [r1] +movum5, [r1 + r3] +movum6, [r1 + r3 * 2] +movum7, [r1 + r4] + +psubw m0, m4 +psubw 
m1, m5 +psubw m2, m6 +psubw m3, m7 + +movu[r2], m0 +movu[r2 + r3], m1 +movu[r2 + r3 * 2], m2 +movu[r2 + r4], m3 +%endmacro + +%macro PROCESS_GETRESIDUAL32_W4_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r3] +pmovzxbwm2, [r0 + r3 * 2] +pmovzxbwm3, [r0 + r4] +lea r0, [r0 + r3 * 4] + +pmovzxbwm4, [r1] +pmovzxbwm5, [r1 + r3] +pmovzxbwm6, [r1 + r3 * 2] +pmovzxbwm7, [r1 + r4] +lea r1, [r1 + r3 * 4] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 + +movu[r2], m0 +movu[r2 + r3 * 2], m1 +lea r2, [r2 + r3 * 4] +movu[r2], m2 +movu[r2 + r3 * 2], m3 +lea r2, [r2 + r3 * 4] +%endmacro + +%macro PROCESS_GETRESIDUAL32_W4_AVX512_END 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r3] +pmovzxbwm2, [r0 + r3 * 2] +pmovzxbwm3, [r0 + r4] + +pmovzxbwm4, [r1] +pmovzxbwm5, [r1 + r3] +pmovzxbwm6, [r1 + r3 * 2] +pmovzxbwm7, [r1 + r4] + +psubw m0, m4 +psubw m1, m5 +psubw m2, m6 +psubw m3, m7 + +movu[r2], m0 +movu[r2 + r3 * 2], m1 +lea r2, [r2 + r3 * 4] +movu[r2], m2 +movu[r2 + r3 * 2], m3 +%endmacro + + +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +cglobal getResidual32, 4,5,8 +add r3, r3 +lea r4, [r3 * 3] + +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512 +PROCESS_GETRESIDUAL32_W4_HBD_AVX512_END +RET +%else +INIT_ZMM avx512 +cglobal getResidual32, 4,5,8 +lea r4, [r3 * 3] + +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512 +PROCESS_GETRESIDUAL32_W4_AVX512_END +RET +%endif + ;- ; uint32_t quant(int16_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
[x265] [PATCH 037 of 307] x86: AVX512 blockcopy_ss_32xN
# HG changeset patch # User Jayashri Murugan# Date 1499171579 -19800 # Tue Jul 04 18:02:59 2017 +0530 # Node ID ef8989f43083cd5195ff3ba360959fe3900399e5 # Parent 3e3a44c6d77c0c0a7b3a084127a0dc6c835ff392 x86: AVX512 blockcopy_ss_32xN AVX2 performance over C code : 1.82x AVX512 performance over C code : 4.56x diff -r 3e3a44c6d77c -r ef8989f43083 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Jul 04 15:23:31 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Jul 04 18:02:59 2017 +0530 @@ -3854,6 +3854,9 @@ p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); +p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_avx512); +p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_avx512); p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_avx512); } diff -r 3e3a44c6d77c -r ef8989f43083 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Jul 04 15:23:31 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Tue Jul 04 18:02:59 2017 +0530 @@ -4164,6 +4164,143 @@ BLOCKCOPY_SS_W32_H4_avx 32, 48 BLOCKCOPY_SS_W32_H4_avx 32, 64 +%macro PROCESS_BLOCKCOPY_SS_W32_H8_avx512 0 +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r6] +lea r2, [r2 + 4 * r3] + +movu[r0], m0 +movu[r0 + r1], m1 +movu[r0 + 2 * r1], m2 +movu[r0 + r5], m3 +lea r0, [r0 + 4 * r1] + +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r6] +lea r2, [r2 + 4 * r3] + +movu[r0], m0 +movu[r0 + r1], m1 +movu[r0 + 2 * r1], m2 +movu[r0 + r5], m3 +lea r0, [r0 + 4 * r1] +%endmacro + +%macro PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 0 +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r6] +lea r2, [r2 + 4 * r3] + +movu[r0], m0 +movu[r0 + r1], m1 +movu[r0 + 2 * r1], m2 +movu[r0 + r5], m3 +lea r0, [r0 + 4 * 
r1] + +movum0, [r2] +movum1, [r2 + r3] +movum2, [r2 + 2 * r3] +movum3, [r2 + r6] + +movu[r0], m0 +movu[r0 + r1], m1 +movu[r0 + 2 * r1], m2 +movu[r0 + r5], m3 +%endmacro + +;- +; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) +;- +INIT_ZMM avx512 +cglobal blockcopy_ss_32x8, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x16, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x24, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x32, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x48, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + +INIT_ZMM avx512 +cglobal blockcopy_ss_32x64, 4, 7, 4 + +addr1, r1 +addr3, r3 +lear5, [3 * r1] +lear6, [3 * r3] + +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_avx512 +PROCESS_BLOCKCOPY_SS_W32_H8_LAST_avx512 +RET + ;- ; void blockcopy_ss_%1x%2(int16_t* dst, intptr_t dstStride,
[x265] [PATCH 079 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_48x64 for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1502187312 -19800 # Tue Aug 08 15:45:12 2017 +0530 # Node ID 95c8818a26eea8a17a6a9471f861b89ab9e210c6 # Parent aa1747a46469afe6fc2d5e6295a4b43a14ea [x265-avx512]x86: AVX512 pixel_sad_x3_48x64 for high bit depth AVX2 performance: 20.10x AVX512 performance: 36.00x diff -r aa1747a46469 -r 95c8818a26ee source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 08 11:18:41 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 15:45:12 2017 +0530 @@ -2302,6 +2302,7 @@ p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); +p.pu[LUMA_48x64].sad_x3 = PFX(pixel_sad_x3_48x64_avx512); p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512); p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); diff -r aa1747a46469 -r 95c8818a26ee source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Tue Aug 08 11:18:41 2017 +0530 +++ b/source/common/x86/sad16-a.asm Tue Aug 08 15:45:12 2017 +0530 @@ -2844,6 +2844,133 @@ PROCESS_SAD_X3_END_AVX512 RET +; +; int pixel_sad_x3_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res ) +; +INIT_ZMM avx512 +cglobal pixel_sad_x3_48x64, 4, 8, 17 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +mov r7d, 64/4 +vbroadcasti32x8 m16, [pw_1] + +add r4d, r4d +lea r6d, [r4 * 3] +.loop: +movum4, [r0] +movum5, [r0 + 2 * FENC_STRIDE] +movu ym6, [r0 + mmsize] +vinserti32x8m6, [r0 + 2 * FENC_STRIDE + mmsize], 1 +movum7, [r1] +movum8, [r1 + r4] +movu ym9, [r1 + mmsize] +vinserti32x8m9, [r1 + r4 + mmsize], 1 +movum10, [r2] +movum11, [r2 + r4] +movu ym12, [r2 + mmsize] +vinserti32x8m12, [r2 + r4 + mmsize], 1 +movum13, [r3] +movum14, [r3 + r4] +movu ym15, [r3 + mmsize] +vinserti32x8m15, [r3 + r4 + mmsize], 1 + +psubw m7, m4 +psubw 
m8, m5 +psubw m9, m6 +psubw m10, m4 +psubw m11, m5 +psubw m12, m6 +psubw m13, m4 +psubw m14, m5 +psubw m15, m6 + +pabsw m7, m7 +pabsw m8, m8 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +pabsw m15, m15 + +paddw m7, m8 +paddw m7, m9 +paddw m10, m11 +paddw m10, m12 +paddw m13, m14 +paddw m13, m15 + +pmaddwd m7, m16 +paddd m0, m7 +pmaddwd m10, m16 +paddd m1, m10 +pmaddwd m13, m16 +paddd m2, m13 + +movum4, [r0 + 4 * FENC_STRIDE] +movum5, [r0 + 6 * FENC_STRIDE] +movu ym6, [r0 + 4 * FENC_STRIDE + mmsize] +vinserti32x8m6, [r0 + 6 * FENC_STRIDE + mmsize], 1 +movum7, [r1 + 2 * r4] +movum8, [r1 + r6] +movu ym9, [r1 + 2 * r4 + mmsize] +vinserti32x8m9, [r1 + r6 + mmsize], 1 +movum10, [r2 + 2 * r4] +movum11, [r2 + r6] +movu ym12, [r2 + 2 * r4 + mmsize] +vinserti32x8m12, [r2 + r6 + mmsize], 1 +movum13, [r3 + 2 * r4] +movum14, [r3 + r6] +movu ym15, [r3 + 2 * r4 + mmsize] +vinserti32x8m15, [r3 + r6 + mmsize], 1 + +psubw m7, m4 +psubw m8, m5 +psubw m9, m6 +psubw m10, m4 +psubw m11, m5 +psubw m12, m6 +psubw m13, m4 +psubw m14, m5 +psubw m15, m6 + +pabsw m7, m7 +pabsw m8, m8 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +pabsw m15, m15 + +paddw m7, m8 +paddw m7, m9 +paddw m10, m11 +paddw m10, m12 +paddw m13, m14 +paddw m13, m15 + +pmaddwd m7, m16 +paddd m0, m7 +pmaddwd m10, m16 +paddd m1, m10 +pmaddwd m13, m16 +paddd m2, m13 + +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4] +lea r3, [r3 + r4 * 4] + +dec r7d +jg .loop + +PROCESS_SAD_X3_END_AVX512 +RET +
[x265] [PATCH 081 of 307] x86: AVX512 cleanup blockcopy_sp_64x64
# HG changeset patch # User Vignesh Vijayakumar # Date 1502709712 -19800 # Mon Aug 14 16:51:52 2017 +0530 # Node ID 5c18b655a88a739b87c6b071d186a2b9286b8266 # Parent 4a643ecb8c3bcc4dab96bfe56217d4449564bae0 x86: AVX512 cleanup blockcopy_sp_64x64 diff -r 4a643ecb8c3b -r 5c18b655a88a source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Tue Aug 08 17:01:50 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Mon Aug 14 16:51:52 2017 +0530 @@ -26,7 +26,10 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 + +ALIGN 64 +const shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7 cextern pb_4 cextern pb_1 @@ -2162,7 +2165,7 @@ BLOCKCOPY_SP_W64_H4_avx2 64, 64 -%macro PROCESS_BLOCKCOPY_SP_64x8_AVX512 0 +%macro PROCESS_BLOCKCOPY_SP_64x4_AVX512 0 movu m0, [r2] movu m1, [r2 + 64] movu m2, [r2 + r3] @@ -2170,10 +2173,8 @@ packuswb m0, m1 packuswb m2, m3 -vpermq m0, m0, 11011000b -vpermq m2, m2, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m2, m2, 11011000b +vpermq m0, m4, m0 +vpermq m2, m4, m2 movu [r0], m0 movu [r0 + r1], m2 @@ -2184,73 +2185,25 @@ packuswb m0, m1 packuswb m2, m3 -vpermq m0, m0, 11011000b -vpermq m2, m2, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m2, m2, 11011000b -movu [r0 + 2 * r1], m0 -movu [r0 + r5], m2 - -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] - -movu m0, [r2] -movu m1, [r2 + 64] -movu m2, [r2 + r3] -movu m3, [r2 + r3 + 64] - -packuswb m0, m1 -packuswb m2, m3 -vpermq m0, m0, 11011000b -vpermq m2, m2, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m2, m2, 11011000b -movu [r0], m0 -movu [r0 + r1], m2 - -movu m0, [r2 + 2 * r3] -movu m1, [r2 + 2 * r3 + 64] -movu m2, [r2 + r4] -movu m3, [r2 + r4 + 64] - -packuswb m0, m1 -packuswb m2, m3 -vpermq m0, m0, 11011000b -vpermq m2, m2, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m2, m2, 11011000b +vpermq m0, m4, m0 +vpermq m2, m4, m2 movu [r0 + 2 * r1], m0 movu [r0 + r5], m2 %endmacro INIT_ZMM avx512 -cglobal blockcopy_sp_64x64, 4, 6, 4 +cglobal 
blockcopy_sp_64x64, 4, 6, 5 +mova m4, [shuf1_avx512] addr3, r3 lear4, [3 * r3] lear5, [3 * r1] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 +%rep 15 +PROCESS_BLOCKCOPY_SP_64x4_AVX512 lear0, [r0 + 4 * r1] lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 -lear0, [r0 + 4 * r1] -lear2, [r2 + 4 * r3] -PROCESS_BLOCKCOPY_SP_64x8_AVX512 +%endrep +PROCESS_BLOCKCOPY_SP_64x4_AVX512 RET ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 084 of 307] x86: AVX512 interp_4tap_horiz_ps_64xN
# HG changeset patch # User Jayashri Murugan # Date 1502430475 25200 # Thu Aug 10 22:47:55 2017 -0700 # Node ID 951e9a16296e5d1e528c0083630fde8122bd15c1 # Parent 3d8c45642752803c560891fdfbe0a8b5c03ca76a x86: AVX512 interp_4tap_horiz_ps_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 26.50x | 35.13x 64x32 | 25.48x | 38.62x 64x48 | 27.52x | 40.34x 64x64 | 27.85x | 40.43x diff -r 3d8c45642752 -r 951e9a16296e source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 11 14:36:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 10 22:47:55 2017 -0700 @@ -4029,6 +4029,11 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); p.weight_pp = PFX(weight_pp_avx512); +//i444 chroma_hps +p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hps = PFX(interp_4tap_horiz_ps_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512); } #endif } diff -r 3d8c45642752 -r 951e9a16296e source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Fri Aug 11 14:36:18 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Thu Aug 10 22:47:55 2017 -0700 @@ -26,7 +26,7 @@ %include "x86inc.asm" %include "x86util.asm" -SECTION_RODATA 32 +SECTION_RODATA 64 const tab_Tm,db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 db 8, 9,10,11, 9,10,11,12,10,11,12,13,11,12,13, 14 @@ -152,6 +152,9 @@ const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 +ALIGN 64 +const interp8_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7 + SECTION .text cextern pb_128 @@ -9836,7 +9839,7 @@ FILTER_VER_LUMA_S_AVX2_32x24 sp FILTER_VER_LUMA_S_AVX2_32x24 ss ;- -;ipfilter_chroma_pp_avx512 code start 
+;ipfilter_chroma_avx512 code start ;- %macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0 ; register map @@ -9976,6 +9979,86 @@ IPFILTER_CHROMA_PP_32xN_AVX512 32 IPFILTER_CHROMA_PP_32xN_AVX512 64 IPFILTER_CHROMA_PP_32xN_AVX512 48 + +%macro PROCESS_IPFILTER_CHROMA_PS_64x1_AVX512 0 +movu ym6, [r0] +vinserti32x8 m6, [r0 + 4], 1 +pshufb m7, m6, m2 +pshufb m6, m1 +pmaddubsw m6, m0 +pmaddubsw m7, m0 +pmaddwdm6, m3 +pmaddwdm7, m3 + +movu ym8, [r0 + 32] +vinserti32x8 m8, [r0 + 36], 1 +pshufb m9, m8, m2 +pshufb m8, m1 +pmaddubsw m8, m0 +pmaddubsw m9, m0 +pmaddwdm8, m3 +pmaddwdm9, m3 + +packssdw m6, m7 +packssdw m8, m9 +psubw m6, m4 +psubw m8, m4 +vpermq m6, m10, m6 +vpermq m8, m10, m8 +movu [r2], m6 +movu [r2 + mmsize],m8 +%endmacro + ;- -;ipfilter_chroma_pp_avx512 code end +; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;- +%macro IPFILTER_CHROMA_PS_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_ps_64x%1, 4,7,11 +mov r4d, r4m +mov r5d, r5m + +%ifdef PIC +lea r6, [tab_ChromaCoeff] +vpbroadcastd m0, [r6 + r4 * 4] +%else +vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +vbroadcasti32x8m1, [interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8m2, [interp4_horiz_shuf_load2_avx512] +vbroadcasti32x8m3, [pw_1] +vbroadcasti32x8m4, [pw_2000] +mova m10, [interp8_hps_shuf_avx512] + +; register map +; m0- interpolate coeff +; m1,m2 - load shuffle order table +;
[x265] [PATCH 082 of 307] x86: AVX512 blockcopy_sp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502711388 -19800 # Mon Aug 14 17:19:48 2017 +0530 # Node ID b30539ebe5c9b2d9412d3a39458a90a7574ac744 # Parent 5c18b655a88a739b87c6b071d186a2b9286b8266 x86: AVX512 blockcopy_sp_32xN Size | AVX2 performance | AVX512 performance -- 32x32 | 6.77x | 11.27x i420 32x32 | 6.73x | 11.43x i422 32x64 | 6.68x | 12.19x diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 14 16:51:52 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 14 17:19:48 2017 +0530 @@ -3948,6 +3948,10 @@ p.pu[LUMA_64x16].copy_pp = PFX(blockcopy_pp_64x16_avx512); p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_avx512); +p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512); +p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_avx512); +p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_avx512); + p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx512); p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx512); diff -r 5c18b655a88a -r b30539ebe5c9 source/common/x86/blockcopy8.asm --- a/source/common/x86/blockcopy8.asm Mon Aug 14 16:51:52 2017 +0530 +++ b/source/common/x86/blockcopy8.asm Mon Aug 14 17:19:48 2017 +0530 @@ -2191,6 +2191,25 @@ movu [r0 + r5], m2 %endmacro +%macro PROCESS_BLOCKCOPY_SP_32x4_AVX512 0 +movu m0, [r2] +movu m1, [r2 + r3] +movu m2, [r2 + 2 * r3] +movu m3, [r2 + r4] + +packuswb m0, m1 +packuswb m2, m3 +vpermq m0, m4, m0 +vpermq m2, m4, m2 +movu [r0], ym0 +vextracti32x8 [r0 + r1], m0, 1 +movu [r0 + 2 * r1], ym2 +vextracti32x8 [r0 + r5], m2, 1 +%endmacro + +;- +; void blockcopy_sp_%1x%2(pixel* dst, intptr_t dstStride, const int16_t* src, intptr_t srcStride) +;- INIT_ZMM avx512 cglobal blockcopy_sp_64x64, 4, 6, 5 mova m4, [shuf1_avx512] @@ -2206,6 +2225,26 @@ 
PROCESS_BLOCKCOPY_SP_64x4_AVX512 RET +%macro BLOCKCOPY_SP_32xN_AVX512 1 +INIT_ZMM avx512 +cglobal blockcopy_sp_32x%1, 4, 6, 5 +mova m4, [shuf1_avx512] +addr3, r3 +lear4, [3 * r3] +lear5, [3 * r1] + +%rep %1/4 - 1 +PROCESS_BLOCKCOPY_SP_32x4_AVX512 +lear0, [r0 + 4 * r1] +lear2, [r2 + 4 * r3] +%endrep +PROCESS_BLOCKCOPY_SP_32x4_AVX512 +RET +%endmacro + +BLOCKCOPY_SP_32xN_AVX512 32 +BLOCKCOPY_SP_32xN_AVX512 64 + ;- ; void blockfill_s_4x4(int16_t* dst, intptr_t dstride, int16_t val) ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 074 of 307] x86: AVX512 interp_4tap_horiz_pp_64xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1501222403 -19800 # Fri Jul 28 11:43:23 2017 +0530 # Node ID 563b3c4f91eb20374311ed18fb18ad12aeebaf26 # Parent 7d7f2a4e771c7c2b573db9bc298d1a35bb72f32d x86: AVX512 interp_4tap_horiz_pp_64xN Size | AVX2 performance | AVX512 performance -- 64x16 | 21.45x | 39.29x 64x32 | 22.27x | 39.37x 64x48 | 22.76x | 40.75x 64x64 | 22.76x | 40.90x diff -r 7d7f2a4e771c -r 563b3c4f91eb source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 08 15:25:11 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Jul 28 11:43:23 2017 +0530 @@ -3996,6 +3996,11 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512); + } #endif } diff -r 7d7f2a4e771c -r 563b3c4f91eb source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Tue Aug 08 15:25:11 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Jul 28 11:43:23 2017 +0530 @@ -137,6 +137,10 @@ const interp8_hps_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 +const interp4_horiz_shuf_load1_avx512, times 2 db 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 + +const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 + SECTION .text cextern pb_128 @@ -9820,3 +9824,75 @@ FILTER_VER_LUMA_S_AVX2_32x24 sp FILTER_VER_LUMA_S_AVX2_32x24 ss +;- +;ipfilter_chroma_pp_avx512 code start +;- +%macro PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 0 +; register map +; m0 - interpolate coeff +; m1, m2 - shuffle order table +; m3 - constant word 1 +; m4 - constant word 512 + +movu m5, [r0] +pshufb 
m6, m5, m2 +pshufb m5, m5, m1 +pmaddubsw m5, m0 +pmaddubsw m6, m0 +pmaddwdm5, m3 +pmaddwdm6, m3 + +movu m7, [r0 + 4] +pshufb m8, m7, m2 +pshufb m7, m7, m1 +pmaddubsw m7, m0 +pmaddubsw m8, m0 +pmaddwdm7, m3 +pmaddwdm8, m3 + +packssdw m5, m7 +packssdw m6, m8 +pmulhrsw m5, m4 +pmulhrsw m6, m4 +packuswb m5, m6 +movu [r2], m5 +%endmacro + +;- +; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx +;- +%macro IPFILTER_CHROMA_PP_64xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_pp_64x%1, 4,6,9 +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +vpbroadcastd m0, [r5 + r4 * 4] +%else +vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512] +vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512] +vbroadcasti32x8 m3, [pw_1] +vbroadcasti32x8 m4, [pw_512] +dec r0 + +%rep %1 - 1 +PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 +lea r2, [r2 + r3] +lea r0, [r0 + r1] +%endrep +PROCESS_IPFILTER_CHROMA_PP_64x1_AVX512 +RET +%endmacro + +IPFILTER_CHROMA_PP_64xN_AVX512 64 +IPFILTER_CHROMA_PP_64xN_AVX512 32 +IPFILTER_CHROMA_PP_64xN_AVX512 48 +IPFILTER_CHROMA_PP_64xN_AVX512 16 + +;- +;ipfilter_chroma_pp_avx512 code end +;- ___ x265-devel mailing list
[x265] [PATCH 080 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_48x64 for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1502191910 -19800 # Tue Aug 08 17:01:50 2017 +0530 # Node ID 4a643ecb8c3bcc4dab96bfe56217d4449564bae0 # Parent 95c8818a26eea8a17a6a9471f861b89ab9e210c6 [x265-avx512]x86: AVX512 pixel_sad_x4_48x64 for high bit depth AVX2 performance: 19.96x AVX512 performance: 34.24x diff -r 95c8818a26ee -r 4a643ecb8c3b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Aug 08 15:45:12 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 17:01:50 2017 +0530 @@ -2313,6 +2313,7 @@ p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); +p.pu[LUMA_48x64].sad_x4 = PFX(pixel_sad_x4_48x64_avx512); p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512); p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); diff -r 95c8818a26ee -r 4a643ecb8c3b source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Tue Aug 08 15:45:12 2017 +0530 +++ b/source/common/x86/sad16-a.asm Tue Aug 08 17:01:50 2017 +0530 @@ -3487,6 +3487,165 @@ RET ; +; void pixel_sad_x4_48x64( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, const pixel* pix5, intptr_t frefstride, int32_t* res ) +; +INIT_ZMM avx512 +cglobal pixel_sad_x4_48x64, 4, 9, 20 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 +pxorm3, m3 +mov r8d, 64/4 + +vbroadcasti32x8 m19, [pw_1] + +add r5d, r5d +lea r7d, [r5 * 3] +.loop: +movum4, [r0] +movum5, [r0 + 2 * FENC_STRIDE] +movu ym6, [r0 + mmsize] +vinserti32x8m6, [r0 + 2 * FENC_STRIDE + mmsize], 1 +movum7, [r1] +movum8, [r1 + r5] +movu ym9, [r1 + mmsize] +vinserti32x8m9, [r1 + r5 + mmsize], 1 +movum10, [r2] +movum11, [r2 + r5] +movu ym12, [r2 + mmsize] +vinserti32x8m12, [r2 + r5 + mmsize], 1 +movum13, [r3] +movum14, [r3 + r5] +movu ym15, [r3 + mmsize] +vinserti32x8m15, [r3 + r5 + mmsize], 1 +movum16, [r4] 
+movum17, [r4 + r5] +movu ym18, [r4 + mmsize] +vinserti32x8m18, [r4 + r5 + mmsize], 1 + +psubw m7, m4 +psubw m8, m5 +psubw m9, m6 +psubw m10, m4 +psubw m11, m5 +psubw m12, m6 +psubw m13, m4 +psubw m14, m5 +psubw m15, m6 +psubw m16, m4 +psubw m17, m5 +psubw m18, m6 + +pabsw m7, m7 +pabsw m8, m8 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +pabsw m15, m15 +pabsw m16, m16 +pabsw m17, m17 +pabsw m18, m18 + +paddw m7, m8 +paddw m7, m9 +paddw m10, m11 +paddw m10, m12 +paddw m13, m14 +paddw m13, m15 +paddw m16, m17 +paddw m16, m18 + +pmaddwd m7, m19 +paddd m0, m7 +pmaddwd m10, m19 +paddd m1, m10 +pmaddwd m13, m19 +paddd m2, m13 +pmaddwd m16, m19 +paddd m3, m16 + +movum4, [r0 + 4 * FENC_STRIDE] +movum5, [r0 + 6 * FENC_STRIDE] +movu ym6, [r0 + 4 * FENC_STRIDE + mmsize] +vinserti32x8m6, [r0 + 6 * FENC_STRIDE + mmsize], 1 +movum7, [r1 + 2 * r5] +movum8, [r1 + r7] +movu ym9, [r1 + 2 * r5 + mmsize] +vinserti32x8m9, [r1 + r7 + mmsize], 1 +movum10, [r2 + 2 * r5] +movum11, [r2 + r7] +movu ym12, [r2 + 2 * r5 + mmsize] +vinserti32x8m12, [r2 + r7 + mmsize], 1 +movum13, [r3 + 2 * r5] +movum14, [r3 + r7] +movu ym15, [r3 + 2 * r5 + mmsize] +vinserti32x8m15, [r3 + r7 + mmsize], 1 +movum16, [r4 + 2 * r5] +movum17, [r4 + r7] +movu ym18, [r4 + 2 * r5 + mmsize] +vinserti32x8m18, [r4 + r7 + mmsize], 1 + + +psubw m7, m4 +psubw m8, m5 +psubw m9, m6 +psubw m10, m4 +psubw m11, m5 +psubw m12, m6 +psubw m13, m4 +psubw m14, m5 +psubw m15, m6 +psubw m16, m4 +psubw m17, m5 +psubw m18, m6 + +pabsw m7, m7 +pabsw m8,
[x265] [PATCH 083 of 307] [x265-avx512]x86: AVX512 weight_pp
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1502442378 -19800 # Fri Aug 11 14:36:18 2017 +0530 # Node ID 3d8c45642752803c560891fdfbe0a8b5c03ca76a # Parent b30539ebe5c9b2d9412d3a39458a90a7574ac744 [x265-avx512]x86: AVX512 weight_pp BitDepth | AVX2 performance | AVX512 performance 8 | 6.23x| 10.60x 10 | 9.43x| 14.59x diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 14 17:19:48 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 11 14:36:18 2017 +0530 @@ -2322,6 +2322,7 @@ p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); +p.weight_pp = PFX(weight_pp_avx512); } } @@ -4026,6 +4027,7 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); +p.weight_pp = PFX(weight_pp_avx512); } #endif diff -r b30539ebe5c9 -r 3d8c45642752 source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Mon Aug 14 17:19:48 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Fri Aug 11 14:36:18 2017 +0530 @@ -1662,6 +1662,116 @@ jnz .loopH RET %endif + +%if HIGH_BIT_DEPTH +INIT_ZMM avx512 +cglobal weight_pp, 6, 7, 7 +%define correction (14 - BIT_DEPTH) +mov r6d, r6m +shl r6d, 16 - correction +or r6d, r5d + +movd xm0, r6d +vpbroadcastd m0, xm0 +mov r5d, r7m +sub r5d, correction +movd xm1, r5d + +vpbroadcastdm2, r8m +vbroadcasti32x8 m5, [pw_1] +vbroadcasti32x8 m6, [pw_pixel_max] + +add r2d, r2d +add r3d, r3d +sub r2d, r3d +shr r3d, 6 + +.loopH: +mov r5d, r3d + +.loopW: +movum4, [r0] +punpcklwd m3, m4, m5 +pmaddwd m3, m0 +psrad m3, xm1 +paddd m3, m2 + +punpckhwd m4, m5 +pmaddwd m4, m0 +psrad m4, xm1 +paddd 
m4, m2 + +packusdwm3, m4 +pminuw m3, m6 +movu[r1], m3 + +add r0, 64 +add r1, 64 + +dec r5d +jnz .loopW + +lea r0, [r0 + r2] +lea r1, [r1 + r2] + +dec r4d +jnz .loopH +%undef correction +RET +%else +INIT_ZMM avx512 +cglobal weight_pp, 6, 7, 6 + +shl r5d, 6 +mov r6d, r6m +shl r6d, 16 +or r6d, r5d + +movd xm0, r6d +vpbroadcastd m0, xm0 +movd xm1, r7m +vpbroadcastd m2, r8m + +vbroadcasti32x8 m5, [pw_1] + +sub r2d, r3d +shr r3d, 5 + +.loopH: +mov r5d, r3d + +.loopW: +pmovzxbwm4, [r0] +punpcklwd m3, m4, m5 +pmaddwd m3, m0 +psrad m3, xm1 +paddd m3, m2 + +punpckhwd m4, m5 +pmaddwd m4, m0 +psrad m4, xm1 +paddd m4, m2 + +packssdw m3, m4 +vextracti64x4 ym4, m3, 1 +packuswb ym3, ym4 +vpermqym3, ym3, q3120 +movu [r1], ym3 + +add r0, 32 +add r1, 32 + +dec r5d +jnz .loopW + +lea r0, [r0 + r2] +lea r1, [r1 + r2] + +dec r4d +jnz .loopH +RET +%endif + ;- ;void weight_sp(int16_t *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset) ;- diff -r b30539ebe5c9 -r 3d8c45642752 source/encoder/reference.cpp --- a/source/encoder/reference.cpp Mon Aug 14 17:19:48 2017 +0530 +++ b/source/encoder/reference.cpp Fri Aug 11 14:36:18 2017 +0530 @@ -155,12 +155,10 @@ const pixel* src = reconPic->m_picOrg[c] + numWeightedRows * cuHeight * stride; pixel* dst = fpelPlane[c] + numWeightedRows * cuHeight * stride; - // Computing weighted CU rows int correction = IF_INTERNAL_PREC - X265_DEPTH; // intermediate interpolation depth -int padwidth = (width + 15) & ~15; // weightp assembly needs even 16 byte widths +int padwidth = (width + 31) & ~31; //
[x265] [PATCH 086 of 307] x86: AVX512 cleanup add_ps code
# HG changeset patch # User Vignesh Vijayakumar # Date 1502773372 -19800 # Tue Aug 15 10:32:52 2017 +0530 # Node ID 2db192bac0f14d55f7f82b8964d6c67c3a3637c3 # Parent 6f811dfd5690866f4c432911982a30665dc0e91c x86: AVX512 cleanup add_ps code diff -r 6f811dfd5690 -r 2db192bac0f1 source/common/x86/pixeladd8.asm --- a/source/common/x86/pixeladd8.asm Fri Aug 11 12:32:50 2017 +0530 +++ b/source/common/x86/pixeladd8.asm Tue Aug 15 10:32:52 2017 +0530 @@ -24,11 +24,11 @@ %include "x86inc.asm" %include "x86util.asm" +SECTION_RODATA 64 -SECTION_RODATA 32 - +ALIGN 64 +const store_shuf1_avx512, dq 0, 2, 4, 6, 1, 3, 5, 7 SECTION .text - cextern pw_pixel_max ;- @@ -1148,157 +1148,46 @@ ;- ; pixel_add_ps avx512 code start ;- -%macro PROCESS_ADD_PS_64x8_AVX512 0 +%macro PROCESS_ADD_PS_64x4_AVX512 0 pmovzxbwm0, [r2] pmovzxbwm1, [r2 + 32] movum2, [r3] movum3, [r3 + 64] -pmovzxbwm4, [r2 + r4] -pmovzxbwm5, [r2 + r4 + 32] -movum6, [r3 + r5] -movum7, [r3 + r5 + 64] - paddw m0, m2 paddw m1, m3 -paddw m4, m6 -paddw m5, m7 packuswbm0, m1 -packuswbm4, m5 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m4, m4, 11011000b +vpermq m0, m4, m0 movu[r0], m0 -movu[r0 + r1], m4 - -lea r2, [r2 + r4 * 2] -lea r3, [r3 + r5 * 2] -lea r0, [r0 + r1 * 2] - -pmovzxbwm0, [r2] -pmovzxbwm1, [r2 + 32] -movum2, [r3] -movum3, [r3 + 64] -pmovzxbwm4, [r2 + r4] -pmovzxbwm5, [r2 + r4 + 32] -movum6, [r3 + r5] -movum7, [r3 + r5 + 64] - +pmovzxbwm0, [r2 + r4] +pmovzxbwm1, [r2 + r4 + 32] +movum2, [r3 + r5] +movum3, [r3 + r5 + 64] paddw m0, m2 paddw m1, m3 -paddw m4, m6 -paddw m5, m7 packuswbm0, m1 -packuswbm4, m5 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m4, m4, 11011000b -movu[r0], m0 -movu[r0 + r1], m4 - -lea r2, [r2 + r4 * 2] -lea r3, [r3 + r5 * 2] -lea r0, [r0 + r1 * 2] - -pmovzxbwm0, [r2] -pmovzxbwm1, [r2 + 32] -movum2, [r3] -movum3, [r3 + 64] -pmovzxbwm4, [r2 + r4] -pmovzxbwm5, [r2 + r4 + 32] -movum6, [r3 + r5] -movum7, 
[r3 + r5 + 64] - +vpermq m0, m4, m0 +movu[r0 + r1], m0 +pmovzxbwm0, [r2 + 2 * r4] +pmovzxbwm1, [r2 + 2 * r4 + 32] +movum2, [r3 + 2 * r5] +movum3, [r3 + 2 * r5 + 64] paddw m0, m2 paddw m1, m3 -paddw m4, m6 -paddw m5, m7 packuswbm0, m1 -packuswbm4, m5 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0, m0, 11011000b -vshufi64x2 m4, m4, 11011000b -movu[r0], m0 -movu[r0 + r1], m4 +vpermq m0, m4, m0 +movu[r0 + 2 * r1], m0 -lea r2, [r2 + r4 * 2] -lea r3, [r3 + r5 * 2] -lea r0, [r0 + r1 * 2] - -pmovzxbwm0, [r2] -pmovzxbwm1, [r2 + 32] -movum2, [r3] -movum3, [r3 + 64] -pmovzxbwm4, [r2 + r4] -pmovzxbwm5, [r2 + r4 + 32] -movum6, [r3 + r5] -movum7, [r3 + r5 + 64] - +pmovzxbwm0, [r2 + r7] +pmovzxbwm1, [r2 + r7 + 32] +movum2, [r3 + r8] +movum3, [r3 + r8 + 64] paddw m0, m2 paddw m1, m3 -paddw m4, m6 -paddw m5, m7 packuswbm0, m1 -packuswbm4, m5 -vpermq m0, m0, 11011000b -vpermq m4, m4, 11011000b -vshufi64x2 m0,
[x265] [PATCH 071 of 307] x86: AVX512 addAvg_48x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1501589225 -19800 # Tue Aug 01 17:37:05 2017 +0530 # Node ID aac415b7223acced7fc844c4a07225704b811df0 # Parent ad756cf6d35f0d1460c5a079bea8781ffd67b7c7 x86: AVX512 addAvg_48x64 for high bit depth AVX2 performance: 10.61x AVX512 performance: 13.18x diff -r ad756cf6d35f -r aac415b7223a source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 07 16:30:18 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 01 17:37:05 2017 +0530 @@ -2276,6 +2276,7 @@ p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512); p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512); p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512); +p.pu[LUMA_48x64].addAvg = PFX(addAvg_48x64_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512); diff -r ad756cf6d35f -r aac415b7223a source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asmMon Aug 07 16:30:18 2017 +0530 +++ b/source/common/x86/mc-a.asmTue Aug 01 17:37:05 2017 +0530 @@ -1812,6 +1812,79 @@ movu[r2 + r8 + mmsize], m0 %endmacro +%macro PROCESS_ADDAVG_48x4_HBD_AVX512 0 +movum0, [r0] +movum1, [r1] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2],m0 + +movuym0, [r0 + mmsize] +movuym1, [r1 + mmsize] +paddw ym0, ym1 +pmulhrswym0, ym3 +paddw ym0, ym4 +pmaxsw ym0, ym2 +pminsw ym0, ym5 +movu[r2 + mmsize],ym0 + +movum0, [r0 + r3] +movum1, [r1 + r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r5], m0 + +movuym0, [r0 + r3 + mmsize] +movuym1, [r1 + r4 + mmsize] +paddw ym0, ym1 +pmulhrswym0, ym3 +paddw ym0, ym4 +pmaxsw ym0, ym2 +pminsw ym0, ym5 +movu[r2 + r5 + mmsize], ym0 + +movum0, [r0 + 2 * r3] +movum1, [r1 + 2 * r4] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + 2 * 
r5], m0 + +movuym0, [r0 + 2 * r3 + mmsize] +movuym1, [r1 + 2 * r4 + mmsize] +paddw ym0, ym1 +pmulhrswym0, ym3 +paddw ym0, ym4 +pmaxsw ym0, ym2 +pminsw ym0, ym5 +movu[r2 + 2 * r5 + mmsize], ym0 + +movum0, [r0 + r6] +movum1, [r1 + r7] +paddw m0, m1 +pmulhrswm0, m3 +paddw m0, m4 +pmaxsw m0, m2 +pminsw m0, m5 +movu[r2 + r8], m0 + +movuym0, [r0 + r6 + mmsize] +movuym1, [r1 + r7 + mmsize] +paddw ym0, ym1 +pmulhrswym0, ym3 +paddw ym0, ym4 +pmaxsw ym0, ym2 +pminsw ym0, ym5 +movu[r2 + r8 + mmsize], ym0 +%endmacro ;- ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) ;- @@ -1874,6 +1947,28 @@ ADDAVG_W64_HBD_AVX512 32 ADDAVG_W64_HBD_AVX512 48 ADDAVG_W64_HBD_AVX512 64 + +INIT_ZMM avx512 +cglobal addAvg_48x64, 6,9,6 +vbroadcasti32x8m4, [pw_ %+ ADDAVG_ROUND] +vbroadcasti32x8m5, [pw_pixel_max] +vbroadcasti32x8m3, [pw_ %+ ADDAVG_FACTOR] +pxorm2, m2 +add r3, r3 +add r4, r4 +add r5, r5 +lea r6, [3 * r3] +lea r7, [3 * r4] +lea r8, [3 * r5] + +%rep 15 +
[x265] [PATCH 076 of 307] x86: AVX512 interp_4tap_horiz_pp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1502347959 -19800 # Thu Aug 10 12:22:39 2017 +0530 # Node ID f489bc0b864c48f557cc40b739e84fe1040e8728 # Parent 7bdf20f62d02f5714c1332695ffa8c7c6a9d8a5a x86: AVX512 interp_4tap_horiz_pp_32xN Color Space i444 Size| AVX2 performance | AVX512 performance 32x8| 23.96x | 31.57x 32x16 | 24.38x | 33.22x 32x24 | 22.41x | 36.92x 32x32 | 21.54x | 34.09x 32x64 | 23.27x | 29.14x Color Space i422 Size| AVX2 performance | AVX512 performance 32x16 | 25.55x | 33.16x 32x32 | 22.08x | 35.13x 32x48 | 24.01x | 34.53x 32x64 | 23.76x | 35.21x diff -r 7bdf20f62d02 -r f489bc0b864c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Apr 04 16:47:58 2018 -0700 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 10 12:22:39 2017 +0530 @@ -4001,6 +4001,18 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); + +p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); + +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); + } #endif } diff -r 7bdf20f62d02 -r f489bc0b864c source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Wed 
Apr 04 16:47:58 2018 -0700 +++ b/source/common/x86/ipfilter8.asm Thu Aug 10 12:22:39 2017 +0530 @@ -150,6 +150,8 @@ const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 +const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 + SECTION .text cextern pb_128 @@ -9867,6 +9869,44 @@ movu [r2], m5 %endmacro +%macro PROCESS_IPFILTER_CHROMA_PP_32x2_AVX512 0 +; register map +; m0 - interpolate coeff +; m1, m2 - shuffle order table +; m3 - constant word 1 +; m4 - constant word 512 +; m9 - store shuffle order table + +movu ym5, [r0] +vinserti32x8 m5, [r0 + 4], 1 + +pshufb m6, m5, m2 +pshufb m5, m5, m1 +pmaddubsw m5, m0 +pmaddubsw m6, m0 +pmaddwdm5, m3 +pmaddwdm6, m3 + +movu ym7, [r0 + r1] +vinserti32x8 m7, [r0 + r1 + 4], 1 + +pshufb m8, m7, m2 +pshufb m7, m7, m1 +pmaddubsw m7, m0 +pmaddubsw m8, m0 +pmaddwdm7, m3 +pmaddwdm8, m3 + +packssdw m5, m6 +packssdw m7, m8 +pmulhrsw m5, m4 +pmulhrsw m7, m4 +packuswb m5, m7 +vpermd m5, m9, m5 +movu [r2], ym5 +vextracti32x8[r2 + r3], m5,1 +%endmacro + ;- ; void interp_4tap_horiz_pp_64xN(pixel *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx ;- @@ -9902,6 +9942,40 @@ IPFILTER_CHROMA_PP_64xN_AVX512 48 IPFILTER_CHROMA_PP_64xN_AVX512 16 +%macro IPFILTER_CHROMA_PP_32xN_AVX512 1 +INIT_ZMM avx512 +cglobal interp_4tap_horiz_pp_32x%1, 4,6,10 +mov r4d, r4m + +%ifdef PIC +lea r5, [tab_ChromaCoeff] +vpbroadcastd m0, [r5 + r4 * 4] +%else +vpbroadcastd m0, [tab_ChromaCoeff + r4 * 4] +%endif + +vbroadcasti32x8 m1,
[x265] [PATCH 078 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_64xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1502171321 -19800 # Tue Aug 08 11:18:41 2017 +0530 # Node ID aa1747a46469afe6fc2d5e6295a4b43a14ea # Parent d0e43a0e3b531f3e4f42be169c224563753b0210 [x265-avx512]x86: AVX512 pixel_sad_x4_64xN for high bit depth Size| AVX2 performance | AVX512 performance 64x16 | 19.41x | 33.30x 64x32 | 19.75x | 33.22x 64x48 | 20.39x | 35.05x 64x64 | 20.25x | 36.72x diff -r d0e43a0e3b53 -r aa1747a46469 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Aug 07 17:04:23 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 08 11:18:41 2017 +0530 @@ -2312,6 +2312,10 @@ p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); +p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx512); +p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx512); +p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx512); +p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx512); p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16_avx512); p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); diff -r d0e43a0e3b53 -r aa1747a46469 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Mon Aug 07 17:04:23 2017 +0530 +++ b/source/common/x86/sad16-a.asm Tue Aug 08 11:18:41 2017 +0530 @@ -2136,6 +2136,172 @@ paddd m3, m7 %endmacro +%macro PROCESS_SAD_X4_64x4_AVX512 0 +movum8, [r0] +movum10, [r0 + mmsize] +movum4, [r1] +movum11, [r1 + mmsize] +movum5, [r2] +movum12, [r2 + mmsize] +movum6, [r3] +movum13, [r3 + mmsize] +movum7, [r4] +movum14, [r4 + mmsize] + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +psubw m11, m10 +psubw m12, m10 +psubw m13, m10 +psubw m14, m10 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +paddw m4, m11 +paddw m5, m12 +paddw m6, m13 +paddw m7, m14 + +pmaddwd m4, m9 +paddd m0, 
m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + + +movum8, [r0 + 2 * FENC_STRIDE] +movum10, [r0 + 2 * FENC_STRIDE + mmsize] +movum4, [r1 + r5] +movum11, [r1 + r5 + mmsize] +movum5, [r2 + r5] +movum12, [r2 + r5 + mmsize] +movum6, [r3 + r5] +movum13, [r3 + r5 + mmsize] +movum7, [r4 + r5] +movum14, [r4 + r5 + mmsize] + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +psubw m11, m10 +psubw m12, m10 +psubw m13, m10 +psubw m14, m10 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +paddw m4, m11 +paddw m5, m12 +paddw m6, m13 +paddw m7, m14 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 4 * FENC_STRIDE] +movum10, [r0 + 4 * FENC_STRIDE + mmsize] +movum4, [r1 + 2 * r5] +movum11, [r1 + 2 * r5 + mmsize] +movum5, [r2 + 2 * r5] +movum12, [r2 + 2 * r5 + mmsize] +movum6, [r3 + 2 * r5] +movum13, [r3 + 2 * r5 + mmsize] +movum7, [r4 + 2 * r5] +movum14, [r4 + 2 * r5 + mmsize] + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +psubw m11, m10 +psubw m12, m10 +psubw m13, m10 +psubw m14, m10 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 +pabsw m11, m11 +pabsw m12, m12 +pabsw m13, m13 +pabsw m14, m14 +paddw m4, m11 +paddw m5, m12 +paddw m6, m13 +paddw m7, m14 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 6 * FENC_STRIDE] +movum10, [r0 + 6 * FENC_STRIDE + mmsize] +movum4, [r1 + r7] +movum11, [r1 + r7 + mmsize] +movum5, [r2 + r7] +movum12, [r2 + r7 + mmsize] +movum6, [r3 + r7] +movum13, [r3 + r7 + mmsize] +movum7, [r4 + r7] +movum14, [r4 + r7 + mmsize] + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7,
[x265] [PATCH 066 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_32xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1501765251 -19800 # Thu Aug 03 18:30:51 2017 +0530 # Node ID 241f318be574498b7bb77939937a907e4721dc32 # Parent df45017fca906d5f3370dcc78e43284622753a73 [x265-avx512]x86: AVX512 pixel_sad_x3_32xN for high bit depth Size| AVX2 performance | AVX512 performance 32x8| 20.72x | 29.20x 32x16 | 19.31x | 30.53x 32x24 | 19.78x | 33.32x 32x32 | 20.02x | 32.71x 32x64 | 20.40x | 33.30x diff -r df45017fca90 -r 241f318be574 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 03 18:30:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Thu Aug 03 18:30:51 2017 +0530 @@ -2313,6 +2313,12 @@ p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); +p.pu[LUMA_32x8].sad_x3 = PFX(pixel_sad_x3_32x8_avx512); +p.pu[LUMA_32x16].sad_x3 = PFX(pixel_sad_x3_32x16_avx512); +p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); +p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); +p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r df45017fca90 -r 241f318be574 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530 +++ b/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530 @@ -2856,3 +2856,362 @@ PROCESS_SAD_X3_END_AVX512 RET +; +; SAD x3/x4 avx512 code start +; + +%macro PROCESS_SAD_X3_32x4_AVX512 0 +movum6, [r0] +movum3, [r1] +movum4, [r2] +movum5, [r3] + + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 2 * FENC_STRIDE] +movum3, [r1 + r4] +movum4, [r2 + r4] +movum5, [r3 + r4] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 4 * FENC_STRIDE] 
+movum3, [r1 + 2 * r4] +movum4, [r2 + 2 * r4] +movum5, [r3 + 2 * r4] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 6 * FENC_STRIDE] +movum3, [r1 + r6] +movum4, [r2 + r6] +movum5, [r3 + r6] + +psubw m3, m6 +psubw m4, m6 +psubw m5, m6 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 +%endmacro + + +%macro PROCESS_SAD_X3_END_AVX512 0 +vextracti32x8 ym3, m0, 1 +vextracti32x8 ym4, m1, 1 +vextracti32x8 ym5, m2, 1 + +paddd ym0, ym3 +paddd ym1, ym4 +paddd ym2, ym5 + +vextracti64x2 xm3, m0, 1 +vextracti64x2 xm4, m1, 1 +vextracti64x2 xm5, m2, 1 + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +pshufd xm3, xm0, 1110b +pshufd xm4, xm1, 1110b +pshufd xm5, xm2, 1110b + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +pshufd xm3, xm0, 0001b +pshufd xm4, xm1, 0001b +pshufd xm5, xm2, 0001b + +paddd xm0, xm3 +paddd xm1, xm4 +paddd xm2, xm5 + +movd [r5 + 0], xm0 +movd [r5 + 4], xm1 +movd [r5 + 8], xm2 +%endmacro + + +;-- +; void pixel_sad_x3_32x%1( const pixel* pix1, const pixel* pix2, const pixel* pix3, const pixel* pix4, intptr_t frefstride, int32_t* res ) +;-- + +INIT_ZMM avx512 +cglobal pixel_sad_x3_32x8, 6,7,8 +pxorm0, m0 +pxorm1, m1 +pxorm2, m2 + +vbroadcasti32x8 m7, [pw_1] + +add r4d, r4d +lea r6d, [r4 * 3] + +PROCESS_SAD_X3_32x4_AVX512 +add r0, FENC_STRIDE * 8 +lea r1, [r1 + r4 * 4] +lea r2, [r2 + r4 * 4]
[x265] [PATCH 067 of 307] [x265-avx512]x86: AVX512 pixel_sad_x4_32xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1501837071 -19800 # Fri Aug 04 14:27:51 2017 +0530 # Node ID c3a2abd8e46f8db3ba7c276f39fe41ed002ce295 # Parent 241f318be574498b7bb77939937a907e4721dc32 [x265-avx512]x86: AVX512 pixel_sad_x4_32xN for high bit depth Size| AVX2 performance | AVX512 performance 32x8| 16.73x | 25.16x 32x16 | 18.36x | 29.04x 32x24 | 19.52x | 31.03x 32x32 | 18.78x | 31.95x 32x64 | 19.01x | 34.20x diff -r 241f318be574 -r c3a2abd8e46f source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 03 18:30:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 04 14:27:51 2017 +0530 @@ -2319,6 +2319,12 @@ p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); +p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); +p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); +p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx512); +p.pu[LUMA_32x32].sad_x4 = PFX(pixel_sad_x4_32x32_avx512); +p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx512); + } } #else // if HIGH_BIT_DEPTH diff -r 241f318be574 -r c3a2abd8e46f source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Aug 03 18:30:51 2017 +0530 +++ b/source/common/x86/sad16-a.asm Fri Aug 04 14:27:51 2017 +0530 @@ -2501,6 +2501,160 @@ ; SAD x3/x4 avx512 code start ; +%macro PROCESS_SAD_X4_32x4_AVX512 0 +movum8, [r0] +movum4, [r1] +movum5, [r2] +movum6, [r3] +movum7, [r4] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + + +movum8, [r0 + 2 * FENC_STRIDE] +movum4, [r1 + r5] +movum5, [r2 + r5] +movum6, [r3 + r5] +movum7, [r4 + r5] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, 
m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 4 * FENC_STRIDE] +movum4, [r1 + 2 * r5] +movum5, [r2 + 2 * r5] +movum6, [r3 + 2 * r5] +movum7, [r4 + 2 * r5] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 + +movum8, [r0 + 6 * FENC_STRIDE] +movum4, [r1 + r7] +movum5, [r2 + r7] +movum6, [r3 + r7] +movum7, [r4 + r7] + + +psubw m4, m8 +psubw m5, m8 +psubw m6, m8 +psubw m7, m8 +pabsw m4, m4 +pabsw m5, m5 +pabsw m6, m6 +pabsw m7, m7 + +pmaddwd m4, m9 +paddd m0, m4 +pmaddwd m5, m9 +paddd m1, m5 +pmaddwd m6, m9 +paddd m2, m6 +pmaddwd m7, m9 +paddd m3, m7 +%endmacro + + +%macro PROCESS_SAD_X4_END_AVX512 0 +vextracti32x8 ym4, m0, 1 +vextracti32x8 ym5, m1, 1 +vextracti32x8 ym6, m2, 1 +vextracti32x8 ym7, m3, 1 + +paddd ym0, ym4 +paddd ym1, ym5 +paddd ym2, ym6 +paddd ym3, ym7 + +vextracti64x2 xm4, m0, 1 +vextracti64x2 xm5, m1, 1 +vextracti64x2 xm6, m2, 1 +vextracti64x2 xm7, m3, 1 + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +pshufd xm4, xm0, 1110b +pshufd xm5, xm1, 1110b +pshufd xm6, xm2, 1110b +pshufd xm7, xm3, 1110b + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +pshufd xm4, xm0, 0001b +pshufd xm5, xm1, 0001b +pshufd xm6, xm2, 0001b +pshufd xm7, xm3, 0001b + +paddd xm0, xm4 +paddd xm1, xm5 +paddd xm2, xm6 +paddd xm3, xm7 + +mov r0, r6mp +movd [r0 + 0], xm0 +movd [r0 + 4], xm1 +movd [r0 + 8], xm2 +movd [r0 + 12], xm3 +%endmacro + + + %macro PROCESS_SAD_X3_32x4_AVX512 0 movum6, [r0] movum3, [r1] @@
[x265] [PATCH 077 of 307] [x265-avx512]x86: AVX512 pixel_sad_x3_64xN for high bit depth
# HG changeset patch # User Gopi Satykrishna Akisetty# Date 1502105663 -19800 # Mon Aug 07 17:04:23 2017 +0530 # Node ID d0e43a0e3b531f3e4f42be169c224563753b0210 # Parent f489bc0b864c48f557cc40b739e84fe1040e8728 [x265-avx512]x86: AVX512 pixel_sad_x3_64xN for high bit depth Size| AVX2 performance | AVX512 performance 64x16 | 19.69x | 36.23x 64x32 | 20.33x | 37.94x 64x48 | 20.64x | 38.48x 64x64 | 20.51x | 38.49x diff -r f489bc0b864c -r d0e43a0e3b53 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 10 12:22:39 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 07 17:04:23 2017 +0530 @@ -2302,6 +2302,10 @@ p.pu[LUMA_32x24].sad_x3 = PFX(pixel_sad_x3_32x24_avx512); p.pu[LUMA_32x32].sad_x3 = PFX(pixel_sad_x3_32x32_avx512); p.pu[LUMA_32x64].sad_x3 = PFX(pixel_sad_x3_32x64_avx512); +p.pu[LUMA_64x16].sad_x3 = PFX(pixel_sad_x3_64x16_avx512); +p.pu[LUMA_64x32].sad_x3 = PFX(pixel_sad_x3_64x32_avx512); +p.pu[LUMA_64x48].sad_x3 = PFX(pixel_sad_x3_64x48_avx512); +p.pu[LUMA_64x64].sad_x3 = PFX(pixel_sad_x3_64x64_avx512); p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx512); p.pu[LUMA_32x16].sad_x4 = PFX(pixel_sad_x4_32x16_avx512); diff -r f489bc0b864c -r d0e43a0e3b53 source/common/x86/sad16-a.asm --- a/source/common/x86/sad16-a.asm Thu Aug 10 12:22:39 2017 +0530 +++ b/source/common/x86/sad16-a.asm Mon Aug 07 17:04:23 2017 +0530 @@ -2266,6 +2266,135 @@ paddd m2, m5 %endmacro +%macro PROCESS_SAD_X3_64x4_AVX512 0 +movum6, [r0] +movum8, [r0 + mmsize] +movum3, [r1] +movum9, [r1 + mmsize] +movum4, [r2] +movum10, [r2 + mmsize] +movum5, [r3] +movum11, [r3 + mmsize] + +psubw m3, m6 +psubw m9, m8 +psubw m4, m6 +psubw m10, m8 +psubw m5, m6 +psubw m11, m8 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +paddw m3, m9 +paddw m4, m10 +paddw m5, m11 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 2 * FENC_STRIDE] +movum8, [r0 + 2 * FENC_STRIDE + 
mmsize] +movum3, [r1 + r4] +movum9, [r1 + r4 + mmsize] +movum4, [r2 + r4] +movum10, [r2 + r4 + mmsize] +movum5, [r3 + r4] +movum11, [r3 + r4 + mmsize] + +psubw m3, m6 +psubw m9, m8 +psubw m4, m6 +psubw m10, m8 +psubw m5, m6 +psubw m11, m8 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +paddw m3, m9 +paddw m4, m10 +paddw m5, m11 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 4 * FENC_STRIDE] +movum8, [r0 + 4 * FENC_STRIDE + mmsize] +movum3, [r1 + 2 * r4] +movum9, [r1 + 2 * r4 + mmsize] +movum4, [r2 + 2 * r4] +movum10, [r2 + 2 * r4 + mmsize] +movum5, [r3 + 2 * r4] +movum11, [r3 + 2 * r4 + mmsize] + +psubw m3, m6 +psubw m9, m8 +psubw m4, m6 +psubw m10, m8 +psubw m5, m6 +psubw m11, m8 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +paddw m3, m9 +paddw m4, m10 +paddw m5, m11 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 + +movum6, [r0 + 6 * FENC_STRIDE] +movum8, [r0 + 6 * FENC_STRIDE + mmsize] +movum3, [r1 + r6] +movum9, [r1 + r6 + mmsize] +movum4, [r2 + r6] +movum10, [r2 + r6 + mmsize] +movum5, [r3 + r6] +movum11, [r3 + r6 + mmsize] + +psubw m3, m6 +psubw m9, m8 +psubw m4, m6 +psubw m10, m8 +psubw m5, m6 +psubw m11, m8 +pabsw m3, m3 +pabsw m4, m4 +pabsw m5, m5 +pabsw m9, m9 +pabsw m10, m10 +pabsw m11, m11 +paddw m3, m9 +paddw m4, m10 +paddw m5, m11 + +pmaddwd m3, m7 +paddd m0, m3 +pmaddwd m4, m7 +paddd m1, m4 +pmaddwd m5, m7 +paddd m2, m5 +%endmacro %macro PROCESS_SAD_X3_END_AVX512 0 vextracti32x8 ym3, m0, 1 @@ -2300,9 +2429,16 @@ paddd xm1, xm4 paddd xm2, xm5 -movd [r5 + 0], xm0 -movd [r5 + 4], xm1 -movd [r5 + 8], xm2 +%if UNIX64 +movd [r5 + 0], xm0 +movd [r5 + 4], xm1 +movd [r5 + 8],
[x265] [PATCH 069 of 307] x86: AVX512 pixel_var_32x32
# HG changeset patch # User Vignesh Vijayakumar # Date 1501843838 -19800 # Fri Aug 04 16:20:38 2017 +0530 # Node ID 039ed71e123c3e14bfaabbe3aada944157784b36 # Parent c5b5b7cb9bbef4365692bfaf05a2a83796d5f1b0 x86: AVX512 pixel_var_32x32 AVX2 performance : 9.15x AVX512 performance : 13.49x diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Aug 04 14:27:51 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 04 16:20:38 2017 +0530 @@ -3929,6 +3929,7 @@ p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512); p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512); +p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx512); p.pu[LUMA_16x64].pixelavg_pp = PFX(pixel_avg_16x64_avx512); p.pu[LUMA_16x32].pixelavg_pp = PFX(pixel_avg_16x32_avx512); p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512); diff -r c5b5b7cb9bbe -r 039ed71e123c source/common/x86/pixel-util8.asm --- a/source/common/x86/pixel-util8.asm Fri Aug 04 14:27:51 2017 +0530 +++ b/source/common/x86/pixel-util8.asm Fri Aug 04 16:20:38 2017 +0530 @@ -7105,6 +7105,82 @@ RET %endif ; !HIGH_BIT_DEPTH +%macro PROCESS_VAR_32x8_AVX512 0 +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + 2 * r1] +pmovzxbwm3, [r0 + r2] + +paddw m4, m0 +paddw m4, m1 +paddw m4, m2 +paddw m4, m3 +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m5, m0 +paddd m5, m1 +paddd m5, m2 +paddd m5, m3 + +lea r0, [r0 + r1 * 4] + +pmovzxbwm0, [r0] +pmovzxbwm1, [r0 + r1] +pmovzxbwm2, [r0 + 2 * r1] +pmovzxbwm3, [r0 + r2] + +paddw m4, m0 +paddw m4, m1 +paddw m4, m2 +paddw m4, m3 +pmaddwd m0, m0 +pmaddwd m1, m1 +pmaddwd m2, m2 +pmaddwd m3, m3 +paddd m5, m0 +paddd m5, m1 +paddd m5, m2 +paddd m5, m3 +%endmacro + +%macro PROCESS_VAR_AVX512_END 0 +vextracti32x8 ym0, m4, 1 +vextracti32x8 ym1, m5, 1 +paddw ym4, ym0 +paddd ym5, ym1 +vextracti32x4 xm0, m4, 1 +vextracti32x4 xm1, m5, 1 +paddw xm4, xm0 +paddd xm5, xm1 +HADDW xm4, xm2 +HADDD xm5, xm1 +punpckldq xm4, xm5 
+movq rax, xm4 +%endmacro + +%if HIGH_BIT_DEPTH==0 +;- +; int pixel_var_wxh( uint8_t *, intptr_t ) +;- +INIT_ZMM avx512 +cglobal pixel_var_32x32, 2,4,6 +pxor m4, m4; sum +pxor m5, m5; sum squared +lea r2, [3 * r1] + +PROCESS_VAR_32x8_AVX512 +lea r0, [r0 + r1 * 4] +PROCESS_VAR_32x8_AVX512 +lea r0, [r0 + r1 * 4] +PROCESS_VAR_32x8_AVX512 +lea r0, [r0 + r1 * 4] +PROCESS_VAR_32x8_AVX512 +PROCESS_VAR_AVX512_END +RET +%endif + %macro VAR_AVX512_CORE 1 ; accum %if %1 paddwm0, m2 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 085 of 307] x86: AVX512 interp_4tap_horiz_ps_32xN
# HG changeset patch # User Jayashri Murugan# Date 1502434970 -19800 # Fri Aug 11 12:32:50 2017 +0530 # Node ID 6f811dfd5690866f4c432911982a30665dc0e91c # Parent 951e9a16296e5d1e528c0083630fde8122bd15c1 x86: AVX512 interp_4tap_horiz_ps_32xN Color Space i444 Size| AVX2 performance | AVX512 performance 32x8| 25.91x | 38.35x 32x16 | 25.45x | 32.02x 32x24 | 25.80x | 32.73x 32x32 | 33.49x | 38.02x 32x64 | 27.42x | 36.20x Color Space i422 Size| AVX2 performance | AVX512 performance 32x16 | 24.74x | 33.95x 32x32 | 33.31x | 34.28x 32x48 | 27.11x | 35.98x 32x64 | 27.32x | 35.02x Color Space i420 Size| AVX2 performance | AVX512 performance 32x8| 27.16x | 36.68x 32x16 | 24.87x | 31.40x 32x24 | 25.98x | 34.08x 32x32 | 33.01x | 34.71x diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 10 22:47:55 2017 -0700 +++ b/source/common/x86/asm-primitives.cpp Fri Aug 11 12:32:50 2017 +0530 @@ -4034,6 +4034,25 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hps = PFX(interp_4tap_horiz_ps_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hps = PFX(interp_4tap_horiz_ps_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hps = PFX(interp_4tap_horiz_ps_64x16_avx512); + +p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hps = PFX(interp_4tap_horiz_ps_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); + +//i422 chroma_hps +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hps = 
PFX(interp_4tap_horiz_ps_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hps = PFX(interp_4tap_horiz_ps_32x48_avx512); + +//i420 chroma_hps +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hps = PFX(interp_4tap_horiz_ps_32x8_avx512); + } #endif } diff -r 951e9a16296e -r 6f811dfd5690 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Thu Aug 10 22:47:55 2017 -0700 +++ b/source/common/x86/ipfilter8.asm Fri Aug 11 12:32:50 2017 +0530 @@ -10010,7 +10010,7 @@ %endmacro ;- -; void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) +; void interp_horiz_ps_64xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt) ;- %macro IPFILTER_CHROMA_PS_64xN_AVX512 1 INIT_ZMM avx512 @@ -10059,6 +10059,74 @@ IPFILTER_CHROMA_PS_64xN_AVX512 48 IPFILTER_CHROMA_PS_64xN_AVX512 16 +%macro PROCESS_IPFILTER_CHROMA_PS_32x1_AVX512 0 +movu ym6, [r0] +vinserti32x8 m6, [r0 + 4], 1 +pshufb m7, m6, m2 +pshufb m6, m6, m1 +pmaddubsw m6, m0 +pmaddubsw m7, m0 +pmaddwdm6, m3 +pmaddwdm7, m3 + +packssdw m6, m7 +psubw m6, m4 +vpermq m6, m8, m6 +movu [r2], m6 +%endmacro + +;- +; void interp_horiz_ps_32xN(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
[x265] [PATCH 091 of 307] x86: AVX512 cleanup interp_4tap_horiz_pp_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1504242228 -19800 # Fri Sep 01 10:33:48 2017 +0530 # Node ID dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd # Parent d9200885420957bccd4edea62bf87bbe8831bc62 x86: AVX512 cleanup interp_4tap_horiz_pp_32xN diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sun Aug 13 15:12:25 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Sep 01 10:33:48 2017 +0530 @@ -4011,22 +4011,29 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); +//i444 chroma_hpp p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512); - p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); - p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); +//i422 chroma_hpp p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); + +//i420 chroma_hpp +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = 
PFX(interp_4tap_horiz_pp_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); + p.weight_pp = PFX(weight_pp_avx512); //i444 chroma_hps diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Sun Aug 13 15:12:25 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Sep 01 10:33:48 2017 +0530 @@ -150,8 +150,6 @@ const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 -const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 - ALIGN 64 const interp8_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7 @@ -9881,31 +9879,30 @@ ; m9 - store shuffle order table movu ym5, [r0] -vinserti32x8 m5, [r0 + 4], 1 +vinserti32x8 m5, [r0 + r1], 1 +movu ym7, [r0 + 4] +vinserti32x8 m7, [r0 + r1 + 4], 1 pshufb m6, m5, m2 -pshufb m5, m5, m1 +pshufb m5, m1 +pshufb m8, m7, m2 +pshufb m7, m1 + pmaddubsw m5, m0 +pmaddubsw m7, m0 +pmaddwd m5, m3 +pmaddwd m7, m3 + pmaddubsw m6, m0 -pmaddwd m5, m3 +pmaddubsw m8, m0 pmaddwd m6, m3 - -movu ym7, [r0 + r1] -vinserti32x8 m7, [r0 + r1 + 4], 1 - -pshufb m8, m7, m2 -pshufb m7, m7, m1 -pmaddubsw m7, m0 -pmaddubsw m8, m0 -pmaddwd m7, m3 pmaddwd m8, m3 -packssdw m5, m6 -packssdw m7, m8 +packssdw m5, m7 +packssdw m6, m8 pmulhrsw m5, m4 -pmulhrsw m7, m4 -packuswb m5, m7 -vpermd m5, m9, m5 +pmulhrsw m6, m4 +packuswb m5, m6 movu [r2], ym5 vextracti32x8 [r2 + r3], m5,
[x265] [PATCH 093 of 307] x86: AVX512 addAvg_32xN
# HG changeset patch # User Vignesh Vijayakumar # Date 1503385834 -19800 # Tue Aug 22 12:40:34 2017 +0530 # Node ID 738f07186eb1d4bca84e9acdf70921ee9e2fee92 # Parent ed1932a414bf5962bbeccfd5c9e208b7db90f77f x86: AVX512 addAvg_32xN Size | AVX2 performance | AVX512 performance -- 32x8 | 15.31x | 19.98x 32x16 | 15.14x | 23.25x 32x24 | 14.65x | 23.95x 32x32 | 15.41x | 24.76x 32x64 | 14.56x | 24.53x diff -r ed1932a414bf -r 738f07186eb1 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sun Aug 13 18:18:28 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Aug 22 12:40:34 2017 +0530 @@ -3964,6 +3964,19 @@ p.pu[LUMA_64x32].addAvg = PFX(addAvg_64x32_avx512); p.pu[LUMA_64x48].addAvg = PFX(addAvg_64x48_avx512); p.pu[LUMA_64x64].addAvg = PFX(addAvg_64x64_avx512); +p.pu[LUMA_32x8].addAvg = PFX(addAvg_32x8_avx512); +p.pu[LUMA_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.pu[LUMA_32x24].addAvg = PFX(addAvg_32x24_avx512); +p.pu[LUMA_32x32].addAvg = PFX(addAvg_32x32_avx512); +p.pu[LUMA_32x64].addAvg = PFX(addAvg_32x64_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].addAvg = PFX(addAvg_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].addAvg = PFX(addAvg_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].addAvg = PFX(addAvg_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].addAvg = PFX(addAvg_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].addAvg = PFX(addAvg_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].addAvg = PFX(addAvg_32x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].addAvg = PFX(addAvg_32x32_avx512); p.cu[BLOCK_32x32].blockfill_s = PFX(blockfill_s_32x32_avx512); diff -r ed1932a414bf -r 738f07186eb1 source/common/x86/mc-a.asm --- a/source/common/x86/mc-a.asm Sun Aug 13 18:18:28 2017 +0530 +++ b/source/common/x86/mc-a.asm Tue Aug 22 12:40:34 2017 +0530 @@ -3317,6 +3317,24 @@ movu [r2 + r5], 
m0 %endmacro +%macro PROCESS_ADDAVG_32x2_AVX512 0 +movu m0, [r0] +movu m1, [r1] +movu m2, [r0 + r3] +movu m3, [r1 + r4] + +paddw m0, m1 +pmulhrsw m0, m4 +paddw m0, m5 +paddw m2, m3 +pmulhrsw m2, m4 +paddw m2, m5 + +packuswb m0, m2 +vpermq m0, m6, m0 +movu [r2], ym0 +vextracti32x8 [r2 + r5], m0, 1 +%endmacro ; ;void addAvg (int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride) ; @@ -3344,6 +3362,32 @@ ADDAVG_W64_AVX512 32 ADDAVG_W64_AVX512 48 ADDAVG_W64_AVX512 64 + +%macro ADDAVG_W32_AVX512 1 +INIT_ZMM avx512 +cglobal addAvg_32x%1, 6,6,7 +vbroadcasti32x8 m4, [pw_256] +vbroadcasti32x8 m5, [pw_128] +mova m6, [shuf_avx512] +add r3, r3 +add r4, r4 + +%rep %1/2 - 1 +PROCESS_ADDAVG_32x2_AVX512 +lea r2, [r2 + 2 * r5] +lea r0, [r0 + 2 * r3] +lea r1, [r1 + 2 * r4] +%endrep +PROCESS_ADDAVG_32x2_AVX512 +RET +%endmacro + +ADDAVG_W32_AVX512 8 +ADDAVG_W32_AVX512 16 +ADDAVG_W32_AVX512 24 +ADDAVG_W32_AVX512 32 +ADDAVG_W32_AVX512 48 +ADDAVG_W32_AVX512 64 ;- ; addAvg avx512 code end ;- ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 097 of 307] x86: AVX512 convert_p2s link 32xN and 64xN chroma_444 primitives
# HG changeset patch # User Vignesh Vijayakumar # Date 1503901717 -19800 # Mon Aug 28 11:58:37 2017 +0530 # Node ID bf199a5eca5be148be8a0c91cd9f2e8e0e908059 # Parent 0355f0128b7d713c4a21c91d3cc5bed1e8b43c47 x86: AVX512 convert_p2s link 32xN and 64xN chroma_444 primitives diff -r 0355f0128b7d -r bf199a5eca5b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Thu Aug 24 12:20:07 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Mon Aug 28 11:58:37 2017 +0530 @@ -2253,6 +2253,15 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = PFX(filterPixelToShort_32x8_avx2); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s = PFX(filterPixelToShort_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s = PFX(filterPixelToShort_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512); p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx512); @@ -4041,6 +4050,15 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = PFX(filterPixelToShort_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].p2s = PFX(filterPixelToShort_32x8_avx2); 
+p.chroma[X265_CSP_I444].pu[LUMA_32x16].p2s = PFX(filterPixelToShort_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].p2s = PFX(filterPixelToShort_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x32].p2s = PFX(filterPixelToShort_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].p2s = PFX(filterPixelToShort_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].p2s = PFX(filterPixelToShort_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].p2s = PFX(filterPixelToShort_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].p2s = PFX(filterPixelToShort_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].p2s = PFX(filterPixelToShort_64x64_avx512); p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx512); p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx512); ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 146 of 307] x86: Fix crash in 32 bit main10 build from chroma_hps code
# HG changeset patch # User Vignesh Vijayakumar # Date 1509681036 -19800 # Fri Nov 03 09:20:36 2017 +0530 # Node ID 84dc38e191366e8b737d2a6014793afe830f3b35 # Parent d3a1db4790b662306a3f1222cde66c006e10f604 x86: Fix crash in 32 bit main10 build from chroma_hps code diff -r d3a1db4790b6 -r 84dc38e19136 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Thu Nov 02 14:10:18 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Fri Nov 03 09:20:36 2017 +0530 @@ -6447,9 +6447,9 @@ movu [r2], m6 %endmacro +%macro IPFILTER_CHROMA_PS_AVX512_32xN 1 +%if ARCH_X86_64 == 1 INIT_ZMM avx512 -%if ARCH_X86_64 == 1 -%macro IPFILTER_CHROMA_PS_AVX512_32xN 1 cglobal interp_4tap_horiz_ps_32x%1, 4,7,9 add r1d, r1d add r3d, r3d @@ -6486,8 +6486,8 @@ sub r6d, 2 jnz .loop RET +%endif %endmacro -%endif IPFILTER_CHROMA_PS_AVX512_32xN 8 IPFILTER_CHROMA_PS_AVX512_32xN 16 @@ -6645,9 +6645,9 @@ movu [r2 + mmsize], m6 %endmacro +%macro IPFILTER_CHROMA_PS_AVX512_64xN 1 +%if ARCH_X86_64 == 1 INIT_ZMM avx512 -%if ARCH_X86_64 == 1 -%macro IPFILTER_CHROMA_PS_AVX512_64xN 1 cglobal interp_4tap_horiz_ps_64x%1, 4,7,9 add r1d, r1d add r3d, r3d @@ -6683,8 +6683,8 @@ sub r6d, 2 jnz .loop RET +%endif %endmacro -%endif IPFILTER_CHROMA_PS_AVX512_64xN 16 IPFILTER_CHROMA_PS_AVX512_64xN 32 @@ -6750,10 +6750,9 @@ movu [r2], ym6 %endmacro - +%macro IPFILTER_CHROMA_PS_AVX512_16xN 1 +%if ARCH_X86_64 == 1 INIT_ZMM avx512 -%if ARCH_X86_64 == 1 -%macro IPFILTER_CHROMA_PS_AVX512_16xN 1 cglobal interp_4tap_horiz_ps_16x%1, 4,7,9 add r1d, r1d add r3d, r3d @@ -6789,8 +6788,8 @@ sub r6d, 2 jnz .loop RET +%endif %endmacro -%endif IPFILTER_CHROMA_PS_AVX512_16xN 4 IPFILTER_CHROMA_PS_AVX512_16xN 8 @@ -6934,8 +6933,8 @@ movu [r2 + mmsize], ym6 %endmacro +%if ARCH_X86_64 == 1 INIT_ZMM avx512 -%if ARCH_X86_64 == 1 cglobal interp_4tap_horiz_ps_48x64, 4,7,9 add r1d, r1d add r3d, r3d ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 150 of 307] x86: AVX512 interp_4tap_vert_ps_32xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1510029383 -19800 # Tue Nov 07 10:06:23 2017 +0530 # Node ID c983858deccb26e5b4c957fbff959c1e74f84756 # Parent 0775ffcdfc8a0c4ad078e8c4197f6bff7158efd8 x86: AVX512 interp_4tap_vert_ps_32xN for high bit depth i444 Size | AVX2 performance | AVX512 performance -- 32x8 | 26.31x | 43.62x 32x16 | 27.04x | 45.52x 32x24 | 27.33x | 43.80x 32x32 | 27.64x | 44.25x 32x64 | 27.89x | 44.69x diff -r 0775ffcdfc8a -r c983858deccb source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Mon Nov 06 15:41:43 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Nov 07 10:06:23 2017 +0530 @@ -2645,6 +2645,11 @@ p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512); @@ -2659,6 +2664,10 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vpp = PFX(interp_4tap_vert_pp_32x48_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vpp = PFX(interp_4tap_vert_pp_32x64_avx512); 
+p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_vps = PFX(interp_4tap_vert_ps_32x48_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_vps = PFX(interp_4tap_vert_ps_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512); @@ -2673,6 +2682,10 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vps = PFX(interp_4tap_vert_ps_32x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vps = PFX(interp_4tap_vert_ps_32x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_vps = PFX(interp_4tap_vert_ps_32x24_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_vps = PFX(interp_4tap_vert_ps_32x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vpp = PFX(interp_4tap_vert_pp_16x4_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = PFX(interp_4tap_vert_pp_16x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512); diff -r 0775ffcdfc8a -r c983858deccb source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Mon Nov 06 15:41:43 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Nov 07 10:06:23 2017 +0530 @@ -7341,6 +7341,88 @@ jnz .loop RET %endif + +%macro PROCESS_CHROMA_VERT_PS_32x2_AVX512 0 +movu m1, [r0] +movu m3, 
[r0 + r1] +punpcklwd m0, m1, m3 +pmaddwd m0, [r5] +punpckhwd m1, m3 +pmaddwd m1, [r5] + +movu m4, [r0 + 2 * r1] +punpcklwd m2, m3, m4 +pmaddwd m2, [r5] +punpckhwd
[x265] [PATCH 148 of 307] x86: AVX512 optimise interp_4tap_vert_pp_8xN high bit depth code
# HG changeset patch # User Vignesh Vijayakumar # Date 1509862764 -19800 # Sun Nov 05 11:49:24 2017 +0530 # Node ID 2d94e5d214922d0f6cb0126e4477db8dd33256e7 # Parent 410a223c2caa58321a3a6b3e0a91c1dee512667a x86: AVX512 optimise interp_4tap_vert_pp_8xN high bit depth code diff -r 410a223c2caa -r 2d94e5d21492 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Sat Nov 04 18:05:34 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Sun Nov 05 11:49:24 2017 +0530 @@ -5930,15 +5930,10 @@ punpckhwd m3, m4 pmaddwd m3, [r5] -lea r0, [r0 + 2 * r1] -lea r6, [r6 + 2 * r1] -lea r8, [r8 + 2 * r1] -lea r9, [r9 + 2 * r1] - -movu xm5,[r0 + r1] -vinserti32x4 m5, [r6 + r1], 1 -vinserti32x4 m5, [r8 + r1], 2 -vinserti32x4 m5, [r9 + r1], 3 +movu xm5,[r0 + r10] +vinserti32x4 m5, [r6 + r10], 1 +vinserti32x4 m5, [r8 + r10], 2 +vinserti32x4 m5, [r9 + r10], 3 punpcklwd m6, m4, m5 pmaddwd m6, [r5 + mmsize] paddd m0, m6 @@ -5946,10 +5941,10 @@ pmaddwd m4, [r5 + mmsize] paddd m1, m4 -movu xm4,[r0 + 2 * r1] -vinserti32x4 m4, [r6 + 2 * r1], 1 -vinserti32x4 m4, [r8 + 2 * r1], 2 -vinserti32x4 m4, [r9 + 2 * r1], 3 +movu xm4,[r0 + 4 * r1] +vinserti32x4 m4, [r6 + 4 * r1], 1 +vinserti32x4 m4, [r8 + 4 * r1], 2 +vinserti32x4 m4, [r9 + 4 * r1], 3 punpcklwd m6, m5, m4 pmaddwd m6, [r5 + mmsize] paddd m2, m6 @@ -5987,7 +5982,7 @@ ;- %if ARCH_X86_64 INIT_ZMM avx512 -cglobal interp_4tap_vert_pp_8x8, 5, 10, 9 +cglobal interp_4tap_vert_pp_8x8, 5, 11, 9 add r1d,r1d add r3d,r3d sub r0, r1 @@ -6001,6 +5996,7 @@ %endif vbroadcasti32x8 m7, [INTERP_OFFSET_PP] vbroadcasti32x8 m8, [pw_pixel_max] +lea r10,[3 * r1] lea r7, [3 * r3] PROCESS_CHROMA_VERT_PP_8x8_AVX512 RET @@ -6008,7 +6004,7 @@ %macro FILTER_VER_PP_CHROMA_8xN_AVX512 1 INIT_ZMM avx512 -cglobal interp_4tap_vert_pp_8x%1, 5, 10, 9 +cglobal interp_4tap_vert_pp_8x%1, 5, 11, 9 add r1d,r1d add r3d,r3d sub r0, r1 @@ -6022,10 +6018,11 @@ %endif vbroadcasti32x8 m7, [INTERP_OFFSET_PP] vbroadcasti32x8 m8, [pw_pixel_max] +lea r10,[3 * r1] lea r7, [3 * 
r3] %rep %1/8 - 1 PROCESS_CHROMA_VERT_PP_8x8_AVX512 -lea r0, [r9] +lea r0, [r8 + 4 * r1] lea r2, [r2 + 4 * r3] %endrep PROCESS_CHROMA_VERT_PP_8x8_AVX512 ___ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel
[x265] [PATCH 151 of 307] x86: AVX512 interp_4tap_vert_ps_64xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1510030534 -19800 # Tue Nov 07 10:25:34 2017 +0530 # Node ID 5517caaeb88b0f76a78706a867a4fa24fb17f64e # Parent c983858deccb26e5b4c957fbff959c1e74f84756 x86: AVX512 interp_4tap_vert_ps_64xN for high bit depth i444 Size | AVX2 performance | AVX512 performance -- 64x16 | 27.45x | 42.45x 64x32 | 27.77x | 43.65x 64x48 | 28.06x | 43.04x 64x64 | 28.18x | 43.34x diff -r c983858deccb -r 5517caaeb88b source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Nov 07 10:06:23 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Nov 07 10:25:34 2017 +0530 @@ -2639,6 +2639,10 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_vps = PFX(interp_4tap_vert_ps_64x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vps = PFX(interp_4tap_vert_ps_64x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); diff -r c983858deccb -r 5517caaeb88b source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Nov 07 10:06:23 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Nov 07 10:25:34 2017 +0530 @@ -7423,6 +7423,128 @@ FILTER_VER_PS_CHROMA_32xN_AVX512 48 FILTER_VER_PS_CHROMA_32xN_AVX512 64 %endif + +%macro PROCESS_CHROMA_VERT_PS_64x2_AVX512 0 +movu m1, [r0] +movu m3, [r0 + r1] +punpcklwdm0, m1, m3 +pmaddwd m0, 
[r5] +punpckhwdm1, m3 +pmaddwd m1, [r5] + +movu m9, [r0 + mmsize] +movu m11, [r0 + r1 + mmsize] +punpcklwdm8, m9, m11 +pmaddwd m8, [r5] +punpckhwdm9, m11 +pmaddwd m9, [r5] + +movu m4, [r0 + 2 * r1] +punpcklwdm2, m3, m4 +pmaddwd m2, [r5] +punpckhwdm3, m4 +pmaddwd m3, [r5] + +movu m12, [r0 + 2 * r1 + mmsize] +punpcklwdm10, m11,m12 +pmaddwd m10, [r5] +punpckhwdm11, m12 +pmaddwd m11, [r5] + +lea r0, [r0 + 2 * r1] +movu m5, [r0 + r1] +punpcklwdm6, m4, m5 +pmaddwd m6, [r5 + 1 * mmsize] +padddm0, m6 +punpckhwdm4, m5 +pmaddwd m4, [r5 + 1 * mmsize] +padddm1, m4 + +movu m13, [r0 + r1 + mmsize] +punpcklwdm14, m12,m13 +pmaddwd m14, [r5 + 1 * mmsize] +padddm8, m14 +punpckhwdm12, m13 +pmaddwd m12, [r5 + 1 * mmsize] +padddm9, m12 + +movu m4, [r0 + 2 * r1] +punpcklwdm6, m5, m4 +pmaddwd m6, [r5 + 1 * mmsize] +padddm2, m6 +punpckhwdm5, m4 +pmaddwd m5, [r5 + 1 * mmsize] +padddm3, m5 + +movu m12, [r0 + 2 * r1 + mmsize] +punpcklwdm14, m13,m12 +pmaddwd m14, [r5 +
[x265] [PATCH 147 of 307] x86: AVX512 interp_4tap_vert_pp_48x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar # Date 1509798934 -19800 # Sat Nov 04 18:05:34 2017 +0530 # Node ID 410a223c2caa58321a3a6b3e0a91c1dee512667a # Parent 84dc38e191366e8b737d2a6014793afe830f3b35 x86: AVX512 interp_4tap_vert_pp_48x64 for high bit depth AVX2 performance : 26.37x AVX512 performance : 42.37x diff -r 84dc38e19136 -r 410a223c2caa source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Fri Nov 03 09:20:36 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Sat Nov 04 18:05:34 2017 +0530 @@ -2639,6 +2639,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_vpp = PFX(interp_4tap_vert_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vpp = PFX(interp_4tap_vert_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vpp = PFX(interp_4tap_vert_pp_64x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); diff -r 84dc38e19136 -r 410a223c2caa source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Fri Nov 03 09:20:36 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Sat Nov 04 18:05:34 2017 +0530 @@ -6242,6 +6242,172 @@ FILTER_VER_PP_CHROMA_32xN_AVX512 64 %endif +%macro PROCESS_CHROMA_VERT_PP_48x4_AVX512 0 +movu m1, [r0] +lea r6, [r0 + 2 * r1] +movu m10,[r6] +movu m3, [r0 + r1] +movu m12,[r6 + r1] +punpcklwd m0, m1, m3 +punpcklwd m9, m10, m12 +pmaddwd m0, [r5] +pmaddwd m9, [r5] +punpckhwd m1, m3 +punpckhwd m10,m12 +pmaddwd m1, [r5] +pmaddwd m10,[r5] + +movu m4, [r0 + 2 * r1] +movu m13,[r6 + 2 * r1] +punpcklwd m2, m3, m4 +punpcklwd m11,m12, m13 +pmaddwd m2, [r5] +pmaddwd m11,[r5] +punpckhwd m3, m4 +punpckhwd m12,m13 +pmaddwd m3, [r5] +pmaddwd m12,[r5] + +movu m5, [r0 + r7] 
+movu m14,[r6 + r7] +punpcklwd m6, m4, m5 +punpcklwd m15,m13, m14 +pmaddwd m6, [r5 + mmsize] +pmaddwd m15,[r5 + mmsize] +paddd m0, m6 +paddd m9, m15 +punpckhwd m4, m5 +punpckhwd m13,m14 +pmaddwd m4, [r5 + mmsize] +pmaddwd m13,[r5 + mmsize] +paddd m1, m4 +paddd m10,m13 + +movu m4, [r0 + 4 * r1] +movu m13,[r6 + 4 * r1] +punpcklwd m6, m5, m4 +punpcklwd m15,m14, m13 +pmaddwd m6, [r5 + mmsize] +pmaddwd m15,[r5 + mmsize] +paddd m2, m6 +paddd m11,m15 +punpckhwd m5, m4 +punpckhwd m14,m13 +pmaddwd m5, [r5 + mmsize] +pmaddwd m14,[r5 + mmsize] +paddd m3, m5 +paddd m12,m14 + +paddd m0, m7 +paddd m1, m7 +paddd m2, m7 +paddd m3, m7 +paddd m9, m7 +paddd m10,m7 +paddd m11,m7 +paddd m12,m7 + +psrad m0,
[x265] [PATCH 153 of 307] x86: AVX512 interp_4tap_vert_ps_48x64 for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1510033854 -19800 # Tue Nov 07 11:20:54 2017 +0530 # Node ID 9df6f8ae51300ebbb9d0941f7fc1cce1fdef4e94 # Parent 092438e72985dc1d75bf3be4f0c8c1485ec8 x86: AVX512 interp_4tap_vert_ps_48x64 for high bit depth AVX2 performance : 28.05x AVX512 performance : 39.37x diff -r 092438e7 -r 9df6f8ae5130 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Nov 07 11:04:05 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Nov 07 11:20:54 2017 +0530 @@ -2644,6 +2644,7 @@ p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_vps = PFX(interp_4tap_vert_ps_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_vps = PFX(interp_4tap_vert_ps_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vpp = PFX(interp_4tap_vert_pp_48x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_48x64].filter_vps = PFX(interp_4tap_vert_ps_48x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_vpp = PFX(interp_4tap_vert_pp_32x24_avx512); diff -r 092438e7 -r 9df6f8ae5130 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Nov 07 11:04:05 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Nov 07 11:20:54 2017 +0530 @@ -7537,6 +7537,167 @@ FILTER_VER_PS_CHROMA_32xN_AVX512 64 %endif +%macro PROCESS_CHROMA_VERT_PS_48x4_AVX512 0 +movu m1, [r0] +lea r6, [r0 + 2 * r1] +movu m10,[r6] +movu m3, [r0 + r1] +movu m12,[r6 + r1] +punpcklwd m0, m1, m3 +punpcklwd m9, m10, m12 +pmaddwd m0, [r5] +pmaddwd m9, [r5] +punpckhwd m1, m3 +punpckhwd m10,m12 +pmaddwd m1, [r5] +pmaddwd m10,[r5] + +movu m4, [r0 + 2 * r1] +movu m13,[r6 + 2 * r1] +punpcklwd m2, m3, m4 +punpcklwd m11,m12, m13 +pmaddwd m2, [r5] +pmaddwd m11,[r5] +punpckhwd m3, m4 +punpckhwd m12,m13 +pmaddwd m3, [r5] +pmaddwd m12,[r5] + +movu m5, [r0 + r7] +movu m14,[r6 + 
r7] +punpcklwd m6, m4, m5 +punpcklwd m15,m13, m14 +pmaddwd m6, [r5 + mmsize] +pmaddwd m15,[r5 + mmsize] +paddd m0, m6 +paddd m9, m15 +punpckhwd m4, m5 +punpckhwd m13,m14 +pmaddwd m4, [r5 + mmsize] +pmaddwd m13,[r5 + mmsize] +paddd m1, m4 +paddd m10,m13 + +movu m4, [r0 + 4 * r1] +movu m13,[r6 + 4 * r1] +punpcklwd m6, m5, m4 +punpcklwd m15,m14, m13 +pmaddwd m6, [r5 + mmsize] +pmaddwd m15,[r5 + mmsize] +paddd m2, m6 +paddd m11,m15 +punpckhwd m5, m4 +punpckhwd m14,m13 +pmaddwd m5, [r5 + mmsize] +pmaddwd m14,[r5 + mmsize] +paddd m3, m5 +paddd m12,m14 + +paddd m0, m7 +paddd m1, m7 +paddd m2, m7 +paddd m3, m7 +paddd m9, m7 +paddd m10,m7 +paddd m11,m7 +paddd m12,m7 + +psrad
[x265] [PATCH 152 of 307] x86: AVX512 interp_4tap_vert_ps_16xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1510032845 -19800 # Tue Nov 07 11:04:05 2017 +0530 # Node ID 092438e72985dc1d75bf3be4f0c8c1485ec8 # Parent 5517caaeb88b0f76a78706a867a4fa24fb17f64e x86: AVX512 interp_4tap_vert_ps_16xN for high bit depth i444 Size | AVX2 performance | AVX512 performance -- 16x4 | 27.12x | 33.94x 16x8 | 25.90x | 30.27x 16x12 | 26.81x | 34.40x 16x16 | 27.69x | 33.72x 16x32 | 26.96x | 36.42x 16x64 | 28.37x | 35.85x diff -r 5517caaeb88b -r 092438e7 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Nov 07 10:25:34 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Nov 07 11:04:05 2017 +0530 @@ -2660,6 +2660,12 @@ p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512); @@ -2677,6 +2683,11 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vpp = PFX(interp_4tap_vert_pp_16x24_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); 
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vpp = PFX(interp_4tap_vert_pp_16x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].filter_vps = PFX(interp_4tap_vert_ps_16x24_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].filter_vps = PFX(interp_4tap_vert_ps_16x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512); @@ -2695,6 +2706,11 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vpp = PFX(interp_4tap_vert_pp_16x12_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = PFX(interp_4tap_vert_pp_16x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vpp = PFX(interp_4tap_vert_pp_16x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].filter_vps = PFX(interp_4tap_vert_ps_16x4_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = PFX(interp_4tap_vert_ps_16x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].filter_vps = PFX(interp_4tap_vert_ps_16x12_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = PFX(interp_4tap_vert_ps_16x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].filter_vps = PFX(interp_4tap_vert_ps_16x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512); diff -r 5517caaeb88b -r 092438e7 
source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Nov 07 10:25:34 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Nov 07 11:04:05 2017 +0530 @@ -7342,6 +7342,119 @@ RET %endif +%macro PROCESS_CHROMA_VERT_PS_16x4_AVX512 0 +movu ym1,[r0] +lea r6, [r0 + 2 * r1] +vinserti32x8 m1, [r6],1 +movu ym3,
[x265] [PATCH 154 of 307] x86: AVX512 interp_4tap_vert_ps_8xN for high bit depth
# HG changeset patch # User Vignesh Vijayakumar# Date 1510035839 -19800 # Tue Nov 07 11:53:59 2017 +0530 # Node ID ae3775aa94f3acceb7d43ce7db2df6f8be6c6912 # Parent 9df6f8ae51300ebbb9d0941f7fc1cce1fdef4e94 x86: AVX512 interp_4tap_vert_ps_8xN for high bit depth i444 Size | AVX2 performance | AVX512 performance -- 8x8 | 19.97x| 28.50x 8x16 | 22.32x| 27.74x 8x32 | 21.73x| 29.04x diff -r 9df6f8ae5130 -r ae3775aa94f3 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Tue Nov 07 11:20:54 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Tue Nov 07 11:53:59 2017 +0530 @@ -2670,6 +2670,9 @@ p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512); +p.chroma[X265_CSP_I444].pu[LUMA_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_vpp = PFX(interp_4tap_vert_pp_32x32_avx512); @@ -2693,6 +2696,10 @@ p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vpp = PFX(interp_4tap_vert_pp_8x64_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512); +p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].filter_vps 
= PFX(interp_4tap_vert_ps_8x64_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = PFX(interp_4tap_vert_pp_32x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vpp = PFX(interp_4tap_vert_pp_32x16_avx512); @@ -2715,6 +2722,9 @@ p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vpp = PFX(interp_4tap_vert_pp_8x8_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = PFX(interp_4tap_vert_pp_8x16_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = PFX(interp_4tap_vert_pp_8x32_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = PFX(interp_4tap_vert_ps_8x8_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = PFX(interp_4tap_vert_ps_8x16_avx512); +p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = PFX(interp_4tap_vert_ps_8x32_avx512); p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vpp = PFX(interp_4tap_vert_pp_24x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].filter_vpp = PFX(interp_4tap_vert_pp_24x64_avx512); diff -r 9df6f8ae5130 -r ae3775aa94f3 source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Tue Nov 07 11:20:54 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Tue Nov 07 11:53:59 2017 +0530 @@ -7342,6 +7342,133 @@ RET %endif +%macro PROCESS_CHROMA_VERT_PS_8x8_AVX512 0 +movu xm1,[r0] +lea r6, [r0 + 2 * r1] +lea r8, [r0 + 4 * r1] +lea r9, [r8 + 2 * r1] +vinserti32x4 m1, [r6],1 +vinserti32x4 m1, [r8],2 +vinserti32x4 m1, [r9],3 +movu xm3,[r0 + r1] +vinserti32x4 m3, [r6 + r1], 1 +vinserti32x4 m3, [r8 + r1], 2 +vinserti32x4 m3, [r9 + r1], 3 +punpcklwd m0, m1, m3 +pmaddwd m0, [r5] +punpckhwd m1, m3 +pmaddwd m1, [r5] + +movu xm4,[r0 + 2 * r1] +vinserti32x4 m4, [r6 + 2 * r1], 1 +vinserti32x4 m4, [r8 + 2 * r1], 2 +vinserti32x4 m4, [r9 + 2 * r1], 3 +punpcklwd m2, m3, m4 +