# HG changeset patch
# User Vignesh Vijayakumar <vign...@multicorewareinc.com>
# Date 1522981507 25200
#      Thu Apr 05 19:25:07 2018 -0700
# Node ID 75d5a01d97daad790cecd35b40ff4b0e4cc34cac
# Parent  ddd64f4b2ff382d05e86708750b20332ed93f3c9
x86: AVX512 optimise chroma_hps_16xN for high bit depth
diff -r ddd64f4b2ff3 -r 75d5a01d97da source/common/x86/ipfilter16.asm --- a/source/common/x86/ipfilter16.asm Fri Dec 08 14:29:33 2017 +0530 +++ b/source/common/x86/ipfilter16.asm Thu Apr 05 19:25:07 2018 -0700 @@ -160,12 +160,15 @@ times 16 dw 58, -10 times 16 dw 4, -1 -const interp8_hpp_shuf1_load_avx512, times 2 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 - -const interp8_hpp_shuf2_load_avx512, times 2 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 - -const interp8_hpp_shuf1_store_avx512, times 2 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 - +ALIGN 64 +const interp8_hpp_shuf1_load_avx512, times 4 db 0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9 + +ALIGN 64 +const interp8_hpp_shuf2_load_avx512, times 4 db 4, 5, 6, 7, 8, 9, 10, 11, 6, 7, 8, 9, 10, 11, 12, 13 + +ALIGN 64 +const interp8_hpp_shuf1_store_avx512, times 4 db 0, 1, 4, 5, 2, 3, 6, 7, 8, 9, 12, 13, 10, 11, 14, 15 + SECTION .text cextern pd_8 cextern pd_32 @@ -7135,32 +7138,23 @@ movu [r2], ym6 vextracti32x8 [r2 + r3], m6, 1 %endmacro - %macro PROCESS_IPFILTER_CHROMA_PS_16x1_AVX512 0 movu ym6, [r0] - movu ym7, [r0 + 8] - - pshufb ym8, ym6, ym3 - pshufb ym6, ym2 - pmaddwd ym6, ym0 - pmaddwd ym8, ym1 - paddd ym6, ym8 - paddd ym6, ym4 - psrad ym6, INTERP_SHIFT_PS - - pshufb ym8, ym7, ym3 - pshufb ym7, ym2 - pmaddwd ym7, ym0 - pmaddwd ym8, ym1 - paddd ym7, ym8 - paddd ym7, ym4 - psrad ym7, INTERP_SHIFT_PS - - packssdw ym6, ym7 - pshufb ym6, ym5 - movu [r2], ym6 -%endmacro - + vinserti32x8 m6, [r0 + 8], 1 + + pshufb m8, m6, m3 + pshufb m6, m2 + pmaddwd m6, m0 + pmaddwd m8, m1 + paddd m6, m8 + paddd m6, m4 + psrad m6, INTERP_SHIFT_PS + + vextracti32x8 ym7, m6, 1 + packssdw ym6, ym7 + pshufb ym6, ym5 + movu [r2], ym6 +%endmacro %macro IPFILTER_CHROMA_PS_AVX512_16xN 1 %if ARCH_X86_64 == 1 INIT_ZMM avx512 @@ -7177,10 +7171,10 @@ vpbroadcastd m0, [tab_ChromaCoeff + r4 * 8] vpbroadcastd m1, [tab_ChromaCoeff + r4 * 8 + 4] %endif - vbroadcasti32x8 m2, [interp8_hpp_shuf1_load_avx512] - 
vbroadcasti32x8 m3, [interp8_hpp_shuf2_load_avx512] + mova m2, [interp8_hpp_shuf1_load_avx512] + mova m3, [interp8_hpp_shuf2_load_avx512] vbroadcasti32x4 m4, [INTERP_OFFSET_PS] - vbroadcasti32x8 m5,[interp8_hpp_shuf1_store_avx512] + mova m5, [interp8_hpp_shuf1_store_avx512] mov r6d, %1 sub r0, 2 test r5d, r5d _______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel