# HG changeset patch
# User Vignesh Vijayakumar
# Date 1504242228 -19800
#      Fri Sep 01 10:33:48 2017 +0530
# Node ID dbfcd0ee40e9bd4ee351eb064d8aa0819bd9b3fd
# Parent  d9200885420957bccd4edea62bf87bbe8831bc62
x86: AVX512 cleanup interp_4tap_horiz_pp_32xN

diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Sun Aug 13 15:12:25 2017 +0530 +++ b/source/common/x86/asm-primitives.cpp Fri Sep 01 10:33:48 2017 +0530 @@ -4011,22 +4011,29 @@ p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32_avx512); p.cu[BLOCK_32x32].cpy1Dto2D_shl = PFX(cpy1Dto2D_shl_32_avx512); + //i444 chroma_hpp p.chroma[X265_CSP_I444].pu[LUMA_64x64].filter_hpp = PFX(interp_4tap_horiz_pp_64x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x32].filter_hpp = PFX(interp_4tap_horiz_pp_64x32_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x48].filter_hpp = PFX(interp_4tap_horiz_pp_64x48_avx512); p.chroma[X265_CSP_I444].pu[LUMA_64x16].filter_hpp = PFX(interp_4tap_horiz_pp_64x16_avx512); - p.chroma[X265_CSP_I444].pu[LUMA_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); - p.chroma[X265_CSP_I444].pu[LUMA_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); + //i422 chroma_hpp p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].filter_hpp = PFX(interp_4tap_horiz_pp_32x64_avx512); p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].filter_hpp = PFX(interp_4tap_horiz_pp_32x48_avx512); + + //i420 chroma_hpp + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hpp = PFX(interp_4tap_horiz_pp_32x32_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hpp = PFX(interp_4tap_horiz_pp_32x16_avx512); + p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hpp = PFX(interp_4tap_horiz_pp_32x24_avx512); + 
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_hpp = PFX(interp_4tap_horiz_pp_32x8_avx512); + p.weight_pp = PFX(weight_pp_avx512); //i444 chroma_hps diff -r d92008854209 -r dbfcd0ee40e9 source/common/x86/ipfilter8.asm --- a/source/common/x86/ipfilter8.asm Sun Aug 13 15:12:25 2017 +0530 +++ b/source/common/x86/ipfilter8.asm Fri Sep 01 10:33:48 2017 +0530 @@ -150,8 +150,6 @@ const interp4_horiz_shuf_load2_avx512, times 2 db 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 const interp4_horiz_shuf_load3_avx512, times 2 db 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 -const interp4_horiz_shuf_store1_avx512, dd 0 ,8, 1, 9, 4, 12, 5, 13, 2, 10, 3, 11, 6, 14, 7, 15 - ALIGN 64 const interp8_hps_shuf_avx512, dq 0, 4, 1, 5, 2, 6, 3, 7 @@ -9881,31 +9879,30 @@ ; m9 - store shuffle order table movu ym5, [r0] - vinserti32x8 m5, [r0 + 4], 1 + vinserti32x8 m5, [r0 + r1], 1 + movu ym7, [r0 + 4] + vinserti32x8 m7, [r0 + r1 + 4], 1 pshufb m6, m5, m2 - pshufb m5, m5, m1 + pshufb m5, m1 + pshufb m8, m7, m2 + pshufb m7, m1 + pmaddubsw m5, m0 + pmaddubsw m7, m0 + pmaddwd m5, m3 + pmaddwd m7, m3 + pmaddubsw m6, m0 - pmaddwd m5, m3 + pmaddubsw m8, m0 pmaddwd m6, m3 - - movu ym7, [r0 + r1] - vinserti32x8 m7, [r0 + r1 + 4], 1 - - pshufb m8, m7, m2 - pshufb m7, m7, m1 - pmaddubsw m7, m0 - pmaddubsw m8, m0 - pmaddwd m7, m3 pmaddwd m8, m3 - packssdw m5, m6 - packssdw m7, m8 + packssdw m5, m7 + packssdw m6, m8 pmulhrsw m5, m4 - pmulhrsw m7, m4 - packuswb m5, m7 - vpermd m5, m9, m5 + pmulhrsw m6, m4 + packuswb m5, m6 movu [r2], ym5 vextracti32x8 [r2 + r3], m5, 1 %endmacro @@ -9947,7 +9944,7 @@ %macro IPFILTER_CHROMA_PP_32xN_AVX512 1 INIT_ZMM avx512 -cglobal interp_4tap_horiz_pp_32x%1, 4,6,10 +cglobal interp_4tap_horiz_pp_32x%1, 4,6,9 mov r4d, r4m %ifdef PIC @@ -9959,7 +9956,6 @@ vbroadcasti32x8 m1, [interp4_horiz_shuf_load1_avx512] vbroadcasti32x8 m2, [interp4_horiz_shuf_load2_avx512] - movu m9, [interp4_horiz_shuf_store1_avx512] vbroadcasti32x8 m3, [pw_1] vbroadcasti32x8 m4, 
[pw_512]
 dec r0
_______________________________________________
x265-devel mailing list
x265-devel@videolan.org
https://mailman.videolan.org/listinfo/x265-devel