At 2015-03-10 22:05:25,[email protected] wrote: ># HG changeset patch ># User Rajesh Paulraj<[email protected]> ># Date 1425996251 -19800 ># Tue Mar 10 19:34:11 2015 +0530 ># Node ID fdfd37fe64245837628ae0445749811a281e3aae ># Parent 2dc6b50681ccc8b3a5123ea02728786de9aca7a4 >asm: avx2 8bpp code for filter_p2s[4x4](2.26x), filter_p2s[4x8](3.01x), > filter_p2s[4x16](3.00x) > >diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Tue Mar 10 18:41:56 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Tue Mar 10 19:34:11 2015 +0530 >@@ -1479,6 +1479,10 @@ > p.cu[BLOCK_16x16].calcresidual = x265_getResidual16_avx2; > p.cu[BLOCK_32x32].calcresidual = x265_getResidual32_avx2; > >+ p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_avx2; >+ p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_avx2; >+ p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_avx2; >+ > p.scale1D_128to64 = x265_scale1D_128to64_avx2; > p.weight_pp = x265_weight_pp_avx2; > >diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/ipfilter8.asm >--- a/source/common/x86/ipfilter8.asm Tue Mar 10 18:41:56 2015 +0530 >+++ b/source/common/x86/ipfilter8.asm Tue Mar 10 19:34:11 2015 +0530 >@@ -6027,6 +6027,51 @@ > PIXEL_WH_4xN 4, 4 > PIXEL_WH_4xN 4, 8 > PIXEL_WH_4xN 4, 16 >+ >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+%macro P2S_H_4xN_avx2 1 >+INIT_YMM avx2 >+cglobal pixelToShort_4x%1, 3, 5, 6 >+ >+ ; load height >+ mov r3d, %1 >+ >+ ; load constant >+ vbroadcasti128 m4, [pb_128] >+ vbroadcasti128 m5, [tab_c_64_n64] >+ >+%rep %1 / 4 >+ movd xm0, [r0] >+ movd xm1, [r0 + r1] >+ movd xm2, [r0 + r1 * 2] >+ lea r4, [r1 * 3] >+ movd xm3, [r0 + r4] >+ >+ punpckldq m0, m0, m1 >+ punpckldq m2, m2 , m3 >+ punpcklbw m0, m4 >+ punpcklbw m2, m4 >+ vinserti128 m2, m0, xm2, 1 
>+ pmaddubsw m2,m5 >+ >+ movq [r2 + FENC_STRIDE * 0], xm2 >+ movhps [r2 + FENC_STRIDE * 2], xm2 >+ vextracti128 xm2, m2, 1
The vinserti128 + pmaddubsw + vextracti128 sequence is slower than simply issuing two pmaddubsw instructions, so I don't see any improvement from this code here. >+ movq [r2 + FENC_STRIDE * 4], xm2 >+ movhps [r2 + FENC_STRIDE * 6], xm2 >+ >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+ >+%endrep >+ RET >+%endmacro >+P2S_H_4xN_avx2 4 >+P2S_H_4xN_avx2 8 >+P2S_H_4xN_avx2 16 >+ > ;----------------------------------------------------------------------------- > ; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- >@@ -6528,7 +6573,7 @@ > movu [r2 + FENC_STRIDE * 6 + 64], m7 > movu [r2 + FENC_STRIDE * 6 + 80], m6 > >- lea r0, [r0 + r1 * 4] >+ lea r0, [r0 + r1 * 4] > add r2, FENC_STRIDE * 8 > %endrep > RET >diff -r 2dc6b50681cc -r fdfd37fe6424 source/common/x86/ipfilter8.h >--- a/source/common/x86/ipfilter8.h Tue Mar 10 18:41:56 2015 +0530 >+++ b/source/common/x86/ipfilter8.h Tue Mar 10 19:34:11 2015 +0530 >@@ -646,6 +646,10 @@ > void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, > int16_t* dst); > void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, > int16_t* dst); > >+void x265_pixelToShort_4x4_avx2(const pixel* src, intptr_t srcStride, >int16_t* dst); >+void x265_pixelToShort_4x8_avx2(const pixel* src, intptr_t srcStride, >int16_t* dst); >+void x265_pixelToShort_4x16_avx2(const pixel* src, intptr_t srcStride, >int16_t* dst); >+ > #undef LUMA_FILTERS > #undef LUMA_SP_FILTERS > #undef LUMA_SS_FILTERS >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
