Right, but we may be able to reduce the register count in the future.
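For what it's worth, a hedged sketch of one such saving (not part of the patch, untested): in the pp path the rounding constant stays pinned in m12 for the whole function, but since pw_512 is a full 32-byte constant (the patch loads it with mova), each pmulhrsw could take it as a memory operand instead, and the prologue could then reserve one fewer vector register (cglobal ..., 12 instead of 13):

    INIT_YMM avx2
    ; hypothetical fragment: take the pp rounding constant straight from
    ; memory so m12 stays free for row data
    pmulhrsw m0, [pw_512]    ; (sum * 512 + 0x4000) >> 15  ==  (sum + 32) >> 6
    pmulhrsw m1, [pw_512]    ; same rounding for the next row

The trade-off is a cache read per use, and the ps path would still need a register (or a widened table), since pw_2000 is only broadcast from 16 bytes here.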
At 2015-03-11 14:18:03, "Divya Manivannan" <[email protected]> wrote:
># HG changeset patch
># User Divya Manivannan <[email protected]>
># Date 1426054662 -19800
>#      Wed Mar 11 11:47:42 2015 +0530
># Node ID ad5178906527535e8f0479b7c114fd9a91cdbdb7
># Parent  8f148ac8dbe4b68e88ceff84f40e33b29e888dc9
>asm: avx2 code for filter_vpp[12x16], filter_vps[12x16]: 1151c->1148c, 996c->855c
>
>diff -r 8f148ac8dbe4 -r ad5178906527 source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp	Tue Mar 10 15:46:36 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp	Wed Mar 11 11:47:42 2015 +0530
>@@ -1590,6 +1590,7 @@
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].filter_vpp = x265_interp_4tap_vert_pp_8x6_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vpp = x265_interp_4tap_vert_pp_8x16_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vpp = x265_interp_4tap_vert_pp_8x32_avx2;
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vpp = x265_interp_4tap_vert_pp_12x16_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vpp = x265_interp_4tap_vert_pp_16x8_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vpp = x265_interp_4tap_vert_pp_16x16_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vpp = x265_interp_4tap_vert_pp_32x8_avx2;
>@@ -1609,6 +1610,7 @@
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].filter_vps = x265_interp_4tap_vert_ps_8x8_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].filter_vps = x265_interp_4tap_vert_ps_8x16_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].filter_vps = x265_interp_4tap_vert_ps_8x32_avx2;
>+        p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].filter_vps = x265_interp_4tap_vert_ps_12x16_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].filter_vps = x265_interp_4tap_vert_ps_16x8_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].filter_vps = x265_interp_4tap_vert_ps_4x16_avx2;
>         p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].filter_vps = x265_interp_4tap_vert_ps_16x16_avx2;
>diff -r 8f148ac8dbe4 -r ad5178906527 source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm	Tue Mar 10 15:46:36 2015 +0530
>+++ b/source/common/x86/ipfilter8.asm	Wed Mar 11 11:47:42 2015 +0530
>@@ -4992,6 +4992,312 @@
>
> FILTER_V4_W12_H2 12, 32
>
>+%macro FILTER_VER_CHROMA_AVX2_12x16 1
>+INIT_YMM avx2
>+%if ARCH_X86_64 == 1
>+cglobal interp_4tap_vert_%1_12x16, 4, 7, 13
>+    mov             r4d, r4m
>+    shl             r4d, 6
>+
>+%ifdef PIC
>+    lea             r5, [tab_ChromaCoeffVer_32]
>+    add             r5, r4
>+%else
>+    lea             r5, [tab_ChromaCoeffVer_32 + r4]
>+%endif
>+
>+    lea             r4, [r1 * 3]
>+    sub             r0, r1
>+%ifidn %1,pp
>+    mova            m12, [pw_512]
>+%else
>+    add             r3d, r3d
>+    vbroadcasti128  m12, [pw_2000]
>+%endif
>+    lea             r6, [r3 * 3]
>+
>+    movu            xm0, [r0]                       ; m0 = row 0
>+    movu            xm1, [r0 + r1]                  ; m1 = row 1
>+    punpckhbw       xm2, xm0, xm1
>+    punpcklbw       xm0, xm1
>+    vinserti128     m0, m0, xm2, 1
>+    pmaddubsw       m0, [r5]
>+    movu            xm2, [r0 + r1 * 2]              ; m2 = row 2
>+    punpckhbw       xm3, xm1, xm2
>+    punpcklbw       xm1, xm2
>+    vinserti128     m1, m1, xm3, 1
>+    pmaddubsw       m1, [r5]
>+    movu            xm3, [r0 + r4]                  ; m3 = row 3
>+    punpckhbw       xm4, xm2, xm3
>+    punpcklbw       xm2, xm3
>+    vinserti128     m2, m2, xm4, 1
>+    pmaddubsw       m4, m2, [r5 + 1 * mmsize]
>+    paddw           m0, m4
>+    pmaddubsw       m2, [r5]
>+    lea             r0, [r0 + r1 * 4]
>+    movu            xm4, [r0]                       ; m4 = row 4
>+    punpckhbw       xm5, xm3, xm4
>+    punpcklbw       xm3, xm4
>+    vinserti128     m3, m3, xm5, 1
>+    pmaddubsw       m5, m3, [r5 + 1 * mmsize]
>+    paddw           m1, m5
>+    pmaddubsw       m3, [r5]
>+    movu            xm5, [r0 + r1]                  ; m5 = row 5
>+    punpckhbw       xm6, xm4, xm5
>+    punpcklbw       xm4, xm5
>+    vinserti128     m4, m4, xm6, 1
>+    pmaddubsw       m6, m4, [r5 + 1 * mmsize]
>+    paddw           m2, m6
>+    pmaddubsw       m4, [r5]
>+    movu            xm6, [r0 + r1 * 2]              ; m6 = row 6
>+    punpckhbw       xm7, xm5, xm6
>+    punpcklbw       xm5, xm6
>+    vinserti128     m5, m5, xm7, 1
>+    pmaddubsw       m7, m5, [r5 + 1 * mmsize]
>+    paddw           m3, m7
>+    pmaddubsw       m5, [r5]
>+    movu            xm7, [r0 + r4]                  ; m7 = row 7
>+    punpckhbw       xm8, xm6, xm7
>+    punpcklbw       xm6, xm7
>+    vinserti128     m6, m6, xm8, 1
>+    pmaddubsw       m8, m6, [r5 + 1 * mmsize]
>+    paddw           m4, m8
>+    pmaddubsw       m6, [r5]
>+    lea             r0, [r0 + r1 * 4]
>+    movu            xm8, [r0]                       ; m8 = row 8
>+    punpckhbw       xm9, xm7, xm8
>+    punpcklbw       xm7, xm8
>+    vinserti128     m7, m7, xm9, 1
>+    pmaddubsw       m9, m7, [r5 + 1 * mmsize]
>+    paddw           m5, m9
>+    pmaddubsw       m7, [r5]
>+    movu            xm9, [r0 + r1]                  ; m9 = row 9
>+    punpckhbw       xm10, xm8, xm9
>+    punpcklbw       xm8, xm9
>+    vinserti128     m8, m8, xm10, 1
>+    pmaddubsw       m10, m8, [r5 + 1 * mmsize]
>+    paddw           m6, m10
>+    pmaddubsw       m8, [r5]
>+    movu            xm10, [r0 + r1 * 2]             ; m10 = row 10
>+    punpckhbw       xm11, xm9, xm10
>+    punpcklbw       xm9, xm10
>+    vinserti128     m9, m9, xm11, 1
>+    pmaddubsw       m11, m9, [r5 + 1 * mmsize]
>+    paddw           m7, m11
>+    pmaddubsw       m9, [r5]
>+
>+%ifidn %1,pp
>+    pmulhrsw        m0, m12                         ; m0 = word: row 0
>+    pmulhrsw        m1, m12                         ; m1 = word: row 1
>+    pmulhrsw        m2, m12                         ; m2 = word: row 2
>+    pmulhrsw        m3, m12                         ; m3 = word: row 3
>+    pmulhrsw        m4, m12                         ; m4 = word: row 4
>+    pmulhrsw        m5, m12                         ; m5 = word: row 5
>+    pmulhrsw        m6, m12                         ; m6 = word: row 6
>+    pmulhrsw        m7, m12                         ; m7 = word: row 7
>+    packuswb        m0, m1
>+    packuswb        m2, m3
>+    packuswb        m4, m5
>+    packuswb        m6, m7
>+    vpermq          m0, m0, 11011000b
>+    vpermq          m2, m2, 11011000b
>+    vpermq          m4, m4, 11011000b
>+    vpermq          m6, m6, 11011000b
>+    vextracti128    xm1, m0, 1
>+    vextracti128    xm3, m2, 1
>+    vextracti128    xm5, m4, 1
>+    vextracti128    xm7, m6, 1
>+    movq            [r2], xm0
>+    pextrd          [r2 + 8], xm0, 2
>+    movq            [r2 + r3], xm1
>+    pextrd          [r2 + r3 + 8], xm1, 2
>+    movq            [r2 + r3 * 2], xm2
>+    pextrd          [r2 + r3 * 2 + 8], xm2, 2
>+    movq            [r2 + r6], xm3
>+    pextrd          [r2 + r6 + 8], xm3, 2
>+    lea             r2, [r2 + r3 * 4]
>+    movq            [r2], xm4
>+    pextrd          [r2 + 8], xm4, 2
>+    movq            [r2 + r3], xm5
>+    pextrd          [r2 + r3 + 8], xm5, 2
>+    movq            [r2 + r3 * 2], xm6
>+    pextrd          [r2 + r3 * 2 + 8], xm6, 2
>+    movq            [r2 + r6], xm7
>+    pextrd          [r2 + r6 + 8], xm7, 2
>+%else
>+    psubw           m0, m12                         ; m0 = word: row 0
>+    psubw           m1, m12                         ; m1 = word: row 1
>+    psubw           m2, m12                         ; m2 = word: row 2
>+    psubw           m3, m12                         ; m3 = word: row 3
>+    psubw           m4, m12                         ; m4 = word: row 4
>+    psubw           m5, m12                         ; m5 = word: row 5
>+    psubw           m6, m12                         ; m6 = word: row 6
>+    psubw           m7, m12                         ; m7 = word: row 7
>+    movu            [r2], xm0
>+    vextracti128    xm0, m0, 1
>+    movq            [r2 + 16], xm0
>+    movu            [r2 + r3], xm1
>+    vextracti128    xm1, m1, 1
>+    movq            [r2 + r3 + 16], xm1
>+    movu            [r2 + r3 * 2], xm2
>+    vextracti128    xm2, m2, 1
>+    movq            [r2 + r3 * 2 + 16], xm2
>+    movu            [r2 + r6], xm3
>+    vextracti128    xm3, m3, 1
>+    movq            [r2 + r6 + 16], xm3
>+    lea             r2, [r2 + r3 * 4]
>+    movu            [r2], xm4
>+    vextracti128    xm4, m4, 1
>+    movq            [r2 + 16], xm4
>+    movu            [r2 + r3], xm5
>+    vextracti128    xm5, m5, 1
>+    movq            [r2 + r3 + 16], xm5
>+    movu            [r2 + r3 * 2], xm6
>+    vextracti128    xm6, m6, 1
>+    movq            [r2 + r3 * 2 + 16], xm6
>+    movu            [r2 + r6], xm7
>+    vextracti128    xm7, m7, 1
>+    movq            [r2 + r6 + 16], xm7
>+%endif
>+    lea             r2, [r2 + r3 * 4]
>+
>+    movu            xm11, [r0 + r4]                 ; m11 = row 11
>+    punpckhbw       xm6, xm10, xm11
>+    punpcklbw       xm10, xm11
>+    vinserti128     m10, m10, xm6, 1
>+    pmaddubsw       m6, m10, [r5 + 1 * mmsize]
>+    paddw           m8, m6
>+    pmaddubsw       m10, [r5]
>+    lea             r0, [r0 + r1 * 4]
>+    movu            xm6, [r0]                       ; m6 = row 12
>+    punpckhbw       xm7, xm11, xm6
>+    punpcklbw       xm11, xm6
>+    vinserti128     m11, m11, xm7, 1
>+    pmaddubsw       m7, m11, [r5 + 1 * mmsize]
>+    paddw           m9, m7
>+    pmaddubsw       m11, [r5]
>+    movu            xm7, [r0 + r1]                  ; m7 = row 13
>+    punpckhbw       xm0, xm6, xm7
>+    punpcklbw       xm6, xm7
>+    vinserti128     m6, m6, xm0, 1
>+    pmaddubsw       m0, m6, [r5 + 1 * mmsize]
>+    paddw           m10, m0
>+    pmaddubsw       m6, [r5]
>+    movu            xm0, [r0 + r1 * 2]              ; m0 = row 14
>+    punpckhbw       xm1, xm7, xm0
>+    punpcklbw       xm7, xm0
>+    vinserti128     m7, m7, xm1, 1
>+    pmaddubsw       m1, m7, [r5 + 1 * mmsize]
>+    paddw           m11, m1
>+    pmaddubsw       m7, [r5]
>+    movu            xm1, [r0 + r4]                  ; m1 = row 15
>+    punpckhbw       xm2, xm0, xm1
>+    punpcklbw       xm0, xm1
>+    vinserti128     m0, m0, xm2, 1
>+    pmaddubsw       m2, m0, [r5 + 1 * mmsize]
>+    paddw           m6, m2
>+    pmaddubsw       m0, [r5]
>+    lea             r0, [r0 + r1 * 4]
>+    movu            xm2, [r0]                       ; m2 = row 16
>+    punpckhbw       xm3, xm1, xm2
>+    punpcklbw       xm1, xm2
>+    vinserti128     m1, m1, xm3, 1
>+    pmaddubsw       m3, m1, [r5 + 1 * mmsize]
>+    paddw           m7, m3
>+    pmaddubsw       m1, [r5]
>+    movu            xm3, [r0 + r1]                  ; m3 = row 17
>+    punpckhbw       xm4, xm2, xm3
>+    punpcklbw       xm2, xm3
>+    vinserti128     m2, m2, xm4, 1
>+    pmaddubsw       m2, [r5 + 1 * mmsize]
>+    paddw           m0, m2
>+    movu            xm4, [r0 + r1 * 2]              ; m4 = row 18
>+    punpckhbw       xm5, xm3, xm4
>+    punpcklbw       xm3, xm4
>+    vinserti128     m3, m3, xm5, 1
>+    pmaddubsw       m3, [r5 + 1 * mmsize]
>+    paddw           m1, m3
>+
>+%ifidn %1,pp
>+    pmulhrsw        m8, m12                         ; m8 = word: row 8
>+    pmulhrsw        m9, m12                         ; m9 = word: row 9
>+    pmulhrsw        m10, m12                        ; m10 = word: row 10
>+    pmulhrsw        m11, m12                        ; m11 = word: row 11
>+    pmulhrsw        m6, m12                         ; m6 = word: row 12
>+    pmulhrsw        m7, m12                         ; m7 = word: row 13
>+    pmulhrsw        m0, m12                         ; m0 = word: row 14
>+    pmulhrsw        m1, m12                         ; m1 = word: row 15
>+    packuswb        m8, m9
>+    packuswb        m10, m11
>+    packuswb        m6, m7
>+    packuswb        m0, m1
>+    vpermq          m8, m8, 11011000b
>+    vpermq          m10, m10, 11011000b
>+    vpermq          m6, m6, 11011000b
>+    vpermq          m0, m0, 11011000b
>+    vextracti128    xm9, m8, 1
>+    vextracti128    xm11, m10, 1
>+    vextracti128    xm7, m6, 1
>+    vextracti128    xm1, m0, 1
>+    movq            [r2], xm8
>+    pextrd          [r2 + 8], xm8, 2
>+    movq            [r2 + r3], xm9
>+    pextrd          [r2 + r3 + 8], xm9, 2
>+    movq            [r2 + r3 * 2], xm10
>+    pextrd          [r2 + r3 * 2 + 8], xm10, 2
>+    movq            [r2 + r6], xm11
>+    pextrd          [r2 + r6 + 8], xm11, 2
>+    lea             r2, [r2 + r3 * 4]
>+    movq            [r2], xm6
>+    pextrd          [r2 + 8], xm6, 2
>+    movq            [r2 + r3], xm7
>+    pextrd          [r2 + r3 + 8], xm7, 2
>+    movq            [r2 + r3 * 2], xm0
>+    pextrd          [r2 + r3 * 2 + 8], xm0, 2
>+    movq            [r2 + r6], xm1
>+    pextrd          [r2 + r6 + 8], xm1, 2
>+%else
>+    psubw           m8, m12                         ; m8 = word: row 8
>+    psubw           m9, m12                         ; m9 = word: row 9
>+    psubw           m10, m12                        ; m10 = word: row 10
>+    psubw           m11, m12                        ; m11 = word: row 11
>+    psubw           m6, m12                         ; m6 = word: row 12
>+    psubw           m7, m12                         ; m7 = word: row 13
>+    psubw           m0, m12                         ; m0 = word: row 14
>+    psubw           m1, m12                         ; m1 = word: row 15
>+    movu            [r2], xm8
>+    vextracti128    xm8, m8, 1
>+    movq            [r2 + 16], xm8
>+    movu            [r2 + r3], xm9
>+    vextracti128    xm9, m9, 1
>+    movq            [r2 + r3 + 16], xm9
>+    movu            [r2 + r3 * 2], xm10
>+    vextracti128    xm10, m10, 1
>+    movq            [r2 + r3 * 2 + 16], xm10
>+    movu            [r2 + r6], xm11
>+    vextracti128    xm11, m11, 1
>+    movq            [r2 + r6 + 16], xm11
>+    lea             r2, [r2 + r3 * 4]
>+    movu            [r2], xm6
>+    vextracti128    xm6, m6, 1
>+    movq            [r2 + 16], xm6
>+    movu            [r2 + r3], xm7
>+    vextracti128    xm7, m7, 1
>+    movq            [r2 + r3 + 16], xm7
>+    movu            [r2 + r3 * 2], xm0
>+    vextracti128    xm0, m0, 1
>+    movq            [r2 + r3 * 2 + 16], xm0
>+    movu            [r2 + r6], xm1
>+    vextracti128    xm1, m1, 1
>+    movq            [r2 + r6 + 16], xm1
>+%endif
>+    RET
>+%endif
>+%endmacro
>+
>+FILTER_VER_CHROMA_AVX2_12x16 pp
>+FILTER_VER_CHROMA_AVX2_12x16 ps
>+
> ;-----------------------------------------------------------------------------
> ; void interp_4tap_vert_pp_16x%2(pixel *src, intptr_t srcStride, pixel *dst, intptr_t dstStride, int coeffIdx)
> ;-----------------------------------------------------------------------------
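For reference, here is a minimal scalar model of what the two new primitives compute at 8-bit depth (a sketch only; the names vertPPRef/vertPSRef and their signatures are illustrative, not x265's actual primitive interface). The four signed taps come from the chroma coefficient table and sum to 64; vpp rounds the 4-tap sum back to pixels, while vps keeps the 16-bit intermediate minus the 8192 offset that the psubw-with-pw_2000 path applies:

    #include <algorithm>
    #include <cstdint>

    // pp: filtered pixel back to 8 bits, i.e. what pmulhrsw with pw_512 plus
    // packuswb implements: (sum + 32) >> 6, clipped to [0, 255].
    static void vertPPRef(const uint8_t* src, intptr_t srcStride,
                          uint8_t* dst, intptr_t dstStride,
                          const int8_t c[4], int width, int height)
    {
        src -= srcStride;  // taps cover rows y-1..y+2 ('sub r0, r1' in the asm)
        for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
            {
                int sum = src[x] * c[0] + src[x + srcStride] * c[1]
                        + src[x + 2 * srcStride] * c[2] + src[x + 3 * srcStride] * c[3];
                dst[x] = (uint8_t)std::min(255, std::max(0, (sum + 32) >> 6));
            }
    }

    // ps: keep the 16-bit intermediate minus 8192; dstStride is in int16_t
    // units, which is why the asm doubles r3d for this variant.
    static void vertPSRef(const uint8_t* src, intptr_t srcStride,
                          int16_t* dst, intptr_t dstStride,
                          const int8_t c[4], int width, int height)
    {
        src -= srcStride;
        for (int y = 0; y < height; y++, src += srcStride, dst += dstStride)
            for (int x = 0; x < width; x++)
            {
                int sum = src[x] * c[0] + src[x + srcStride] * c[1]
                        + src[x + 2 * srcStride] * c[2] + src[x + 3 * srcStride] * c[3];
                dst[x] = (int16_t)(sum - 8192);
            }
    }

For the primitives added here, width is 12 and height is 16, matching the 16-row unroll in the macro above.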
