-------- Forwarding messages -------- From: chen <[email protected]> Date: 2015-03-03 01:09:18 To: [email protected] Cc: "Praveen Tiwari" <[email protected]> Subject: Re:[PATCH] asm-ssse3: filter_p2s[12x16](9.64x), filter_p2s[24x32](10.30x), At 2015-03-02 13:24:18,[email protected] wrote: ># HG changeset patch ># User Rajesh Paulraj<[email protected]> ># Date 1425273779 -19800 ># Mon Mar 02 10:52:59 2015 +0530 ># Node ID 70be3fa2ee550ec1b954c420e3c7a915589163a7 ># Parent 018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc >asm-ssse3: filter_p2s[12x16](9.64x), filter_p2s[24x32](10.30x), > filter_p2s[48x64](7.79x) > >diff -r 018e8bbaa854 -r 70be3fa2ee55 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Fri Feb 27 11:46:09 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Mon Mar 02 10:52:59 2015 +0530 >@@ -1272,6 +1272,9 @@ > p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3; > p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3; > p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3; >+ p.pu[LUMA_12x16].filter_p2s = x265_pixelToShort_12x16_ssse3; >+ p.pu[LUMA_24x32].filter_p2s = x265_pixelToShort_24x32_ssse3; >+ p.pu[LUMA_48x64].filter_p2s = x265_pixelToShort_48x64_ssse3; > > p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3; > p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3; >diff -r 018e8bbaa854 -r 70be3fa2ee55 source/common/x86/ipfilter8.asm >--- a/source/common/x86/ipfilter8.asm Fri Feb 27 11:46:09 2015 +0530 >+++ b/source/common/x86/ipfilter8.asm Mon Mar 02 10:52:59 2015 +0530 >@@ -5416,8 +5416,9 @@ > FILTER_V4_W16n_H2 64, 48 > FILTER_V4_W16n_H2 48, 64 > FILTER_V4_W16n_H2 64, 16 >-;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+ >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > 
;----------------------------------------------------------------------------- > %macro PIXEL_WH_4xN 2 > INIT_XMM ssse3 >@@ -5480,7 +5481,7 @@ > PIXEL_WH_4xN 4, 16 > > ;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_8xN 2 > INIT_XMM ssse3 >@@ -5541,9 +5542,8 @@ > PIXEL_WH_8xN 8, 16 > PIXEL_WH_8xN 8, 32 > >- >-;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_16xN 2 > INIT_XMM ssse3 >@@ -5607,7 +5607,7 @@ > PIXEL_WH_16xN 16, 64 > > ;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_32xN 2 > INIT_XMM ssse3 >@@ -5670,7 +5670,7 @@ > PIXEL_WH_32xN 32, 64 > > ;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_64xN 2 > INIT_XMM ssse3 >@@ -5731,6 +5731,173 @@ > PIXEL_WH_64xN 64, 32 > PIXEL_WH_64xN 64, 48 > 
>+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+INIT_XMM ssse3 >+cglobal pixelToShort_12x16, 3, 7, 6 where are r3, r4 and r5 used? >+ >+ ; load constant >+ mova m4, [pb_128] >+ mova m5, [tab_c_64_n64] >+ >+%rep 4 >+ mov r6, r0 >+ >+ movu m0, [r6] >+ movu m1, m0 >+ punpcklbw m1, m0, m4 >+ punpckhbw m0, m4 >+ pmaddubsw m0, m5 >+ pmaddubsw m1, m5 >+ >+ movu m2, [r6 + r1] >+ movu m3, m2 >+ punpcklbw m3, m2, m4 >+ punpckhbw m2, m4 >+ pmaddubsw m2, m5 >+ pmaddubsw m3, m5 >+ >+ movu [r2 + FENC_STRIDE * 0], m1 >+ movu [r2 + FENC_STRIDE * 2], m3 >+ >+ movh [r2 + FENC_STRIDE * 0 + 16], m0 >+ movh [r2 + FENC_STRIDE * 2 + 16], m2 >+ >+ movu m0, [r6 + r1 * 2] ---> >+ movu m1, m0 >+ punpcklbw m1, m0, m4 >+ punpckhbw m0, m4 >+ pmaddubsw m0, m5 >+ pmaddubsw m1, m5 >+ >+ lea r6, [r6 + r1 * 2] why not reuse this with the line flagged above? --> >+ movu m2, [r6 + r1] you only want r1 * 3 here; you have many free registers, so you don't need to set r6 as an alias of r0 >+ movu m3, m2 >+ punpcklbw m3, m2, m4 >+ punpckhbw m2, m4 >+ pmaddubsw m2, m5 >+ pmaddubsw m3, m5 >+ >+ movu [r2 + FENC_STRIDE * 4], m1 >+ movu [r2 + FENC_STRIDE * 6], m3 >+ >+ movh [r2 + FENC_STRIDE * 4 + 16], m0 >+ movh [r2 + FENC_STRIDE * 6 + 16], m2 >+ >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+%endrep >+ RET
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
