Fully unrolling is a bad choice here: it results in poor cache performance. I suggest unrolling only 2 rows instead; e.g., for 24x32 you could process 64 or 128 pixels per iteration.
At 2015-03-09 13:40:56,[email protected] wrote: ># HG changeset patch ># User Rajesh Paulraj<[email protected]> ># Date 1425879590 -19800 ># Mon Mar 09 11:09:50 2015 +0530 ># Node ID 38ea9788d3e652d6fd53518b3943b636d55bb0b4 ># Parent 043c2418864b0a3ada6f597e6def6ead73d90b5f >asm-ssse3: filter_p2s[12x16](10.41x), filter_p2s[24x32](13.26x), > filter_p2s[48x64](8.43x) > >diff -r 043c2418864b -r 38ea9788d3e6 source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Fri Mar 06 13:15:55 2015 -0600 >+++ b/source/common/x86/asm-primitives.cpp Mon Mar 09 11:09:50 2015 +0530 >@@ -1260,7 +1260,9 @@ > p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_ssse3; > p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_ssse3; > p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_ssse3; >- >+ p.pu[LUMA_12x16].filter_p2s = x265_pixelToShort_12x16_ssse3; >+ p.pu[LUMA_24x32].filter_p2s = x265_pixelToShort_24x32_ssse3; >+ p.pu[LUMA_48x64].filter_p2s = x265_pixelToShort_48x64_ssse3; > p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_ssse3; > p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_ssse3; > >diff -r 043c2418864b -r 38ea9788d3e6 source/common/x86/ipfilter8.asm >--- a/source/common/x86/ipfilter8.asm Fri Mar 06 13:15:55 2015 -0600 >+++ b/source/common/x86/ipfilter8.asm Mon Mar 09 11:09:50 2015 +0530 >@@ -4,6 +4,7 @@ > ;* Authors: Min Chen <[email protected]> > ;* Nabajit Deka <[email protected]> > ;* Praveen Kumar Tiwari <[email protected]> >+;* Rajesh Paulraj <[email protected]> > ;* > ;* This program is free software; you can redistribute it and/or modify > ;* it under the terms of the GNU General Public License as published by >@@ -5811,8 +5812,9 @@ > FILTER_V4_W16n_H2 64, 48 > FILTER_V4_W16n_H2 48, 64 > FILTER_V4_W16n_H2 64, 16 >-;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+ 
>+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_4xN 2 > INIT_XMM ssse3 >@@ -5873,9 +5875,8 @@ > PIXEL_WH_4xN 4, 4 > PIXEL_WH_4xN 4, 8 > PIXEL_WH_4xN 4, 16 >- >-;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_8xN 2 > INIT_XMM ssse3 >@@ -5935,10 +5936,8 @@ > PIXEL_WH_8xN 8, 4 > PIXEL_WH_8xN 8, 16 > PIXEL_WH_8xN 8, 32 >- >- >-;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_16xN 2 > INIT_XMM ssse3 >@@ -6000,9 +5999,8 @@ > PIXEL_WH_16xN 16, 12 > PIXEL_WH_16xN 16, 32 > PIXEL_WH_16xN 16, 64 >- >-;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_32xN 2 > INIT_XMM ssse3 >@@ -6063,9 +6061,8 @@ > PIXEL_WH_32xN 32, 16 > PIXEL_WH_32xN 32, 24 > PIXEL_WH_32xN 32, 64 >- 
>-;----------------------------------------------------------------------------- >-; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst, int width, >int height) >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) > ;----------------------------------------------------------------------------- > %macro PIXEL_WH_64xN 2 > INIT_XMM ssse3 >@@ -6126,6 +6123,148 @@ > PIXEL_WH_64xN 64, 32 > PIXEL_WH_64xN 64, 48 > >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+INIT_XMM ssse3 >+cglobal pixelToShort_12x16, 3, 4, 6 >+ >+ ; load constant >+ mova m4, [pb_128] >+ mova m5, [tab_c_64_n64] >+ >+%rep 4 >+ >+ movu m0, [r0] >+ punpcklbw m1, m0, m4 >+ punpckhbw m0, m4 >+ pmaddubsw m0, m5 >+ pmaddubsw m1, m5 >+ >+ movu m2, [r0 + r1] >+ punpcklbw m3, m2, m4 >+ punpckhbw m2, m4 >+ pmaddubsw m2, m5 >+ pmaddubsw m3, m5 >+ >+ movu [r2 + FENC_STRIDE * 0], m1 >+ movu [r2 + FENC_STRIDE * 2], m3 >+ >+ movh [r2 + FENC_STRIDE * 0 + 16], m0 >+ movh [r2 + FENC_STRIDE * 2 + 16], m2 >+ >+ movu m0, [r0 + r1 * 2] >+ punpcklbw m1, m0, m4 >+ punpckhbw m0, m4 >+ pmaddubsw m0, m5 >+ pmaddubsw m1, m5 >+ >+ lea r3, [r0 + r1 * 2] >+ movu m2, [r3 + r1] >+ punpcklbw m3, m2, m4 >+ punpckhbw m2, m4 >+ pmaddubsw m2, m5 >+ pmaddubsw m3, m5 >+ >+ movu [r2 + FENC_STRIDE * 4], m1 >+ movu [r2 + FENC_STRIDE * 6], m3 >+ >+ movh [r2 + FENC_STRIDE * 4 + 16], m0 >+ movh [r2 + FENC_STRIDE * 6 + 16], m2 >+ >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+%endrep >+ RET >+ >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+INIT_XMM ssse3 >+cglobal 
pixelToShort_24x32, 3, 4, 6 >+ >+ ; load constant >+ mova m4, [pb_128] >+ mova m5, [tab_c_64_n64] >+ >+%rep 8 >+%assign x 0 >+%rep 3 >+ lea r3, [r0 + x] >+ >+ movh m0, [r3] >+ punpcklbw m0, m4 >+ pmaddubsw m0, m5 >+ >+ movh m1, [r3 + r1] >+ punpcklbw m1, m4 >+ pmaddubsw m1, m5 >+ >+ movh m2, [r3 + r1 * 2] >+ punpcklbw m2, m4 >+ pmaddubsw m2, m5 >+ >+ lea r3, [r3 + r1 * 2] >+ movh m3, [r3 + r1] >+ punpcklbw m3, m4 >+ pmaddubsw m3, m5 >+ >+%assign x x+8 >+ >+ movu [r2 + x * 2 + FENC_STRIDE * 0 - 16], m0 >+ movu [r2 + x * 2 + FENC_STRIDE * 2 - 16], m1 >+ movu [r2 + x * 2 + FENC_STRIDE * 4 - 16], m2 >+ movu [r2 + x * 2 + FENC_STRIDE * 6 - 16], m3 >+%endrep >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+%endrep >+ RET >+ >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+INIT_XMM ssse3 >+cglobal pixelToShort_48x64, 3, 4, 6 >+ >+ ; load constant >+ mova m4, [pb_128] >+ mova m5, [tab_c_64_n64] >+ >+%rep 16 >+%assign x 0 >+%rep 6 >+ lea r3, [r0 + x] >+ >+ movh m0, [r3] >+ punpcklbw m0, m4 >+ pmaddubsw m0, m5 >+ >+ movh m1, [r3 + r1] >+ punpcklbw m1, m4 >+ pmaddubsw m1, m5 >+ >+ movh m2, [r3 + r1 * 2] >+ punpcklbw m2, m4 >+ pmaddubsw m2, m5 >+ >+ lea r3, [r3 + r1 * 2] >+ movh m3, [r3 + r1] >+ punpcklbw m3, m4 >+ pmaddubsw m3, m5 >+ >+%assign x x+8 >+ >+ movu [r2 + x * 2 + FENC_STRIDE * 0 - 16], m0 >+ movu [r2 + x * 2 + FENC_STRIDE * 2 - 16], m1 >+ movu [r2 + x * 2 + FENC_STRIDE * 4 - 16], m2 >+ movu [r2 + x * 2 + FENC_STRIDE * 6 - 16], m3 >+%endrep >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+%endrep >+ RET >+ > %macro PROCESS_LUMA_W4_4R 0 > movd m0, [r0] > movd m1, [r0 + r1] >diff -r 043c2418864b -r 38ea9788d3e6 source/common/x86/ipfilter8.h >--- a/source/common/x86/ipfilter8.h Fri Mar 06 13:15:55 2015 -0600 >+++ b/source/common/x86/ipfilter8.h Mon Mar 09 11:09:50 2015 +0530 >@@ 
-642,6 +642,10 @@ > void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, > int16_t* dst); > void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, > int16_t* dst); > void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, > int16_t* dst); >+void x265_pixelToShort_12x16_ssse3(const pixel* src, intptr_t srcStride, >int16_t* dst); >+void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, >int16_t* dst); >+void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, >int16_t* dst); >+ > #undef LUMA_FILTERS > #undef LUMA_SP_FILTERS > #undef LUMA_SS_FILTERS >diff -r 043c2418864b -r 38ea9788d3e6 source/test/ipfilterharness.cpp >--- a/source/test/ipfilterharness.cpp Fri Mar 06 13:15:55 2015 -0600 >+++ b/source/test/ipfilterharness.cpp Mon Mar 09 11:09:50 2015 +0530 >@@ -523,7 +523,7 @@ > > checked(opt, pixel_test_buff[index] + i, rand_srcStride, > IPF_vec_output_s); > >- if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * >sizeof(pixel))) >+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * >sizeof(int16_t))) > return false; > > reportfail(); >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
