At 2015-03-04 14:39:13, [email protected] wrote: ># HG changeset patch ># User Rajesh [email protected]> ># Date 1425451088 -19800 ># Wed Mar 04 12:08:08 2015 +0530 ># Node ID 94991f753feae850b6edd371481e199f76243af3 ># Parent 018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc >asm-ssse3: filter_p2s[12x16](9.87x), filter_p2s[24x32](10.30x), > filter_p2s[48x64](9.60x) > @@ -5730,6 +5726,169 @@ > PIXEL_WH_64xN 64, 16 > PIXEL_WH_64xN 64, 32 > PIXEL_WH_64xN 64, 48 >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+INIT_XMM ssse3 >+cglobal pixelToShort_12x16, 3, 4, 6 >+ >+ ; load constant >+ mova m4, [pb_128] >+ mova m5, [tab_c_64_n64] >+ >+%rep 4 >+ >+ movu m0, [r0] >+ movu m1, m0 A copy between registers is always aligned.
>+ punpcklbw m1, m0, m4 You copied into m1 a moment ago; why overwrite it before using it? >+ punpckhbw m0, m4 >+ pmaddubsw m0, m5 >+ pmaddubsw m1, m5 >+ >+ movu m2, [r0 + r1] >+ movu m3, m2 >+ punpcklbw m3, m2, m4 >+ punpckhbw m2, m4 >+ pmaddubsw m2, m5 >+ pmaddubsw m3, m5 >+ >+ movu [r2 + FENC_STRIDE * 0], m1 >+ movu [r2 + FENC_STRIDE * 2], m3 >+ >+ movh [r2 + FENC_STRIDE * 0 + 16], m0 >+ movh [r2 + FENC_STRIDE * 2 + 16], m2 >+ >+ movu m0, [r0 + r1 * 2] >+ movu m1, m0 >+ punpcklbw m1, m0, m4 >+ punpckhbw m0, m4 >+ pmaddubsw m0, m5 >+ pmaddubsw m1, m5 >+ >+ lea r3, [r0 + r1 * 2] >+ movu m2, [r3 + r1] >+ movu m3, m2 >+ punpcklbw m3, m2, m4 >+ punpckhbw m2, m4 >+ pmaddubsw m2, m5 >+ pmaddubsw m3, m5 >+ >+ movu [r2 + FENC_STRIDE * 4], m1 >+ movu [r2 + FENC_STRIDE * 6], m3 >+ >+ movh [r2 + FENC_STRIDE * 4 + 16], m0 >+ movh [r2 + FENC_STRIDE * 6 + 16], m2 >+ >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+%endrep >+ RET >+ >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+INIT_XMM ssse3 >+cglobal pixelToShort_24x32, 3, 6, 6 >+ >+ ; load height >+ mov r3d, 32 >+ >+ ; load constant >+ mova m4, [pb_128] >+ mova m5, [tab_c_64_n64] >+ >+.loopH: >+ >+ xor r4d, r4d >+.loopW: >+ lea r5, [r0 + r4] >+ >+ movh m0, [r5] >+ punpcklbw m0, m4 >+ pmaddubsw m0, m5 >+ >+ movh m1, [r5 + r1] >+ punpcklbw m1, m4 >+ pmaddubsw m1, m5 >+ >+ movh m2, [r5 + r1 * 2] >+ punpcklbw m2, m4 >+ pmaddubsw m2, m5 >+ >+ lea r5, [r5 + r1 * 2] >+ movh m3, [r5 + r1] >+ punpcklbw m3, m4 >+ pmaddubsw m3, m5 >+ >+ add r4, 8 >+ cmp r4, 24 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3 >+ je .nextH >+ jnz .loopW >+ >+.nextH: >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+ >+ sub 
r3d, 4 >+ jnz .loopH >+ RET >+ >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+INIT_XMM ssse3 >+cglobal pixelToShort_48x64, 3, 6, 6 >+ >+ ; load height >+ mov r3d, 64 >+ >+ ; load constant >+ mova m4, [pb_128] >+ mova m5, [tab_c_64_n64] >+ >+.loopH: >+ >+ xor r4d, r4d >+.loopW: >+ lea r5, [r0 + r4] >+ >+ movh m0, [r5] >+ punpcklbw m0, m4 >+ pmaddubsw m0, m5 >+ >+ movh m1, [r5 + r1] >+ punpcklbw m1, m4 >+ pmaddubsw m1, m5 >+ >+ movh m2, [r5 + r1 * 2] >+ punpcklbw m2, m4 >+ pmaddubsw m2, m5 >+ >+ lea r5, [r5 + r1 * 2] >+ movh m3, [r5 + r1] >+ punpcklbw m3, m4 >+ pmaddubsw m3, m5 >+ >+ add r4, 8 >+ cmp r4, 48 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3 >+ je .nextH >+ jnz .loopW >+ >+.nextH: >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+ >+ sub r3d, 4 >+ jnz .loopH >+ RET > > %macro PROCESS_LUMA_W4_4R 0 > movd m0, [r0] >diff -r 018e8bbaa854 -r 94991f753fea source/common/x86/ipfilter8.h >--- a/source/common/x86/ipfilter8.h Fri Feb 27 11:46:09 2015 +0530 >+++ b/source/common/x86/ipfilter8.h Wed Mar 04 12:08:08 2015 +0530 >@@ -642,6 +642,10 @@ > void x265_pixelToShort_64x32_ssse3(const pixel* src, intptr_t srcStride, > int16_t* dst); > void x265_pixelToShort_64x48_ssse3(const pixel* src, intptr_t srcStride, > int16_t* dst); > void x265_pixelToShort_64x64_ssse3(const pixel* src, intptr_t srcStride, > int16_t* dst); >+void x265_pixelToShort_12x16_ssse3(const pixel* src, intptr_t srcStride, >int16_t* dst); >+void x265_pixelToShort_24x32_ssse3(const pixel* src, intptr_t srcStride, >int16_t* dst); >+void x265_pixelToShort_48x64_ssse3(const pixel* src, intptr_t srcStride, >int16_t* dst); >+ > #undef LUMA_FILTERS > #undef LUMA_SP_FILTERS > 
#undef LUMA_SS_FILTERS >diff -r 018e8bbaa854 -r 94991f753fea source/test/ipfilterharness.cpp >--- a/source/test/ipfilterharness.cpp Fri Feb 27 11:46:09 2015 +0530 >+++ b/source/test/ipfilterharness.cpp Wed Mar 04 12:08:08 2015 +0530 >@@ -523,7 +523,7 @@ > > checked(opt, pixel_test_buff[index] + i, rand_srcStride, > IPF_vec_output_s); > >- if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * >sizeof(pixel))) >+ if (memcmp(IPF_vec_output_s, IPF_C_output_s, TEST_BUF_SIZE * >sizeof(int16_t))) > return false; > > reportfail(); >_______________________________________________ >x265-devel mailing list >[email protected] >https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
