At 2015-03-05 20:07:40,[email protected] wrote: ># HG changeset patch ># User Rajesh Paulraj<[email protected]> ># Date 1425557077 -19800 ># Thu Mar 05 17:34:37 2015 +0530 ># Node ID cbac0290953fe5841760b4dfd387956c193df27c ># Parent 38cb572c2927eee8039464ec462b874c0da20871 >asm-sse2: 16bpp code for filter_p2s[4x4](2.67x), filter_p2s[4x8](3.12x), >filter_p2s[4x16](3.11x), filter_p2s[8x4](4.90x), filter_p2s[8x8](4.54x), >filter_p2s[8x16](5.68x), filter_p2s[8x32](6.28x), filter_p2s[16x4](7.98x), >filter_p2s[16x8](9.87x), filter_p2s[16x12](9.66x), filter_p2s[16x16](10.24x), >filter_p2s[16x32](10.38x), filter_p2s[16x64](10.36x), filter_p2s[32x8](8.06x), >filter_p2s[32x16](7.09x), filter_p2s[32x24](7.98x), filter_p2s[32x32](7.09x), >filter_p2s[32x64](8.11x), filter_p2s[64x16](8.03x), filter_p2s[64x32](7.11x), >filter_p2s[64x48](7.89x), filter_p2s[64x64](6.79x), filter_p2s[12x16](8.35x), >filter_p2s[24x32](10.31x), filter_p2s[48x64](6.89x) > >diff -r 38cb572c2927 -r cbac0290953f source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Thu Mar 05 17:19:57 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Thu Mar 05 17:34:37 2015 +0530 >@@ -855,7 +855,32 @@ > PIXEL_AVG_W4(mmx2); > LUMA_VAR(sse2); > >- p.luma_p2s = x265_luma_p2s_sse2; >+ p.pu[LUMA_4x4].filter_p2s = x265_pixelToShort_4x4_sse2; >+ p.pu[LUMA_4x8].filter_p2s = x265_pixelToShort_4x8_sse2; >+ p.pu[LUMA_4x16].filter_p2s = x265_pixelToShort_4x16_sse2; >+ p.pu[LUMA_8x4].filter_p2s = x265_pixelToShort_8x4_sse2; >+ p.pu[LUMA_8x8].filter_p2s = x265_pixelToShort_8x8_sse2; >+ p.pu[LUMA_8x16].filter_p2s = x265_pixelToShort_8x16_sse2; >+ p.pu[LUMA_8x32].filter_p2s = x265_pixelToShort_8x32_sse2; >+ p.pu[LUMA_16x4].filter_p2s = x265_pixelToShort_16x4_sse2; >+ p.pu[LUMA_16x8].filter_p2s = x265_pixelToShort_16x8_sse2; >+ p.pu[LUMA_16x12].filter_p2s = x265_pixelToShort_16x12_sse2; >+ p.pu[LUMA_16x16].filter_p2s = x265_pixelToShort_16x16_sse2; >+ p.pu[LUMA_16x32].filter_p2s = 
x265_pixelToShort_16x32_sse2; >+ p.pu[LUMA_16x64].filter_p2s = x265_pixelToShort_16x64_sse2; >+ p.pu[LUMA_32x8].filter_p2s = x265_pixelToShort_32x8_sse2; >+ p.pu[LUMA_32x16].filter_p2s = x265_pixelToShort_32x16_sse2; >+ p.pu[LUMA_32x24].filter_p2s = x265_pixelToShort_32x24_sse2; >+ p.pu[LUMA_32x32].filter_p2s = x265_pixelToShort_32x32_sse2; >+ p.pu[LUMA_32x64].filter_p2s = x265_pixelToShort_32x64_sse2; >+ p.pu[LUMA_64x16].filter_p2s = x265_pixelToShort_64x16_sse2; >+ p.pu[LUMA_64x32].filter_p2s = x265_pixelToShort_64x32_sse2; >+ p.pu[LUMA_64x48].filter_p2s = x265_pixelToShort_64x48_sse2; >+ p.pu[LUMA_64x64].filter_p2s = x265_pixelToShort_64x64_sse2; >+ p.pu[LUMA_12x16].filter_p2s = x265_pixelToShort_12x16_sse2; >+ p.pu[LUMA_24x32].filter_p2s = x265_pixelToShort_24x32_sse2; >+ p.pu[LUMA_48x64].filter_p2s = x265_pixelToShort_48x64_sse2; >+ > p.chroma[X265_CSP_I420].p2s = x265_chroma_p2s_sse2; > p.chroma[X265_CSP_I422].p2s = x265_chroma_p2s_sse2; > >diff -r 38cb572c2927 -r cbac0290953f source/common/x86/ipfilter16.asm >--- a/source/common/x86/ipfilter16.asm Thu Mar 05 17:19:57 2015 +0530 >+++ b/source/common/x86/ipfilter16.asm Thu Mar 05 17:34:37 2015 +0530 >@@ -3,6 +3,7 @@ > ;* > ;* Authors: Nabajit Deka <[email protected]> > ;* Murugan Vairavel <[email protected]> >+;* Rajesh Paulraj <[email protected]> > ;* > ;* This program is free software; you can redistribute it and/or modify > ;* it under the terms of the GNU General Public License as published by >@@ -5525,65 +5526,472 @@ > FILTER_VER_LUMA_SS 64, 16 > FILTER_VER_LUMA_SS 16, 64 > >-;-------------------------------------------------------------------------------------------------- >-; void filterConvertPelToShort(pixel *src, intptr_t srcStride, int16_t *dst, >int width, int height) >-;-------------------------------------------------------------------------------------------------- >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t 
srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+%macro P2S_H_4xN 1 > INIT_XMM sse2 >-cglobal luma_p2s, 3, 7, 5 >- >+cglobal pixelToShort_4x%1, 3, 5, 5 > add r1, r1 > >- ; load width and height >- mov r3d, r3m >- mov r4d, r4m >+ ; load height >+ mov r3d, %1 Load %1/4 here instead, so that we can use 'dec r3d' below.
> > ; load constant > mova m4, [tab_c_n8192] > > .loopH: > >- xor r5d, r5d >-.loopW: >- lea r6, [r0 + r5 * 2] >- >- movu m0, [r6] >+ movu m0, [r0] > psllw m0, 4 > paddw m0, m4 > >- movu m1, [r6 + r1] >+ movu m1, [r0 + r1] > psllw m1, 4 > paddw m1, m4 > >- movu m2, [r6 + r1 * 2] >+ movu m2, [r0 + r1 * 2] > psllw m2, 4 > paddw m2, m4 > >- lea r6, [r6 + r1 * 2] >- movu m3, [r6 + r1] >+ lea r4, [r0 + r1 * 2] >+ movu m3, [r4 + r1] > psllw m3, 4 > paddw m3, m4 > >- add r5, 8 >- cmp r5, r3 >- jg .width4 >- movu [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 >- movu [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 >- movu [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 >- movu [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 >+ movh [r2 + FENC_STRIDE * 0 ], m0 >+ movh [r2 + FENC_STRIDE * 2 ], m1 >+ movh [r2 + FENC_STRIDE * 4 ], m2 >+ movh [r2 + FENC_STRIDE * 6 ], m3 >+ >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+ >+ sub r3d, 4 >+ jnz .loopH >+ >+ RET >+%endmacro >+P2S_H_4xN 4 >+P2S_H_4xN 8 >+P2S_H_4xN 16 >+ >+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+%macro P2S_H_8xN 1 >+INIT_XMM sse2 >+cglobal pixelToShort_8x%1, 3, 5, 5 >+ >+ add r1, r1 >+ >+ ; load height >+ mov r3d, %1 >+ >+ ; load constant >+ mova m4, [tab_c_n8192] >+ >+.loopH: >+ >+ movu m0, [r0] >+ psllw m0, 4 >+ paddw m0, m4 >+ >+ movu m1, [r0 + r1] >+ psllw m1, 4 >+ paddw m1, m4 >+ >+ movu m2, [r0 + r1 * 2] >+ psllw m2, 4 >+ paddw m2, m4 >+ >+ lea r4, [r0 + r1 * 2] >+ movu m3, [r4 + r1] >+ psllw m3, 4 >+ paddw m3, m4 >+ >+ movu [r2 + FENC_STRIDE * 0], m0 >+ movu [r2 + FENC_STRIDE * 2], m1 >+ movu [r2 + FENC_STRIDE * 4], m2 >+ movu [r2 + FENC_STRIDE * 6], m3 >+ >+ lea r0, [r0 + r1 * 4] >+ add r2, FENC_STRIDE * 8 >+ >+ sub r3d, 4 >+ jnz .loopH >+ >+ RET >+%endmacro >+P2S_H_8xN 8 >+P2S_H_8xN 4 >+P2S_H_8xN 16 >+P2S_H_8xN 32 >+ 
>+;----------------------------------------------------------------------------- >+; void pixelToShort(pixel *src, intptr_t srcStride, int16_t *dst) >+;----------------------------------------------------------------------------- >+%macro P2S_H_16xN 1 >+INIT_XMM sse2 >+cglobal pixelToShort_16x%1, 3, 6, 5 >+ >+ add r1, r1 >+ >+ ; load height >+ mov r3d, %1 >+ >+ ; load constant >+ mova m4, [tab_c_n8192] >+ >+.loopH: >+ xor r4d, r4d >+.loopW: You process a 4x8 area in every loop iteration, which has poor cache performance; I suggest using 1xN or similar to get more performance. >+ lea r5, [r0 + r4 * 2] >+ >+ movu m0, [r5] >+ psllw m0, 4 >+ paddw m0, m4 >+ >+ movu m1, [r5 + r1] >+ psllw m1, 4 >+ paddw m1, m4 >+ >+ movu m2, [r5 + r1 * 2] >+ psllw m2, 4 >+ paddw m2, m4 >+ >+ lea r5, [r5 + r1 * 2] >+ movu m3, [r5 + r1] >+ psllw m3, 4 >+ paddw m3, m4 >+ >+ add r4, 8 >+ cmp r4, 16 >+ >+ movu [r2 + r4 * 2 + FENC_STRIDE * 0 - 16], m0 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 2 - 16], m1 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 4 - 16], m2 >+ movu [r2 + r4 * 2 + FENC_STRIDE * 6 - 16], m3 > je .nextH >- jmp .loopW >- >-.width4: >- movh [r2 + r5 * 2 + FENC_STRIDE * 0 - 16], m0 >- movh [r2 + r5 * 2 + FENC_STRIDE * 2 - 16], m1 >- movh [r2 + r5 * 2 + FENC_STRIDE * 4 - 16], m2 >- movh [r2 + r5 * 2 + FENC_STRIDE * 6 - 16], m3 >+ jnz .loopW > > .nextH: > lea r0, [r0 + r1 * 4] > add r2, FENC_STRIDE * 8 > >- sub r4d, 4 >+ sub r3d, 4 > jnz .loopH > > RET >+%endmacro >+P2S_H_16xN 4 >+P2S_H_16xN 8
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
