Changes done! Moving the paddq instructions out of the called function into the final stage gives more performance gain than just using fewer registers.
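For context on what the patch below fixes: sse_ss is the sum of squared differences over two int16_t blocks. A minimal scalar sketch of the semantics (the signature and names here are illustrative, not the exact x265 pixel_sse_ss_t prototype):

#include <cstdint>

// Scalar reference for an sse_ss-style primitive: sum of squared differences
// between two int16_t blocks, strides given in elements. Illustrative only.
static uint64_t ssd_ss_ref(const int16_t* a, intptr_t strideA,
                           const int16_t* b, intptr_t strideB,
                           int width, int height)
{
    uint64_t sum = 0;
    for (int y = 0; y < height; y++)
    {
        for (int x = 0; x < width; x++)
        {
            int d = a[x] - b[x];
            sum += (int64_t)d * d;   // widen before accumulating
        }
        a += strideA;
        b += strideB;
    }
    return sum;
}

The widening is the whole point of the patch. Assuming the inputs stay within the 12-bit sample range (|d| <= 4095), one pmaddwd lane result is at most 2 * 4095^2, about 2^25. In the asm below, each dword lane of the accumulator collects 8 such results per row over 4 rows, at most 64 * 4095^2 ~= 1.07e9, which still fits in 32 bits; a full 64 rows would reach roughly 1.7e10 per lane, which does not. Hence the dword-to-qword widening after every 4-row call, and why the old code was restricted to X265_DEPTH <= 10.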
# HG changeset patch
# User Ramya Sriraman <[email protected]>
# Date 1443438827 -19800
#      Mon Sep 28 16:43:47 2015 +0530
# Node ID 5f1451e5842252b31442e8b6519138d8033bbb2b
# Parent  69440d394ec2682702cb1fe5479fb1ff0babf69d
asm: fix sse_ss[64x64] sse2 12bit

diff -r 69440d394ec2 -r 5f1451e58422 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Sep 28 16:13:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Sep 28 16:43:47 2015 +0530
@@ -1006,10 +1006,11 @@
         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
-#if X265_DEPTH <= 10
-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
-#endif
+        p.cu[BLOCK_4x4].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_4x4_mmx2);
+        p.cu[BLOCK_8x8].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_8x8_sse2);
+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_sse2);
+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_sse2);
+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_sse2);
         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
         p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
diff -r 69440d394ec2 -r 5f1451e58422 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Mon Sep 28 16:13:55 2015 +0530
+++ b/source/common/x86/ssd-a.asm	Mon Sep 28 16:43:47 2015 +0530
@@ -183,6 +183,89 @@
     RET
 %endmacro
 
+;Function to find ssd for a 64x4 block, sse2, 12 bit depth
+;Defined separately to be called from the SSD_ONE_SS_64 macro
+INIT_XMM sse2
+cglobal ssd_ss_64x4
+    pxor    m4, m4
+    mov     r4d, 4
+.loop:
+    ;----process 1st half of a row----
+    movu    m0, [r0]
+    movu    m1, [r0 + mmsize]
+    movu    m2, [r2]
+    movu    m3, [r2 + mmsize]
+    psubw   m0, m2
+    psubw   m1, m3
+    pmaddwd m0, m0
+    pmaddwd m1, m1
+    paddd   m4, m0
+    paddd   m4, m1
+    movu    m0, [r0 + 2 * mmsize]
+    movu    m1, [r0 + 3 * mmsize]
+    movu    m2, [r2 + 2 * mmsize]
+    movu    m3, [r2 + 3 * mmsize]
+    psubw   m0, m2
+    psubw   m1, m3
+    pmaddwd m0, m0
+    pmaddwd m1, m1
+    paddd   m4, m0
+    paddd   m4, m1
+    ;----process 2nd half of a row----
+    movu    m0, [r0 + 4 * mmsize]
+    movu    m1, [r0 + 5 * mmsize]
+    movu    m2, [r2 + 4 * mmsize]
+    movu    m3, [r2 + 5 * mmsize]
+    psubw   m0, m2
+    psubw   m1, m3
+    pmaddwd m0, m0
+    pmaddwd m1, m1
+    paddd   m4, m0
+    paddd   m4, m1
+    movu    m0, [r0 + 6 * mmsize]
+    movu    m1, [r0 + 7 * mmsize]
+    movu    m2, [r2 + 6 * mmsize]
+    movu    m3, [r2 + 7 * mmsize]
+    psubw   m0, m2
+    psubw   m1, m3
+    pmaddwd m0, m0
+    pmaddwd m1, m1
+    paddd   m4, m0
+    paddd   m4, m1
+
+    add     r0, r1
+    add     r2, r3
+    dec     r4d
+    jnz     .loop
+
+    mova    m0, m4
+    pxor    m1, m1
+    punpckldq m0, m1
+    punpckhdq m4, m1
+    paddq   m5, m0
+    paddq   m6, m4
+
+    ret
+%macro SSD_ONE_SS_64 0
+cglobal pixel_ssd_ss_64x64, 4,6,7
+    add     r1d, r1d
+    add     r3d, r3d
+    xor     r4, r4
+    pxor    m5, m5
+    pxor    m6, m6
+    mov     r5d, 16
+.iterate:
+    call    ssd_ss_64x4
+    dec     r5
+    jne     .iterate
+
+    paddq   m5, m6
+    movhlps m2, m5
+    paddq   m5, m2
+    movq    rax, m5
+    RET
+%endmacro
+
 %macro SSD_TWO 2
 cglobal pixel_ssd_ss_%1x%2, 4,7,8
     FIX_STRIDES r1, r3
@@ -529,15 +612,16 @@
 
 %if BIT_DEPTH <= 10
 SSD_ONE 32, 64
+SSD_TWO 64, 64
 %else
 SSD_ONE_32
+SSD_ONE_SS_64
 %endif
 SSD_TWO 48, 64
 SSD_TWO 64, 16
 SSD_TWO 64, 32
 SSD_TWO 64, 48
-SSD_TWO 64, 64
 INIT_YMM avx2
 SSD_ONE 16, 8
 SSD_ONE 16, 16
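The asm maps fairly directly onto SSE2 intrinsics. A sketch of the same structure, illustrative rather than the generated code: dword sums per 4-row group (m4 in the asm), widened into two qword accumulators per group (m5/m6), and a single cross-lane fold at the end (the SSD_ONE_SS_64 tail). Strides are in elements here, while the asm doubles the byte strides up front, and _mm_cvtsi128_si64 assumes x86-64 just like the movq rax:

#include <emmintrin.h>
#include <cstdint>

// Sketch of the patch's accumulation strategy for one 64x64 int16_t block.
static uint64_t ssd_ss_64x64_sketch(const int16_t* a, intptr_t strideA,
                                    const int16_t* b, intptr_t strideB)
{
    const __m128i zero = _mm_setzero_si128();
    __m128i accLo = zero, accHi = zero;            // qword accumulators (m5, m6)

    for (int group = 0; group < 16; group++)       // 16 calls of 4 rows each
    {
        __m128i acc32 = zero;                      // dword accumulator (m4)
        for (int row = 0; row < 4; row++)
        {
            for (int i = 0; i < 8; i++)            // 8 vectors x 8 words = 64 cols
            {
                __m128i va = _mm_loadu_si128((const __m128i*)(a + 8 * i));
                __m128i vb = _mm_loadu_si128((const __m128i*)(b + 8 * i));
                __m128i d  = _mm_sub_epi16(va, vb);
                acc32 = _mm_add_epi32(acc32, _mm_madd_epi16(d, d)); // pmaddwd
            }
            a += strideA;
            b += strideB;
        }
        // punpckldq/punpckhdq with zero + paddq: widen before a lane can overflow
        accLo = _mm_add_epi64(accLo, _mm_unpacklo_epi32(acc32, zero));
        accHi = _mm_add_epi64(accHi, _mm_unpackhi_epi32(acc32, zero));
    }
    // final stage: paddq + movhlps-style fold, exactly once for the whole block
    __m128i acc = _mm_add_epi64(accLo, accHi);
    acc = _mm_add_epi64(acc, _mm_srli_si128(acc, 8));
    return (uint64_t)_mm_cvtsi128_si64(acc);
}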
Thank you
Regards
Ramya

On Mon, Sep 28, 2015 at 8:40 PM, chen <[email protected]> wrote:
> code is right, just some improvements suggested below
>
>
> At 2015-09-28 19:14:31, [email protected] wrote:
> ># HG changeset patch
> ># User Ramya Sriraman <[email protected]>
> ># Date 1443438827 -19800
> >#      Mon Sep 28 16:43:47 2015 +0530
> ># Node ID f8d4155a5a6af75bec1ca487213045442c0a38bc
> ># Parent  69440d394ec2682702cb1fe5479fb1ff0babf69d
> >asm: fix sse_ss[64x64] sse2 12bit
> >
> >diff -r 69440d394ec2 -r f8d4155a5a6a source/common/x86/asm-primitives.cpp
> >--- a/source/common/x86/asm-primitives.cpp	Mon Sep 28 16:13:55 2015 +0530
> >+++ b/source/common/x86/asm-primitives.cpp	Mon Sep 28 16:43:47 2015 +0530
> >@@ -1006,10 +1006,11 @@
> >         p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
> >         p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
> >         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
> >-#if X265_DEPTH <= 10
> >-        p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
> >-        ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
> >-#endif
> >+        p.cu[BLOCK_4x4].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_4x4_mmx2);
> >+        p.cu[BLOCK_8x8].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_8x8_sse2);
> >+        p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_sse2);
> >+        p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_sse2);
> >+        p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_sse2);
> >         p.cu[BLOCK_4x4].dct = PFX(dct4_sse2);
> >         p.cu[BLOCK_8x8].dct = PFX(dct8_sse2);
> >         p.cu[BLOCK_4x4].idct = PFX(idct4_sse2);
> >diff -r 69440d394ec2 -r f8d4155a5a6a source/common/x86/ssd-a.asm
> >--- a/source/common/x86/ssd-a.asm	Mon Sep 28 16:13:55 2015 +0530
> >+++ b/source/common/x86/ssd-a.asm	Mon Sep 28 16:43:47 2015 +0530
> >@@ -183,6 +183,84 @@
> >     RET
> > %endmacro
> >
> >+;Function to find ssd for 4x64 block, sse2, 12 bit depth
> >+;Defined sepeartely to be called from SSD_ONE_SS_64 macro
> >+INIT_XMM sse2
> >+cglobal ssd_ss_4x64
>
> We use the function name as Width x Height; this means 4 pixels and 64
> rows, which doesn't match the code below.
>
> >+    pxor    m8, m8
> >+    mov     r4d, 4
> >+.loop:
> >+    ;----process 1st half a row----
> >+    movu    m0, [r0]
> >+    movu    m1, [r0 + mmsize]
> >+    movu    m2, [r0 + 2 * mmsize]
> >+    movu    m3, [r0 + 3 * mmsize]
> >+    movu    m4, [r2]
> >+    movu    m5, [r2 + mmsize]
> >+    movu    m6, [r2 + 2 * mmsize]
> >+    movu    m7, [r2 + 3 * mmsize]
>
> splitting into a 2-row format can reduce the registers by half, e.g.
> (m0,m1) - (m2,m3)
>
> >+    psubw   m0, m4
> >+    psubw   m1, m5
> >+    psubw   m2, m6
> >+    psubw   m3, m7
> >+    pmaddwd m0, m0
> >+    pmaddwd m1, m1
> >+    pmaddwd m2, m2
> >+    pmaddwd m3, m3
> >+    paddd   m2, m3
> >+    paddd   m0, m1
> >+    paddd   m0, m2
> >+    paddd   m8, m0
> >+    ;----process 2nd half a row----
> >+    movu    m0, [r0 + 4 * mmsize]
> >+    movu    m1, [r0 + 5 * mmsize]
> >+    movu    m2, [r0 + 6 * mmsize]
> >+    movu    m3, [r0 + 7 * mmsize]
> >+    movu    m4, [r2 + 4 * mmsize]
> >+    movu    m5, [r2 + 5 * mmsize]
> >+    movu    m6, [r2 + 6 * mmsize]
> >+    movu    m7, [r2 + 7 * mmsize]
> >+    psubw   m0, m4
> >+    psubw   m1, m5
> >+    psubw   m2, m6
> >+    psubw   m3, m7
> >+    pmaddwd m0, m0
> >+    pmaddwd m1, m1
> >+    pmaddwd m2, m2
> >+    pmaddwd m3, m3
> >+    paddd   m2, m3
> >+    paddd   m0, m1
> >+    paddd   m0, m2
> >+    paddd   m8, m0
> >+    add     r0, r1
> >+    add     r2, r3
> >+    dec     r4d
> >+    jnz     .loop
> >+
> >+    mova    m4, m8
> >+    pxor    m5, m5
> >+    punpckldq m8, m5
> >+    punpckhdq m4, m5
> >+    paddq   m4, m8
> >+    movhlps m5, m4
> >+    paddq   m4, m5
> >+    paddq   m9, m4
>
> for a single function, the above code summing into m9 is right,
>
> but inside the loop below it spends lots of time; we can do it in the
> final stage.
>
> >+    ret
> >+%macro SSD_ONE_SS_64 0
> >+cglobal pixel_ssd_ss_64x64, 4,7,10
> >+    add     r1d, r1d
> >+    add     r3d, r3d
> >+    xor     r4, r4
> >+    pxor    m9, m9
> >+    mov     r5d, 16
> >+.iterate:
> >+    call    ssd_ss_4x64
> >+    dec     r5
> >+    jne     .iterate
> >+    movq    rax, m9
> >+    RET
> >+%endmacro
>
> _______________________________________________
> x265-devel mailing list
> [email protected]
> https://mailman.videolan.org/listinfo/x265-devel
>
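To restate chen's two review points: (1) pairing each load with its subtraction halves the number of simultaneously live registers, and (2) the cross-lane reduction belongs in the final stage, after the 16-call loop, not inside the callee. On (1), an intrinsics sketch of the dependency shape only, since the compiler does the actual register allocation; a and b are hypothetical row pointers:

#include <emmintrin.h>
#include <cstdint>

// The "(m0,m1) - (m2,m3)" pairing: each (source, reference) pair is consumed
// right after it is loaded, so at most four vector values are live at once,
// instead of eight loads (m0..m7) all live before the first psubw.
static __m128i ssd_half_row(const int16_t* a, const int16_t* b)
{
    __m128i acc = _mm_setzero_si128();
    for (int i = 0; i < 4; i++)                    // 4 vectors = 32 columns
    {
        __m128i va = _mm_loadu_si128((const __m128i*)(a + 8 * i));
        __m128i vb = _mm_loadu_si128((const __m128i*)(b + 8 * i));
        __m128i d  = _mm_sub_epi16(va, vb);        // va, vb are dead after this
        acc = _mm_add_epi32(acc, _mm_madd_epi16(d, d));
    }
    return acc;                                    // four dword partial sums
}

Point (2) is what the revised patch above does: each ssd_ss_64x4 call only widens its dword sums into m5/m6 with two paddq, and the movhlps fold plus final paddq run exactly once before the movq rax.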
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel
