The code is correct; I just have a few improvement suggestions below.
At 2015-09-28 19:14:31,[email protected] wrote: ># HG changeset patch ># User Ramya Sriraman <[email protected]> ># Date 1443438827 -19800 ># Mon Sep 28 16:43:47 2015 +0530 ># Node ID f8d4155a5a6af75bec1ca487213045442c0a38bc ># Parent 69440d394ec2682702cb1fe5479fb1ff0babf69d >asm: fix sse_ss[64x64] sse2 12bit > >diff -r 69440d394ec2 -r f8d4155a5a6a source/common/x86/asm-primitives.cpp >--- a/source/common/x86/asm-primitives.cpp Mon Sep 28 16:13:55 2015 +0530 >+++ b/source/common/x86/asm-primitives.cpp Mon Sep 28 16:43:47 2015 +0530 >@@ -1006,10 +1006,11 @@ > p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = > (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2); > p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = > (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2); > p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = > (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2); >-#if X265_DEPTH <= 10 >- p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2); >- ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2); >-#endif >+ p.cu[BLOCK_4x4].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_4x4_mmx2); >+ p.cu[BLOCK_8x8].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_8x8_sse2); >+ p.cu[BLOCK_16x16].sse_ss = >(pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_sse2); >+ p.cu[BLOCK_32x32].sse_ss = >(pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_sse2); >+ p.cu[BLOCK_64x64].sse_ss = >(pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_sse2); > p.cu[BLOCK_4x4].dct = PFX(dct4_sse2); > p.cu[BLOCK_8x8].dct = PFX(dct8_sse2); > p.cu[BLOCK_4x4].idct = PFX(idct4_sse2); >diff -r 69440d394ec2 -r f8d4155a5a6a source/common/x86/ssd-a.asm >--- a/source/common/x86/ssd-a.asm Mon Sep 28 16:13:55 2015 +0530 >+++ b/source/common/x86/ssd-a.asm Mon Sep 28 16:43:47 2015 +0530 >@@ -183,6 +183,84 @@ > RET > %endmacro > >+;Function to find ssd for 4x64 block, sse2, 12 bit depth >+;Defined sepeartely to be called from SSD_ONE_SS_64 macro >+INIT_XMM sse2 >+cglobal ssd_ss_4x64 We name functions as Width x Height, so "4x64" means 4 pixels wide and 64 rows high; that does not match the code below, which processes 4 rows of 64 pixels each. >+ pxor m8, m8 
>+ mov r4d, 4 >+.loop: >+ ;----process 1st half a row---- >+ movu m0, [r0] >+ movu m1, [r0 + mmsize] >+ movu m2, [r0 + 2 * mmsize] >+ movu m3, [r0 + 3 * mmsize] >+ movu m4, [r2] >+ movu m5, [r2 + mmsize] >+ movu m6, [r2 + 2 * mmsize] >+ movu m7, [r2 + 3 * mmsize] Splitting this into a two-row format can halve the number of registers used, e.g. (m0,m1) - (m2,m3). >+ psubw m0, m4 >+ psubw m1, m5 >+ psubw m2, m6 >+ psubw m3, m7 >+ pmaddwd m0, m0 >+ pmaddwd m1, m1 >+ pmaddwd m2, m2 >+ pmaddwd m3, m3 >+ paddd m2, m3 >+ paddd m0, m1 >+ paddd m0, m2 >+ paddd m8, m0 >+ ;----process 2nd half a row---- >+ movu m0, [r0 + 4 * mmsize] >+ movu m1, [r0 + 5 * mmsize] >+ movu m2, [r0 + 6 * mmsize] >+ movu m3, [r0 + 7 * mmsize] >+ movu m4, [r2 + 4 * mmsize] >+ movu m5, [r2 + 5 * mmsize] >+ movu m6, [r2 + 6 * mmsize] >+ movu m7, [r2 + 7 * mmsize] >+ psubw m0, m4 >+ psubw m1, m5 >+ psubw m2, m6 >+ psubw m3, m7 >+ pmaddwd m0, m0 >+ pmaddwd m1, m1 >+ pmaddwd m2, m2 >+ pmaddwd m3, m3 >+ paddd m2, m3 >+ paddd m0, m1 >+ paddd m0, m2 >+ paddd m8, m0 >+ add r0, r1 >+ add r2, r3 >+ dec r4d >+ jnz .loop >+ >+ mova m4, m8 >+ pxor m5, m5 >+ punpckldq m8, m5 >+ punpckhdq m4, m5 >+ paddq m4, m8 >+ movhlps m5, m4 >+ paddq m4, m5 >+ paddq m9, m4 For a standalone function, the code above that sums into m9 is correct, but when it is called from the loop below it costs a lot of time on every call; we can do this reduction once, in the final stage, instead. >+ ret >+%macro SSD_ONE_SS_64 0 >+cglobal pixel_ssd_ss_64x64, 4,7,10 >+ add r1d, r1d >+ add r3d, r3d >+ xor r4, r4 >+ pxor m9, m9 >+ mov r5d, 16 >+.iterate: >+ call ssd_ss_4x64 >+ dec r5 >+ jne .iterate >+ movq rax, m9 >+ RET >+%endmacro
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
