Please ignore the previous email; below is the most recent updated version of the patch.
# HG changeset patch # User Ramya Sriraman <[email protected]> # Date 1443592336 -19800 # Wed Sep 30 11:22:16 2015 +0530 # Node ID 73b301b038c84d7520337c1097d5e2307766a9e4 # Parent 6e7761bdfe23addb862483f8407b388800de7d92 asm: Add sse_ss for [16x16],[32x32] & [64x64] for 8bpp avx2 diff -r 6e7761bdfe23 -r 73b301b038c8 source/common/x86/asm-primitives.cpp --- a/source/common/x86/asm-primitives.cpp Wed Sep 30 14:57:15 2015 +0530 +++ b/source/common/x86/asm-primitives.cpp Wed Sep 30 11:22:16 2015 +0530 @@ -2677,6 +2677,10 @@ #if X86_64 if (cpuMask & X265_CPU_AVX2) { + p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx2); + p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx2); + p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx2); + p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2); p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2); p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2); diff -r 6e7761bdfe23 -r 73b301b038c8 source/common/x86/ssd-a.asm --- a/source/common/x86/ssd-a.asm Wed Sep 30 14:57:15 2015 +0530 +++ b/source/common/x86/ssd-a.asm Wed Sep 30 11:22:16 2015 +0530 @@ -1016,8 +1016,171 @@ SSD_SS_32xN SSD_SS_48 SSD_SS_64xN + +INIT_YMM avx2 +cglobal pixel_ssd_ss_16x16, 4,4,3 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m1 + paddd m2, m0 + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m1 + paddd m2, m0 + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m1 + paddd m2, m0 + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + 
pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m1 + paddd m2, m0 + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m1 + paddd m2, m0 + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m1 + paddd m2, m0 + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m1 + paddd m2, m0 + + movu m0, [r0] + movu m1, [r0 + r1] + psubw m0, [r2] + psubw m1, [r2 + r3] + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m1 + paddd m2, m0 + + HADDD m2,m0 + movd eax, xm2 + RET + +INIT_YMM avx2 +cglobal pixel_ssd_ss_32x32, 4,5,3 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + mov r4d, 16 +.loop: + movu m0, [r0] + movu m1, [r0 + mmsize] + psubw m0, [r2] + psubw m1, [r2 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m2, m1 + movu m0, [r0 + r1] + movu m1, [r0 + r1 + mmsize] + psubw m0, [r2 + r3] + psubw m1, [r2 + r3 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m2, m1 + lea r0, [r0 + 2 * r1] + lea r2, [r2 + 2 * r3] + dec r4d + jne .loop + + HADDD m2,m0 + movd eax, xm2 + RET +INIT_YMM avx2 +cglobal pixel_ssd_ss_64x64, 4,5,3 + add r1d, r1d + add r3d, r3d + pxor m2, m2 + mov r4d,64 +.loop: + movu m0, [r0] + movu m1, [r0 + mmsize] + psubw m0, [r2] + psubw m1, [r2 + mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m2, m1 + movu m0, [r0 + 2 * mmsize] + movu m1, [r0 + 3 * mmsize] + psubw m0, [r2 + 2 * mmsize] + psubw m1, [r2 + 3 * mmsize] + pmaddwd m0, m0 + pmaddwd m1, m1 + paddd m2, m0 + paddd m2, m1 + + add r0, r1 + add r2, r3 + + dec r4d + jne .loop + + HADDD m2,m0 + movd eax, xm2 + RET + %endif ; !HIGH_BIT_DEPTH - %if HIGH_BIT_DEPTH == 0 %macro 
SSD_LOAD_FULL 5 movu m1, [t0+%1] Thank you Regards Ramya On Thu, Oct 1, 2015 at 5:27 PM, Ramya Sriraman <[email protected]> wrote: > # HG changeset patch > # User Ramya Sriraman <[email protected]> > # Date 1443592336 -19800 > # Wed Sep 30 11:22:16 2015 +0530 > # Node ID f56066fbfc4de2deb969d65efdb9045f37681808 > # Parent 6e7761bdfe23addb862483f8407b388800de7d92 > asm: Add sse_ss for [16x16],[32x32] & [64x64] for 8bpp avx2 > > diff -r 6e7761bdfe23 -r f56066fbfc4d source/common/x86/asm-primitives.cpp > --- a/source/common/x86/asm-primitives.cpp Wed Sep 30 14:57:15 2015 +0530 > +++ b/source/common/x86/asm-primitives.cpp Wed Sep 30 11:22:16 2015 +0530 > @@ -2677,6 +2677,10 @@ > #if X86_64 > if (cpuMask & X265_CPU_AVX2) > { > + p.cu[BLOCK_16x16].sse_ss = > (pixel_sse_ss_t)PFX(pixel_ssd_ss_16x16_avx2); > + p.cu[BLOCK_32x32].sse_ss = > (pixel_sse_ss_t)PFX(pixel_ssd_ss_32x32_avx2); > + p.cu[BLOCK_64x64].sse_ss = > (pixel_sse_ss_t)PFX(pixel_ssd_ss_64x64_avx2); > + > p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx2); > p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_avx2); > p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_avx2); > diff -r 6e7761bdfe23 -r f56066fbfc4d source/common/x86/ssd-a.asm > --- a/source/common/x86/ssd-a.asm Wed Sep 30 14:57:15 2015 +0530 > +++ b/source/common/x86/ssd-a.asm Wed Sep 30 11:22:16 2015 +0530 > @@ -1016,8 +1016,175 @@ > SSD_SS_32xN > SSD_SS_48 > SSD_SS_64xN > + > +INIT_YMM avx2 > +cglobal pixel_ssd_ss_16x16, 4,4,5 > + add r1d, r1d > + add r3d, r3d > + pxor m4, m4 > + > + movu m0, [r0] > + movu m1, [r0+r1] > + psubw m0, [r2] > + psubw m1, [r2+r3] > + lea r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m0 , m1 > + paddd m4, m0 > + > + movu m0, [r0] > + movu m1, [r0+r1] > + psubw m0, [r2] > + psubw m1, [r2+r3] > + lea r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m0, m1 > + paddd m4, m0 > + > + movu m0, [r0] > + movu m1, [r0+r1] > + psubw m0, [r2] > + psubw m1, [r2+r3] > + lea 
r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m0, m1 > + paddd m4, m0 > + > + movu m0, [r0] > + movu m1, [r0+r1] > + psubw m0, [r2] > + psubw m1, [r2+r3] > + lea r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m0, m1 > + paddd m4, m0 > + > + movu m0, [r0] > + movu m1, [r0+r1] > + psubw m0, [r2] > + psubw m1, [r2+r3] > + lea r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m0, m1 > + paddd m4, m0 > + > + movu m0, [r0] > + movu m1, [r0+r1] > + psubw m0, [r2] > + psubw m1, [r2+r3] > + lea r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m0, m1 > + paddd m4, m0 > + > + movu m0, [r0] > + movu m1, [r0+r1] > + psubw m0, [r2] > + psubw m1, [r2+r3] > + lea r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m0, m1 > + paddd m4, m0 > + > + movu m0, [r0] > + movu m1, [r0+r1] > + psubw m0, [r2] > + psubw m1, [r2+r3] > + lea r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m0, m1 > + paddd m4, m0 > + > + HADDD m4,m0 > + movd eax, xm4 > + RET > + > +INIT_YMM avx2 > +cglobal pixel_ssd_ss_32x32, 4,5,5 > + add r1d, r1d > + add r3d, r3d > + pxor m4, m4 > + mov r4d, 16 > +.loop: > + movu m0, [r0] > + movu m1, [r0+mmsize] > + psubw m0, [r2] > + psubw m1, [r2+mmsize] > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m4, m0 > + paddd m4, m1 > + movu m0, [r0+r1] > + movu m1, [r0+r1+mmsize] > + movu m2, [r2+r3] > + movu m3, [r2+r3+mmsize] > + psubw m0, m2 > + psubw m1, m3 > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m4, m0 > + paddd m4, m1 > + lea r0, [r0+2*r1] > + lea r2, [r2+2*r3] > + dec r4d > + jne .loop > + > + HADDD m4,m0 > + movd eax, xm4 > + RET > +INIT_YMM avx2 > +cglobal pixel_ssd_ss_64x64, 4,5,5 > + add r1d, r1d > + add r3d, r3d > + pxor m4, m4 > + mov r4d,64 > +.loop: > + movu m0, [r0] > + movu m1, [r0+mmsize] > + psubw m0, [r2] > + psubw m1, [r2+mmsize] > + pmaddwd 
m0, m0 > + pmaddwd m1, m1 > + paddd m4, m0 > + paddd m4, m1 > + movu m0, [r0+2*mmsize] > + movu m1, [r0+3*mmsize] > + movu m2, [r2+2*mmsize] > + movu m3, [r2+3*mmsize] > + psubw m0, m2 > + psubw m1, m3 > + pmaddwd m0, m0 > + pmaddwd m1, m1 > + paddd m4, m0 > + paddd m4, m1 > + > + add r0, r1 > + add r2, r3 > + > + dec r4d > + jne .loop > + > + HADDD m4,m0 > + movd eax, xm4 > + RET > + > %endif ; !HIGH_BIT_DEPTH > - > %if HIGH_BIT_DEPTH == 0 > %macro SSD_LOAD_FULL 5 > movu m1, [t0+%1] > > > > Thank you > Regards > Ramya > > On Wed, Sep 30, 2015 at 8:29 PM, chen <[email protected]> wrote: > >> >> >> At 2015-09-30 13:53:18,[email protected] wrote: >> ># HG changeset patch >> ># User Ramya Sriraman [email protected]> >> ># Date 1443592336 -19800 >> ># Wed Sep 30 11:22:16 2015 +0530 >> ># Node ID 29b61906162c657da241aecee9012e3f2da34b6d >> ># Parent 5f1451e5842252b31442e8b6519138d8033bbb2b >> >asm: Add sse_ss for [16x16],[32x32] & [64x64] for 8bpp avx2 >> > <[email protected]%3E%3E#%C2%A0Date%C2%A01443592336%C2%A0-19800%3E%23%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0%C2%A0Wed%C2%A0Sep%C2%A030%C2%A011:22:16%C2%A02015%C2%A0+0530%3E%23%C2%A0Node%C2%A0ID%C2%A029b61906162c657da241aecee9012e3f2da34b6d%3E%23%C2%A0Parent%C2%A0%C2%A05f1451e5842252b31442e8b6519138d8033bbb2b%3Easm:%C2%A0Add%C2%A0sse_ss%C2%A0for%C2%A0[16x16],[32x32]%C2%A0&%C2%A0[64x64]%C2%A0for%C2%A08bpp%C2%A0avx2%3E>diff >> > -r 5f1451e58422 -r 29b61906162c source/common/x86/ssd-a.asm >> >--- a/source/common/x86/ssd-a.asm Mon Sep 28 16:43:47 2015 +0530 >> >+++ b/source/common/x86/ssd-a.asm Wed Sep 30 11:22:16 2015 +0530 >> >@@ -1100,8 +1100,195 @@ >> > SSD_SS_32xN >> > SSD_SS_48 >> > SSD_SS_64xN >> >+ >> >+INIT_YMM avx2 >> >+cglobal pixel_ssd_ss_16x16, 4,4,5 >> >+ add r1d, r1d >> >+ add r3d, r3d >> >+ pxor m4, m4 >> >+ >> >+ movu m0, [r0] >> >+ movu m1, [r0+r1] >> >+ movu m2, [r2] >> >+ movu m3, [r2+r3] >> >+ psubw m0, m2 >> in avx2, vpsubw can work on unaligned address >> >> >+ psubw m1, m3 >> >+ lea r0, [r0+2*r1] >> >+ lea 
r2, [r2+2*r3] >> >+ pmaddwd m0, m0 >> >+ pmaddwd m1, m1 >> >+ paddd m0 , m1 >> >+ paddd m4, m0 >> >> >> >> _______________________________________________ >> x265-devel mailing list >> [email protected] >> https://mailman.videolan.org/listinfo/x265-devel >> >> >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
