I confirmed that the encoder uses the 4x4 block size by testing with these two videos: NebutaFestival_2560x1600_60_10bit_crop.yuv and 720p50_parkrun_ter.y4m. Since there was no existing function with similar behavior, I wrote a new function definition for the 4x4 case.
On Tue, Dec 16, 2014 at 10:55 PM, chen <[email protected]> wrote: > > > > > At 2014-12-16 18:35:49,"Divya Manivannan" <[email protected]> wrote: > ># HG changeset patch > ># User Divya Manivannan <[email protected]> > ># Date 1418726099 -19800 > ># Tue Dec 16 16:04:59 2014 +0530 > ># Node ID de6f39b44c144aa56c68d27d6ee201e7dd493755 > ># Parent 775ebb4694ad7931a98b796640bf646085659ea2 > >asm: added psy_acEnergy_pp_4x4 in sse4 for psyCost_pp > > > >diff -r 775ebb4694ad -r de6f39b44c14 source/common/pixel.cpp > >--- a/source/common/pixel.cpp Tue Dec 16 09:40:00 2014 +0530 > >+++ b/source/common/pixel.cpp Tue Dec 16 16:04:59 2014 +0530 > >@@ -795,8 +795,18 @@ > > else > > { > > /* 4x4 is too small for sa8d */ > >- int sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, > >4>(source, sstride, zeroBuf, 0) >> 2); > >- int reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, > >4>(recon, rstride, zeroBuf, 0) >> 2); > >+ int sourceEnergy, reconEnergy; > >+ if (!HIGH_BIT_DEPTH) // once HBD asm code is developed, if > >condition will go away > > #if > > > > >+ { > >+ sourceEnergy = primitives.psy_acEnergy_pp(source, sstride); > >+ reconEnergy = primitives.psy_acEnergy_pp(recon, rstride); > >+ } > >+ else > >+ { > >+ //original code; > >+ sourceEnergy = satd_4x4(source, sstride, zeroBuf, 0) - (sad<4, > >4>(source, sstride, zeroBuf, 0) >> 2); > >+ reconEnergy = satd_4x4(recon, rstride, zeroBuf, 0) - (sad<4, > >4>(recon, rstride, zeroBuf, 0) >> 2); > > reference code may put into primitives C model > > >+ } > > return abs(sourceEnergy - reconEnergy); > > } > > } > >diff -r 775ebb4694ad -r de6f39b44c14 source/common/primitives.h > >--- a/source/common/primitives.h Tue Dec 16 09:40:00 2014 +0530 > >+++ b/source/common/primitives.h Tue Dec 16 16:04:59 2014 +0530 > >@@ -195,6 +195,7 @@ > > typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, > > pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t > > mask); > > > > 
typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* > > propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const > > int32_t* invQscales, const double* fpsFactor, int len); > >+typedef int(*psy_acEnergy_pp_t)(const pixel* pix, intptr_t stride); > > > > /* Define a structure containing function pointers to optimized encoder > > * primitives. Each pointer can reference either an assembly routine, > >@@ -213,6 +214,7 @@ > > pixelcmp_t sa8d[NUM_SQUARE_BLOCKS]; // sa8d > > primitives for square intra blocks > > pixelcmp_t psy_cost_pp[NUM_SQUARE_BLOCKS]; // difference in > > AC energy between two blocks > > pixelcmp_ss_t psy_cost_ss[NUM_SQUARE_BLOCKS]; > >+ psy_acEnergy_pp_t psy_acEnergy_pp; > > > > dct_t dct[NUM_DCTS]; > > idct_t idct[NUM_IDCTS]; > >diff -r 775ebb4694ad -r de6f39b44c14 source/common/x86/asm-primitives.cpp > >--- a/source/common/x86/asm-primitives.cpp Tue Dec 16 09:40:00 2014 +0530 > >+++ b/source/common/x86/asm-primitives.cpp Tue Dec 16 16:04:59 2014 +0530 > >@@ -1898,6 +1898,9 @@ > > p.chroma[X265_CSP_I420].filter_vpp[CHROMA_16x16] = > > x265_interp_4tap_vert_pp_16x16_avx2; > > p.chroma[X265_CSP_I420].filter_vpp[CHROMA_32x32] = > > x265_interp_4tap_vert_pp_32x32_avx2; > > #endif > >+ > >+ p.psy_acEnergy_pp = x265_psy_acEnergy_pp_4x4_sse4; > >+ > > } > > #endif // if HIGH_BIT_DEPTH > > } > >diff -r 775ebb4694ad -r de6f39b44c14 source/common/x86/pixel-a.asm > >--- a/source/common/x86/pixel-a.asm Tue Dec 16 09:40:00 2014 +0530 > >+++ b/source/common/x86/pixel-a.asm Tue Dec 16 16:04:59 2014 +0530 > >@@ -6579,3 +6579,35 @@ > > mov [r2], r3w > > .end: > > RET > >+ > >+;--------------------------------------------------------------------------------------------------------------------- > >+;int psy_acEnergy_pp(const pixel* source, intptr_t sstride) > >+;--------------------------------------------------------------------------------------------------------------------- > >+INIT_XMM sse4 > >+cglobal psy_acEnergy_pp_4x4, 2, 3, 6 > 
>+ > >+ lea r2, [3 * r1] > >+ movd m0, [r0] > >+ movd m1, [r0 + r1] > >+ movd m2, [r0 + r1 * 2] > >+ movd m3, [r0 + r2] > >+ shufps m0, m1, 0 > > overwrite m0 with m1 lowest 32-bits? > > the compute logic is wrong below > > > > >+ shufps m2, m3, 0 > >+ mova m4, [hmul_4p] > >+ pmaddubsw m0, m4 > >+ pmaddubsw m2, m4 > >+ > >+ paddw m5, m0, m2 > >+ movhlps m4, m5 > >+ paddw m5, m4 > >+ phaddw m5, m5 > > pmaddwd may replace this phaddw to avoid Port5, just another way, need > analyze output object code. > > > > >+ pmovzxwd m5, m5 > > reduce by above pmaddwd > > > > >+ psrld m5, 2 > >+ > >+ HADAMARD 0, sumsub, 0, 2, 1, 3 > >+ HADAMARD 4, sumsub, 0, 2, 1, 3 > >+ HADAMARD 1, amax, 0, 2, 1, 3 > >+ HADDW m0, m2 > >+ psubd m0, m5 > >+ movd eax, m0 > >+ RET > > > _______________________________________________ > x265-devel mailing list > [email protected] > https://mailman.videolan.org/listinfo/x265-devel > >
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
