I just checked, and ratecontrol.cpp uses var only for block sizes 8x8 and 16x16; all the other block sizes are unused.
We should probably define only square block sizes for this primitive. On Nov 25, 2013, at 2:07 PM, Steve Borho <st...@borho.org> wrote: > > On Nov 25, 2013, at 7:38 AM, muru...@multicorewareinc.com wrote: > >> # HG changeset patch >> # User Murugan Vairavel <muru...@multicorewareinc.com> >> # Date 1385386658 -19800 >> # Mon Nov 25 19:07:38 2013 +0530 >> # Node ID deb2fc2dcaf24a86132ebfe0fbaac4859611c92f >> # Parent 43da6ca15a61e18d033931ca58940d6794f6f8f8 >> asm: code for pixel_var_8xN > > I'm not sure the encoder uses any variance block measurements other than 8x8 > >> >> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/pixel.cpp >> --- a/source/common/pixel.cpp Mon Nov 25 18:46:28 2013 +0530 >> +++ b/source/common/pixel.cpp Mon Nov 25 19:07:38 2013 +0530 >> @@ -968,8 +968,11 @@ >> p.ssim_4x4x2_core = ssim_4x4x2_core; >> p.ssim_end_4 = ssim_end_4; >> >> - p.var[LUMA_16x16] = pixel_var<16, 16>; >> + p.var[LUMA_8x4] = pixel_var<8, 4>; >> p.var[LUMA_8x8] = pixel_var<8, 8>; >> + p.var[LUMA_8x16] = pixel_var<8, 16>; >> + p.var[LUMA_8x32] = pixel_var<8, 32>; >> + >> p.plane_copy_deinterleave_c = plane_copy_deinterleave_chroma; >> } >> } >> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/asm-primitives.cpp >> --- a/source/common/x86/asm-primitives.cpp Mon Nov 25 18:46:28 2013 +0530 >> +++ b/source/common/x86/asm-primitives.cpp Mon Nov 25 19:07:38 2013 +0530 >> @@ -412,6 +412,15 @@ >> SETUP_LUMA_BLOCKCOPY_FUNC_DEF(64, 16, cpu); \ >> SETUP_LUMA_BLOCKCOPY_FUNC_DEF(16, 64, cpu); >> >> +#define SETUP_PIXEL_VAR_DEF(W, H, cpu) \ >> + p.var[LUMA_ ## W ## x ## H] = x265_pixel_var_ ## W ## x ## H ## cpu; >> + >> +#define LUMA_VAR(cpu) \ >> + SETUP_PIXEL_VAR_DEF(8, 4, cpu); \ >> + SETUP_PIXEL_VAR_DEF(8, 8, cpu); \ >> + SETUP_PIXEL_VAR_DEF(8, 16, cpu); \ >> + SETUP_PIXEL_VAR_DEF(8, 32, cpu); >> + >> namespace x265 { >> // private x265 namespace >> >> @@ -442,6 +451,8 @@ >> PIXEL_AVG(sse2); >> PIXEL_AVG_W4(mmx2); >> >> + LUMA_VAR(_sse2); >> + >> p.sad[LUMA_8x32] = 
x265_pixel_sad_8x32_sse2; >> p.sad[LUMA_16x4] = x265_pixel_sad_16x4_sse2; >> p.sad[LUMA_16x12] = x265_pixel_sad_16x12_sse2; >> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel-a.asm >> --- a/source/common/x86/pixel-a.asm Mon Nov 25 18:46:28 2013 +0530 >> +++ b/source/common/x86/pixel-a.asm Mon Nov 25 19:07:38 2013 +0530 >> @@ -1301,6 +1301,106 @@ >> >> %if HIGH_BIT_DEPTH == 0 >> %macro VAR 0 >> +cglobal pixel_var_8x4, 2,3,8 >> + VAR_START 1 >> + lea r2, [r1 * 3] >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r2] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + VAR_END 8, 4 >> + >> +cglobal pixel_var_8x8, 2,3,8 >> + VAR_START 1 >> + lea r2, [r1 * 3] >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r2] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r2] >> + DEINTB 1, 0, 4, 3, 7 >> + VAR_CORE >> + VAR_END 8, 8 >> + >> + >> +cglobal pixel_var_8x16, 2,4,8 >> + VAR_START 1 >> + lea r2, [r1 * 3] >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r2] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r2] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r2] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r2] >> + DEINTB 1, 0, 4, 3, 7 >> + VAR_CORE >> + VAR_END 8, 16 >> + >> +cglobal pixel_var_8x32, 2,4,8 >> + VAR_START 1 >> + mov r2d, 2 >> + lea r3, [r1 * 3] >> +.loop: >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 
+ r3] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r3] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r3] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + movh m0, [r0] >> + movh m3, [r0 + r1] >> + movhps m0, [r0 + r1 * 2] >> + movhps m3, [r0 + r3] >> + DEINTB 1, 0, 4, 3, 7 >> + lea r0, [r0 + r1 * 4] >> + VAR_CORE >> + dec r2d >> + jnz .loop >> + VAR_END 8, 32 >> + >> cglobal pixel_var_16x16, 2,3,8 >> VAR_START 1 >> mov r2d, 8 >> @@ -1313,38 +1413,6 @@ >> dec r2d >> jg .loop >> VAR_END 16, 16 >> - >> -cglobal pixel_var_8x8, 2,4,8 >> - VAR_START 1 >> - mov r2d, 2 >> - lea r3, [r1*3] >> -.loop: >> - movh m0, [r0] >> - movh m3, [r0+r1] >> - movhps m0, [r0+r1*2] >> - movhps m3, [r0+r3] >> - DEINTB 1, 0, 4, 3, 7 >> - lea r0, [r0+r1*4] >> - VAR_CORE >> - dec r2d >> - jg .loop >> - VAR_END 8, 8 >> - >> -cglobal pixel_var_8x16, 2,4,8 >> - VAR_START 1 >> - mov r2d, 4 >> - lea r3, [r1*3] >> -.loop: >> - movh m0, [r0] >> - movh m3, [r0+r1] >> - movhps m0, [r0+r1*2] >> - movhps m3, [r0+r3] >> - DEINTB 1, 0, 4, 3, 7 >> - lea r0, [r0+r1*4] >> - VAR_CORE >> - dec r2d >> - jg .loop >> - VAR_END 8, 16 >> %endmacro ; VAR >> >> INIT_XMM sse2 >> diff -r 43da6ca15a61 -r deb2fc2dcaf2 source/common/x86/pixel.h >> --- a/source/common/x86/pixel.h Mon Nov 25 18:46:28 2013 +0530 >> +++ b/source/common/x86/pixel.h Mon Nov 25 19:07:38 2013 +0530 >> @@ -347,6 +347,17 @@ >> CHROMA_PIXELSUB_DEF(_sse4); >> LUMA_PIXELSUB_DEF(_sse4); >> >> +#define SETUP_LUMA_PIXELVAR_FUNC(W, H, cpu) \ >> + uint64_t x265_pixel_var_ ## W ## x ## H ## cpu(pixel *pix, intptr_t >> pixstride); >> + >> +#define LUMA_PIXELVAR_DEF(cpu) \ >> + SETUP_LUMA_PIXELVAR_FUNC(8, 4, cpu); \ >> + SETUP_LUMA_PIXELVAR_FUNC(8, 8, cpu); \ >> + SETUP_LUMA_PIXELVAR_FUNC(8, 16, cpu); \ >> + 
SETUP_LUMA_PIXELVAR_FUNC(8, 32, cpu); >> + >> +LUMA_PIXELVAR_DEF(_sse2); >> + >> #undef DECL_PIXELS >> #undef DECL_SUF >> #undef DECL_HEVC_SSD >> @@ -357,6 +368,8 @@ >> #undef SETUP_LUMA_PIXELSUB_PS_FUNC >> #undef CHROMA_PIXELSUB_DEF >> #undef LUMA_PIXELSUB_DEF >> +#undef LUMA_PIXELVAR_DEF >> +#undef SETUP_LUMA_PIXELVAR_FUNC >> >> void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, >> int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int >> strideipred); >> void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, >> int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int >> strideipred); >> _______________________________________________ >> x265-devel mailing list >> x265-devel@videolan.org >> https://mailman.videolan.org/listinfo/x265-devel >
signature.asc
Description: Message signed with OpenPGP using GPGMail
_______________________________________________ x265-devel mailing list x265-devel@videolan.org https://mailman.videolan.org/listinfo/x265-devel