Thanks. If 10bpp also overflows, we may remove PMADDWD entirely to avoid overflow in 8bpp as well. These costs come from lookahead, and I have never analyzed their dynamic range before.
At 2015-11-17 21:48:34,"Dnyaneshwar Gorade" <[email protected]> wrote: Hi Min, There is overflow with 10-bit as well, which causes output mismatch. So, we should use pmaddwd for BIT_DEPTH <= 8 only. You can reproduce the output mismatch with the following CLI (frame no: 141, check value: m_scratch + 114): CrowdRun_1920x1080_50_10bit_422.yuv --preset fast --aq-mode 0 --sar 2 --range full --no-info --hash=1 --psnr --ssim On Wed, Nov 11, 2015 at 10:12 PM, Min Chen <[email protected]> wrote: # HG changeset patch # User Min Chen <[email protected]> # Date 1447258832 21600 # Node ID df66a0f940c87df49318203de0231dca6ad8b4e4 # Parent a74493c5b7ab137c3f082d9a661a7498a883baad asm: fix Main12 bug in mbtree_propagate_cost, (the IntraCost over 16bits) --- source/common/x86/mc-a2.asm | 29 +++++++++++++++++++++++++++++ 1 files changed, 29 insertions(+), 0 deletions(-) diff -r a74493c5b7ab -r df66a0f940c8 source/common/x86/mc-a2.asm --- a/source/common/x86/mc-a2.asm Wed Nov 11 10:04:57 2015 -0600 +++ b/source/common/x86/mc-a2.asm Wed Nov 11 10:20:32 2015 -0600 @@ -1019,7 +1019,16 @@ por m3, m1 movd m1, [r1+r5*2] ; prop +%if (BIT_DEPTH <= 10) pmaddwd m0, m2 +%else + punpckldq m2, m2 + punpckldq m0, m0 + pmuludq m0, m2 + pshufd m2, m2, q3120 + pshufd m0, m0, q3120 +%endif + punpcklwd m1, m4 cvtdq2pd m0, m0 mulpd m0, m6 ; intra*invq*fps_factor>>8 @@ -1063,7 +1072,15 @@ por m3, m1 movd m1, [r1+r5*2] ; prop +%if (BIT_DEPTH <= 10) pmaddwd m0, m2 +%else + punpckldq m2, m2 ; DWORD [- 1 - 0] + punpckldq m0, m0 + pmuludq m0, m2 ; QWORD [m1 m0] + pshufd m2, m2, q3120 + pshufd m0, m0, q3120 +%endif punpcklwd m1, m4 cvtdq2pd m0, m0 mulpd m0, m6 ; intra*invq*fps_factor>>8 @@ -1103,7 +1120,11 @@ pminsd xm3, xm2 pmovzxwd xm1, [r1+r5*2] ; prop +%if (BIT_DEPTH <= 10) pmaddwd xm0, xm2 +%else + pmulld xm0, xm2 +%endif cvtdq2pd m0, xm0 cvtdq2pd m1, xm1 ; prop %if cpuflag(avx2) @@ -1145,7 +1166,11 @@ movd xm1, [r1+r5*2] ; prop pmovzxwd xm1, xm1 +%if (BIT_DEPTH <= 10) pmaddwd xm0, xm2 +%else + pmulld xm0, xm2 
+%endif cvtdq2pd m0, xm0 cvtdq2pd m1, xm1 ; prop %if cpuflag(avx2) @@ -1179,7 +1204,11 @@ movzx r6d, word [r1+r5*2] ; prop movd xm1, r6d +%if (BIT_DEPTH <= 10) pmaddwd xm0, xm2 +%else + pmulld xm0, xm2 +%endif cvtdq2pd m0, xm0 cvtdq2pd m1, xm1 ; prop %if cpuflag(avx2) _______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
