>@@ -1154,14 +1154,18 @@ > ; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int > scale, int shift) > ;----------------------------------------------------------------------------- > INIT_XMM sse4 >-cglobal dequant_normal, 2,5,8 >- movd m1, r3m ; m1 = word [scale] >+cglobal dequant_normal, 4,6,5 >+ movd m1, r3 ; m1 = word [scale] >+ cmp r3d, 255 >+ jle .skip >+ psrld m1, 2 >+.skip: > mov r4d, r4m > movd m0, r4d ; m0 = shift >- xor r3d, r3d >+ xor r5d, r5d > dec r4d >- bts r3d, r4d >- movd m2, r3d >+ bts r5d, r4d >+ movd m2, r5d > punpcklwd m1, m2 > pshufd m1, m1, 0 ; m1 = dword [add scale] > mova m2, [pw_1] >@@ -1174,6 +1178,10 @@ > movu m3, [r0] > movu m4, [r0 + 16] > packssdw m3, m4 ; m3 = clipQCoef >+ cmp r3d, 255 >+ jle .skip1 >+ psllw m3, 2 >+.skip1: > punpckhwd m4, m3, m2 > punpcklwd m3, m2 > pmaddwd m3, m1 ; m3 = dword (clipQCoef * scale + add)
Don't use a conditional jump inside the inner loop; it hurts performance. Instead, write two separate code blocks (one for each case) and do the jump once, at the head of the function.
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
