;-----------------------------------------------------------------------------
; count_nonzero(coeffs, numCoeff)
;
; Returns numCoeff minus the number of 32-bit elements equal to zero.
; NOTE(review): presumably `int count_nonzero(const int32_t *coeffs,
; int numCoeff)` -- confirm against the C prototype.
;
; In:    r0  = pointer to the coefficients, read with mova (aligned loads)
;        r1d = numCoeff; consumed 8 dwords (32 bytes) per iteration
; Out:   eax = numCoeff - (count of zero dwords)
; Clobb: r1, r2, m0-m3, flags
;
; Preconditions (not checked here):
;   * numCoeff >= 8: the loop tests its counter only after the first pass,
;     so a count below 8 would wrap the counter and run off the buffer.
;   * numCoeff is a multiple of 8: any tail of fewer than 8 elements is never
;     read and is therefore counted as nonzero.
;
; Review notes folded into this version:
;   * The pcmpeq mask is already all-ones (-1) in every zero lane, so it is
;     accumulated as-is; no shift is needed to turn it into a 0/1 value.
;   * No exact count is needed inside the loop. Masks are summed per lane and
;     reduced once after the loop, so no psadbw is issued per iteration.
;   * Comparing the dwords directly makes the packssdw/packsswb narrowing
;     unnecessary: all we need is "zero or not".
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal count_nonzero, 2,3,4
    pxor        m0, m0                  ; m0 = 0, reference for the compares
    pxor        m1, m1                  ; m1 = per-lane sum of compare masks
    mov         r2d, r1d                ; r2d = numCoeff, kept for the result
    shr         r1d, 3                  ; r1d = iteration count (8 dwords each)

.loop:
    mova        m2, [r0]
    mova        m3, [r0 + 16]
    add         r0, 32
    pcmpeqd     m2, m0                  ; lane = -1 where the element is zero
    pcmpeqd     m3, m0
    paddd       m1, m2                  ; each lane accumulates -(zeros seen)
    paddd       m1, m3
    dec         r1d
    jnz         .loop

    ; Horizontal sum of the four dword lanes. m0 is free to reuse as scratch:
    ; the zero reference is no longer needed past the loop.
    pshufd      m0, m1, 0x4E            ; swap the two 64-bit halves
    paddd       m1, m0
    pshufd      m0, m1, 0xB1            ; swap adjacent dwords
    paddd       m1, m0

    movd        r1d, m1                 ; r1d = -(total zero elements)
    add         r2d, r1d                ; numCoeff - zeros (mod 2^32)
    mov         eax, r2d

    RET
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
