+INIT_XMM sse2
+cglobal count_nonzero, 2,3,4
+    pxor        m0, m0
+    pxor        m1, m1
+    mov         r2d, r1d
+    shr         r1d, 3
+
+.loop

+    mova        m2, [r0]
+    mova        m3, [r0 + 16]

+    add         r0, 32

+    packssdw    m2, m3,
We are just counting, so there is no need for it.
 
+    pcmpeqw     m2, m0
+    psrlw       m2, 15
pcmp generates a mask, which is 0xFFFF, so we don't need to shift right.
 
+    packsswb    m2, m2
+    psadbw      m2, m0
psad has low performance; why do you need the exact number in the inner loop?
Of course, abs(-1) = abs(1).

+    paddd       m1, m2
+    dec         r1d
+    jnz        .loop
+
+    movd        r1d, m1
+    sub         r2d, r1d
+    mov         eax, r2d
+
+    RET
_______________________________________________
x265-devel mailing list
[email protected]
https://mailman.videolan.org/listinfo/x265-devel

Reply via email to