First version: runs in 500 ms
# Vectorized counting loop: each iteration consumes 8 elements (esi -= 8),
# comparing 4 packed dword "indices" in xmm5 against two reference vectors
# (xmm0 and xmm2) and accumulating per-lane match counts in xmm4 / xmm1.
# NOTE(review): xmm0/xmm2/xmm4/xmm1/esi are initialized before this label,
# outside the quoted excerpt — roles inferred from usage, confirm in full listing.
LBB0_1: # =>This Inner Loop Header: Depth=1
movdqa xmm3, xmm5                       # copy current index vector for compare #1
movdqa xmm6, xmm5                       # copy current index vector for compare #2
paddd xmm5, xmmword ptr [__xmm@00000008000000080000000800000008] # advance all 4 lanes by 8 (stride matches esi -= 8)
add esi, -8                             # 8 elements done; sets ZF consumed by jne below
pcmpeqd xmm3, xmm0                      # lane mask: all-ones where lane == xmm0 lane, else 0
pcmpeqd xmm6, xmm2                      # lane mask: all-ones where lane == xmm2 lane, else 0
psubd xmm4, xmm3                        # count -= (-1) per matching lane => count += 1
psubd xmm1, xmm6                        # same accumulation for the second comparison
jne LBB0_1                              # flags are from `add esi, -8` (pcmpeqd/psubd do not touch EFLAGS)
With multiplication: 2500 ms (5x slower):
# Same loop shape as LBB0_1, but each iteration also squares the packed
# 32-bit lanes. The target lacks a packed 32x32->32 multiply (pmulld is
# SSE4.1), so the compiler emulates it with two pmuludq (32x32->64 on even
# lanes) plus pshufd/punpckldq repacking — ~3x the instruction count, and
# register pressure forces a spill/reload of the induction vector through
# [esp + 16]. That emulation + spill is the likely source of the 5x slowdown.
# NOTE(review): xmm0/xmm1/xmm2/xmm7/esi are set up before this label — roles
# inferred from usage here; confirm against the full listing.
LBB0_3: # =>This Inner Loop Header: Depth=1
movdqu xmm6, xmmword ptr [esp + 16] # 16-byte Reload — induction vector kept in memory, not a register
add esi, -8                             # 8 elements per iteration; ZF consumed by jne below
movdqa xmm4, xmm6                       # copy of indices for square #1
pshufd xmm5, xmm6, 245 # xmm5 = xmm6[1,1,3,3] — move odd lanes into even positions for pmuludq
movdqa xmm3, xmm6                       # copy of indices for the offset/square #2 path
paddd xmm6, xmmword ptr [__xmm@00000008000000080000000800000008] # advance all 4 lanes by 8
pmuludq xmm4, xmm4                      # even lanes: 32x32 -> 64-bit squares
pmuludq xmm5, xmm5                      # odd lanes (now in even slots): 32x32 -> 64-bit squares
paddd xmm3, xmm1                        # xmm3 = indices + xmm1 (offset vector set up outside excerpt)
pshufd xmm4, xmm4, 232 # xmm4 = xmm4[0,2,2,3] — pack low 32 bits of the two 64-bit products
pshufd xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3]
punpckldq xmm4, xmm5 # xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] — reassemble 4 low-32-bit squares in lane order
pshufd xmm5, xmm3, 245 # xmm5 = xmm3[1,1,3,3] — same widen-multiply-repack dance for the offset vector
pmuludq xmm3, xmm3                      # even lanes squared
pmuludq xmm5, xmm5                      # odd lanes squared
pshufd xmm3, xmm3, 232 # xmm3 = xmm3[0,2,2,3]
pcmpeqd xmm4, xmm2                      # lane mask: all-ones where square == xmm2 lane
movdqu xmmword ptr [esp + 16], xmm6 # 16-byte Spill — advanced indices go back to the stack slot
pshufd xmm5, xmm5, 232 # xmm5 = xmm5[0,2,2,3]
psubd xmm7, xmm4                        # count += 1 per matching lane (subtracting -1 masks)
punpckldq xmm3, xmm5 # xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1] — 4 low-32-bit squares of (index + offset)
pcmpeqd xmm3, xmm2                      # second comparison against xmm2
psubd xmm0, xmm3                        # accumulate second match count
jne LBB0_3                              # flags from `add esi, -8`; SSE ops above do not touch EFLAGS
If compiled with **tcc**, performance is almost the same for both versions.