First version (500 ms):
    
    
    # Inner loop of the fast version (compiler output, Intel syntax, SSE2).
    # Per iteration: the four dword lanes of xmm5 are compared against xmm0
    # and against xmm2, the matches are counted into xmm4 and xmm1, and xmm5
    # is advanced by a constant vector. xmm0 and xmm2 are not written inside
    # the loop. esi is decremented by 8 and the loop ends when it reaches 0.
    # NOTE(review): presumably xmm0/xmm2 hold the search value adjusted for two
    # groups of 4 lanes (8 elements per iteration) - the setup code is not shown.
    LBB0_1:                                 # =>This Inner Loop Header: Depth=1
      movdqa  xmm3, xmm5                    # xmm3 = copy of current lane values
      movdqa  xmm6, xmm5                    # xmm6 = second copy of current lane values
      # NOTE(review): the constant's name suggests four dwords of 8 - confirm
      # against its definition.
      paddd xmm5, xmmword ptr [__xmm@00000008000000080000000800000008]
      add esi, -8                           # esi -= 8; sets ZF consumed by the jne below
      pcmpeqd xmm3, xmm0                    # lane = 0xFFFFFFFF where equal to xmm0, else 0
      pcmpeqd xmm6, xmm2                    # lane = 0xFFFFFFFF where equal to xmm2, else 0
      psubd xmm4, xmm3                      # subtracting -1 adds 1 to each matching lane of xmm4
      psubd xmm1, xmm6                      # same accumulation into xmm1
      jne LBB0_1                            # SSE ops leave EFLAGS alone, so this tests the add above
    

With multiplication (2500 ms, 5x slower):
    
    
    # Inner loop of the version with multiplication (compiler output, Intel
    # syntax, SSE2). Per iteration: two 4-lane vectors (the counter vector and
    # the counter vector + xmm1) are squared lane-wise, the low 32 bits of each
    # square are compared against xmm2, and the matches are counted into xmm7
    # and xmm0. SSE2 has no packed 32-bit low multiply (pmulld is SSE4.1), so
    # each square is built from two pmuludq instructions plus shuffles.
    # All eight registers xmm0-xmm7 are in use, and the counter vector is
    # reloaded from and spilled to [esp + 16] on every iteration.
    # NOTE(review): xmm6 is not written by anything else in this loop, so the
    # reload/spill round trip looks avoidable - confirm against the code that
    # surrounds the loop, which is not shown.
    LBB0_3:                                 # =>This Inner Loop Header: Depth=1
      movdqu  xmm6, xmmword ptr [esp + 16] # 16-byte Reload
      add esi, -8                     # esi -= 8; sets ZF consumed by the jne at the bottom
      movdqa  xmm4, xmm6              # xmm4 = counter vector (even lanes 0 and 2 get squared)
      pshufd  xmm5, xmm6, 245         # xmm5 = xmm6[1,1,3,3]
      movdqa  xmm3, xmm6              # xmm3 = counter vector, base for the second group
      # Advance the counter vector for the next iteration.
      # NOTE(review): the constant's name suggests four dwords of 8 - confirm.
      paddd xmm6, xmmword ptr [__xmm@00000008000000080000000800000008]
      pmuludq xmm4, xmm4              # 64-bit squares of lanes 0 and 2
      pmuludq xmm5, xmm5              # 64-bit squares of the original lanes 1 and 3
      paddd xmm3, xmm1                # second group = counter vector + xmm1
      pshufd  xmm4, xmm4, 232         # xmm4 = xmm4[0,2,2,3]
      pshufd  xmm5, xmm5, 232         # xmm5 = xmm5[0,2,2,3]
      punpckldq xmm4, xmm5      # xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1]
      # xmm4 now holds the low 32 bits of the squares of lanes 0..3 in lane order.
      pshufd  xmm5, xmm3, 245         # xmm5 = xmm3[1,1,3,3]
      pmuludq xmm3, xmm3              # 64-bit squares of lanes 0 and 2 of the second group
      pmuludq xmm5, xmm5              # 64-bit squares of lanes 1 and 3 of the second group
      pshufd  xmm3, xmm3, 232         # xmm3 = xmm3[0,2,2,3]
      pcmpeqd xmm4, xmm2              # lane = 0xFFFFFFFF where first-group square == xmm2 lane
      movdqu  xmmword ptr [esp + 16], xmm6 # 16-byte Spill
      pshufd  xmm5, xmm5, 232         # xmm5 = xmm5[0,2,2,3]
      psubd xmm7, xmm4                # subtracting -1 adds 1 to each matching lane of xmm7
      punpckldq xmm3, xmm5      # xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
      pcmpeqd xmm3, xmm2              # lane = 0xFFFFFFFF where second-group square == xmm2 lane
      psubd xmm0, xmm3                # same accumulation into xmm0
      jne LBB0_3                      # SSE ops leave EFLAGS alone, so this tests the add esi above
    

If compiled with **tcc**, the performance is almost the same.

Reply via email to