Please try https://github.com/JuliaStats/Distances.jl/pull/44

Using:

# Benchmark helper: prints the time taken by 5000 calls of the hand-written
# `myjaccard5(a, b)`, then the time taken by 5000 calls of
# `evaluate(Jaccard(), a, b)` on the same two vectors.
function testDistances5(a::Array{Float64,1}, b::Array{Float64,1})
    nreps = 5000

    # Hand-written loop implementation.
    @time for _ in 1:nreps
        myjaccard5(a, b)
    end

    # Same number of evaluations through the `Jaccard()` distance object.
    d = Jaccard()
    @time for _ in 1:nreps
        evaluate(d, a, b)
    end

    return nothing
end


I get:
julia> testDistances5(rand(100), rand(100))
  0.000849 seconds
  0.001670 seconds




On Monday, June 13, 2016 at 7:57:20 PM UTC+2, jean-pierre both wrote:
>
> It makes perfect sense to use Jaccard distances for float values
> Cf  for example http://www.ncbi.nlm.nih.gov/pubmed/16794951
>
> Nevertheless the problem is just an implementation, the time spent should 
> be
> comparable with the one with Euclidean.
>
> The problem I mentioned is that the nice implementation used in
> the Distances package performs poorly for this distance, as a simple loop is 
> much faster.
> I presume there is an optimization issue as the difference in time with 
> Euclidean is many orders of magnitude 
> larger than what can be expected from the complexity.
>
>
> The funny thing is that min and max also seem to be part of the problem, as can 
> be seen in the following:
>
>
> function myjaccard2(a::Array{Float64,1}, b::Array{Float64,1})
>     num = 0.
>     den = 0.
>     for I in 1:length(a)
>         @inbounds ai = a[I]
>         @inbounds bi = b[I]
>         num = num + min(ai,bi)
>         den = den + max(ai,bi)      
>     end
>     1. - num/den
> end
>
>
>
> function testDistances2(v1::Array{Float64,1}, v2::Array{Float64,1})
>     for i in 1:50000
>         myjaccard2(v1,v2)
>     end
> end
>
> @time testDistances2(v1,v2)
> machine   3.217329 seconds (200.01 M allocations: 2.981 GB, 19.91% gc time)
>
>
>
> function myjaccard5(a::Array{Float64,1}, b::Array{Float64,1})
>     num = 0.
>     den = 0.
>     for I in 1:length(a)
>         @inbounds ai = a[I]
>         @inbounds bi = b[I]
>         abs_m = abs(ai-bi)
>         abs_p = abs(ai+bi)
>         num += abs_p - abs_m
>         den += abs_p + abs_m   
>     end
>     1. - num/den
> end
>
>
> function testDistances5(a::Array{Float64,1}, b::Array{Float64,1})
>     for i in 1:5000
>         myjaccard5(a,b)
>     end
> end
>
> end
>
>
> julia> @time testDistances5(v1,v2)
>   0.166979 seconds (4 allocations: 160 bytes)
>
>
>
> We see that using abs is faster.
>
> I am not opening a pull request because
>
> I would expect a good implementation to be only 2 or 3 times slower than 
> Euclidean, and I have not 
> achieved that yet.
>
> Le lundi 13 juin 2016 13:43:00 UTC+2, Kristoffer Carlsson a écrit :
>>
>> It seems weird to me that you guys want to call Jaccard distance with 
>> float arrays. AFAIK Jaccard distance measures the distance between two 
>> distinct samples from a pair of sets so basically between two Vector{Bool}, 
>> see: 
>> http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.jaccard.html
>>
>> "Computes the Jaccard-Needham dissimilarity between two boolean 1-D 
>> arrays."
>>
>> Is there some more general formulation of it that extends to vectors in a 
>> continuous vector space?
>>
>> And, to note, Jaccard is type stable for inputs of Vector{Bool} in 
>> Distances.jl.
>>
>> On Monday, June 13, 2016 at 3:53:14 AM UTC+2, jean-pierre both wrote:
>>>
>>>
>>>
>>> In my application I ran into a problem with Distances.Jaccard compared with 
>>> Distances.Euclidean:
>>> it was very slow.
>>>
>>> For example, with 2 Float64 vectors of size 11520
>>>
>>> I get the following 
>>> julia> D=Euclidean()
>>> Distances.Euclidean()
>>> julia> @time for i in 1:500
>>>        evaluate(D,v1,v2)
>>>        end
>>>   0.002553 seconds (500 allocations: 7.813 KB)
>>>
>>> and with Jaccard
>>>
>>> julia> D=Jaccard()
>>> Distances.Jaccard()
>>> @time for i in 1:500
>>>               evaluate(D,v1,v2)
>>>               end
>>>   1.995046 seconds (40.32 M allocations: 703.156 MB, 9.68% gc time)
>>>
>>> With a simple loop for computing jaccard :
>>>
>>>
>>> function myjaccard2(a::Array{Float64,1}, b::Array{Float64,1})
>>>            num = 0
>>>            den = 0
>>>            for i in 1:length(a)
>>>                    num = num + min(a[i],b[i])
>>>                    den = den + max(a[i],b[i])      
>>>            end
>>>                1. - num/den
>>>        end
>>> myjaccard2 (generic function with 1 method)
>>>
>>> julia> @time for i in 1:500
>>>               myjaccard2(v1,v2)
>>>               end
>>>   0.451582 seconds (23.04 M allocations: 351.592 MB, 20.04% gc time)
>>>
>>> I do not see where the problem is in the Jaccard distance implementation in the 
>>> Distances package
>>>
>>

Reply via email to