Re: [julia-users] How to find the indices of the unique rows in a matrix

Peter Simon Mon, 11 Aug 2014 13:26:44 -0700

Thanks, this is very much appreciated.  After looking over your code and 
looking up the meaning of "hash collision" I now see that my code will 
silently fail sometimes, because I didn't check for collisions.  So your 
provided code is very definitely useful to me now!


--Peter

On Monday, August 11, 2014 11:47:53 AM UTC-7, Simon Kornblith wrote:
>
> Here it is copy/pasted from Base, in case it's useful for you for now:
>
> using Base.Cartesian, Base.Prehashed
> @ngenerate N typeof(A) function indunique{T,N}(A::AbstractArray{T,N}, dim
> ::Int)
>     1 <= dim <= N || return copy(A)
>     hashes = zeros(Uint, size(A, dim))
>
>     # Compute hash for each row
>     k = 0
>     @nloops N i A d->(if d == dim; k = i_d; end) begin
>        @inbounds hashes[k] = hash(hashes[k], hash((@nref N A i)))
>     end
>
>     # Collect index of first row for each hash
>     uniquerow = Array(Int, size(A, dim))
>     firstrow = Dict{Prehashed,Int}()
>     for k = 1:size(A, dim)
>         uniquerow[k] = get!(firstrow, Prehashed(hashes[k]), k)
>     end
>     uniquerows = collect(values(firstrow))
>
>     # Check for collisions
>     collided = falses(size(A, dim))
>     @inbounds begin
>         @nloops N i A d->(if d == dim
>                               k = i_d
>                               j_d = uniquerow[k]
>                           else
>                               j_d = i_d
>                           end) begin
>             if (@nref N A j) != (@nref N A i)
>                 collided[k] = true
>             end
>         end
>     end
>
>     if any(collided)
>         nowcollided = BitArray(size(A, dim))
>         while any(collided)
>             # Collect index of first row for each collided hash
>             empty!(firstrow)
>             for j = 1:size(A, dim)
>                 collided[j] || continue
>                 uniquerow[j] = get!(firstrow, Prehashed(hashes[j]), j)
>             end
>             for v in values(firstrow)
>                 push!(uniquerows, v)
>             end
>
>             # Check for collisions
>             fill!(nowcollided, false)
>             @nloops N i A d->begin
>                                  if d == dim
>                                      k = i_d
>                                      j_d = uniquerow[k]
>                                      (!collided[k] || j_d == k) && 
> continue
>                                  else
>                                      j_d = i_d
>                                  end
>                              end begin
>                 if (@nref N A j) != (@nref N A i)
>                     nowcollided[k] = true
>                 end
>             end
>             (collided, nowcollided) = (nowcollided, collided)
>         end
>     end
>
>     sort!(uniquerows)
> end
>
> On Monday, August 11, 2014 2:43:24 PM UTC-4, Simon Kornblith wrote:
>>
>> unique with a dim argument actually computes this as byproduct but does 
>> not return it. All we need is an API.
>>
>> On Monday, August 11, 2014 1:27:57 PM UTC-4, Jacob Quinn wrote:
>>>
>>> There's an open issue about it here: 
>>> https://github.com/JuliaLang/julia/issues/1845
>>>
>>> I've also played around with a simple version based on a Set:
>>>
>>> function uniqueind(C)
>>>     out = Int[]
>>>     seen = Set{eltype(C)}()
>>>     for i = 1:length(C)
>>>         @inbounds x = C[i]
>>>         if !in(x, seen)
>>>             push!(seen, x)
>>>             push!(out, i)
>>>         end
>>>     end
>>>     out
>>> end
>>>
>>>
>>> On Mon, Aug 11, 2014 at 1:22 PM, Peter Simon <[email protected]> wrote:
>>>
>>>> Greetings.
>>>>
>>>> I didn't find any built-in function to return the indices for the 
>>>> unique rows in a matrix, so I tried rolling my own.  I looked at, but 
>>>> didn't understand, the highly macro-ized, arbitrary dimensional code in 
>>>> the 
>>>> built-in unique() function.  I did discover hashing, though, from looking 
>>>> at that code.  Here is what I have so far:
>>>>
>>>>  function uniquerowindices2{T}(a::Matrix{T})
>>>>      rowhash = Uint64[hash(a[k,:]) for k = 1:size(a,1)]
>>>>      hu = unique(rowhash)
>>>>      ind = zeros(Int,length(hu))
>>>>      k1 = 1
>>>>      for m = 1:length(ind)
>>>>          for k = k1:length(rowhash)
>>>>              if rowhash[k] == hu[m]
>>>>                  ind[m] = k
>>>>                  k1 = k
>>>>                  break
>>>>              end
>>>>          end
>>>>      end
>>>>      return ind
>>>>  end
>>>>
>>>> This is quite a bit faster than my first effort, where I was comparing 
>>>> rows, rather than row hashes, but I'm betting someone out there has a much 
>>>> better way.  Thanks in advance,
>>>>
>>>> --Peter    
>>>>
>>>
>>>

Re: [julia-users] How to find the indices of the unique rows in a matrix

Reply via email to