On Thursday, January 30, 2014 4:30:00 AM UTC-6, Jon Norberg wrote:
>
> Well, I solved it for now with subsampling:
>
> using Distance
>
>
> n=50000
>
> a=rand(3,n)
>
> #@time r=pairwise(Euclidean(),a,a)
>
> subsample=10
>
> m=integer(n/subsample)
>
>
You may need to be a bit more careful about the index ranges if n is not a
multiple of subsample.
> s=spzeros(n,n)
>
>
Generally it is a bad idea to initialize a sparse matrix to zeros then
insert elements. A SparseMatrixCSC object is stored in compressed column
format which makes insertions expensive.
The preferred approach is to create the matrix in triplet format (i.e.
three parallel vectors of the row indices, the column indices and the
non-zero value for the non-zeros only) and only convert to SparseMatrixCSC
when you are done generating the matrix.
> r=zeros(m,m)
>
> threshold=0.2
>
> for i=1:subsample-1
>
> ii=(i-1)*m+1
>
> for j=1:subsample-1
>
> jj=(j-1)*m+1
>
> r=pairwise(Euclidean(),a[:,ii:ii+m-1],a[:,jj:jj+m-1])
>
>
This will be more expensive then necessary because the subsections of 'a' will
be copied (I think). Because the arguments to pairwise are declared as
AbstractArray types you can use sub(a,:,ii:ii+m-1] or, if you add
using NumericExtensions
view(a, :, ii:ii+m-1). (view is expected to become part of Base but I
don't think that has happened yet.)
> r[r.>threshold]=0
>
> s[ii:ii+m-1,jj:jj+m-1]=sparse(r)
>
> end
>
> end
>
>
> If anyone know any performance improving tricks I'd be grateful.
>
>
I would start with something like
using Distance,NumericExtensions
# Create a vector of k contiguous index ranges covering 1:N
function subinds(k, N)
step = iceil(N/k)
rng = 1:k
res = [rng + j * step for j in 0:k-2]
push!(res, ((k - 1) * step + 1):N)
res
end
function edist(a::AbstractMatrix, nblocks::Int, threshold)
N = size(a,2)
si = subinds(nblocks,N)
i = Int[]
j = Int[]
x = eltype(a)[]
for ii in si, jj in si
r = pairwise(Euclidean(), view(a,:,ii), view(a,:,jj))
ioffset = start(ii) - 1
joffset = start(jj) - 1
for ir in 1:size(r,1), jr in 1:size(r,2)
if (rij = r[ir,jr]) < threshold
push!(i,ioffset + ir)
push!(j,joffset + jr)
push!(x,rij)
end
end
end
sparse(i,j,x)
end
which provides
julia> srand(1234321)
julia> A = randn(500,1000);
julia> pairwise(Euclidean(),view(A,:,1:10))
10x10 Array{Float64,2}:
0.0 30.8681 31.0917 33.3303 32.6781 32.538 34.4923 32.0431
33.1331 31.7442
30.8681 0.0 29.9652 29.9959 30.6198 31.1438 30.9268 29.9775
30.2343 31.8855
31.0917 29.9652 0.0 31.9773 30.5056 30.9952 31.3752 30.8604
30.9147 30.2703
33.3303 29.9959 31.9773 0.0 32.9377 33.6873 33.0936 31.2665
32.1753 31.6462
32.6781 30.6198 30.5056 32.9377 0.0 31.5086 32.8416 30.4298
32.6461 30.5551
32.538 31.1438 30.9952 33.6873 31.5086 0.0 30.7051 32.0288
31.4237 31.9303
34.4923 30.9268 31.3752 33.0936 32.8416 30.7051 0.0 31.1203
32.0035 31.5522
32.0431 29.9775 30.8604 31.2665 30.4298 32.0288 31.1203 0.0
30.3211 30.6363
33.1331 30.2343 30.9147 32.1753 32.6461 31.4237 32.0035 30.3211
0.0 31.6678
31.7442 31.8855 30.2703 31.6462 30.5551 31.9303 31.5522 30.6363
31.6678 0.0
julia> ed = edist(A,10,28.5)
998x998 sparse matrix with 30 Float64 nonzeros:
[976, 210] = 28.16
[976, 310] = 28.2496
[976, 408] = 27.9055
[982, 408] = 28.4838
[998, 502] = 28.2881
[998, 604] = 28.2445
[917, 609] = 27.9873
⋮
[917, 986] = 28.3307
[976, 988] = 28.3014
[502, 998] = 28.2881
[604, 998] = 28.2445
[705, 998] = 28.2145
[934, 998] = 28.3334
[964, 998] = 28.0159
[976, 998] = 28.2735
julia> @time edist(A,10,28.5);
elapsed time: 0.017411791 seconds (1127192 bytes allocated)