Hi guys,
I'm trying to implement the K-Means Clustering Algorithm, but I'm having
some problems. The function I wrote:
"""
    kcluster(data; distance = pearson, k = 4, maxiter = 100)

Partition the rows of `data` into `k` clusters with the k-means algorithm.

Each row of `data` is treated as one item and each column as one coordinate.
`k` centroids are placed uniformly at random inside the per-column `[min, max]`
range of `data`. Every row is then assigned to its closest centroid and each
centroid is moved to the mean of its assigned rows, until the assignment stops
changing or `maxiter` iterations have run.

# Keywords
- `distance`: function called as `distance(centroid, row)` with two coordinate
  vectors; a smaller result means "closer". The default, `pearson`, is not
  defined in this block and must be in scope when the default is used.
- `k`: number of clusters.
- `maxiter`: upper bound on the number of assign/update iterations.

# Returns
A vector of `k` vectors; the `i`-th one holds the row indices of `data`
assigned to cluster `i` (it may be empty). If `maxiter < 1` the result is empty.

# Throws
- `ArgumentError` if `k < 1` or `data` has no rows.
"""
function kcluster(data; distance = pearson, k = 4, maxiter = 100)
    k >= 1 || throw(ArgumentError("k must be at least 1, got $k"))
    nrows, ncols = size(data, 1), size(data, 2)
    nrows > 0 || throw(ArgumentError("data must contain at least one row"))

    # (min, max) of each column of `data`.
    ranges = [extrema(data[:, j]) for j in 1:ncols]

    # Create k randomly placed centroids. The span (max - min) has to be
    # parenthesized: without the parentheses the expression collapses to
    # `rand() * max`, which does not stay inside the data's bounding box.
    centroids = [rand() * (ranges[j][2] - ranges[j][1]) + ranges[j][1]
                 for i in 1:k, j in 1:ncols]

    lastmatches = Vector{Int}[]
    for t in 1:maxiter
        println("Iteration $t")
        bestmatches = [Int[] for _ in 1:k]

        # Assign every row to its closest centroid; ties keep the lowest index.
        for j in 1:nrows
            row = data[j, :]
            bestmatch = 1
            bestd = distance(centroids[1, :], row)
            for i in 2:k
                d = distance(centroids[i, :], row)
                if d < bestd
                    bestd = d
                    bestmatch = i
                end
            end
            push!(bestmatches[bestmatch], j)
        end

        # Converged: the assignment is the same as in the previous iteration.
        if lastmatches == bestmatches
            return lastmatches
        end
        lastmatches = bestmatches

        # Move each non-empty cluster's centroid to the mean of its members.
        # The accumulator is a vector so it has the same shape as a row slice.
        for i in 1:k
            members = bestmatches[i]
            isempty(members) && continue
            avgs = zeros(ncols)
            for r in members
                avgs .+= data[r, :]
            end
            centroids[i, :] = avgs ./ length(members)
        end
    end
    return lastmatches
end
The "data" argument is a two-dimensional Array in which each row represents
an individual and each column one coordinate of its position in space.
The problem is the following: the same algorithm in Python (with the same
"data" input) usually stops near iteration #5, while in Julia it always runs
to iteration #100. The non-empty clusters in Python are also smaller, and
therefore there are fewer empty clusters. Can somebody see why it never
enters the "if lastmatches == bestmatches" block?
Sorry about my poor English.