This is an automated email from the ASF dual-hosted git repository. baunsgaard pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push: new 4ff3819 [MINOR] K-means seeded fix 4ff3819 is described below commit 4ff3819c20eb5035749cfd88d32c92848e187207 Author: baunsgaard <baunsga...@tugraz.at> AuthorDate: Thu Oct 29 21:42:44 2020 +0100 [MINOR] K-means seeded fix This commit changes the seeding of k-means so that it actually works. The last change, which added the seed, unfortunately made each initial cluster get assigned to the same location, making the entire selection and sampling in the beginning invalid. Now the seeds are incremented and are different for each of the initial values. --- scripts/builtin/kmeans.dml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml index 4b407f0..646ec34 100644 --- a/scripts/builtin/kmeans.dml +++ b/scripts/builtin/kmeans.dml @@ -67,7 +67,7 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer print ("Taking data samples for initialization..."); [sample_maps, samples_vs_runs_map, sample_block_size] = get_sample_maps( - num_records, num_runs, num_centroids * avg_sample_size_per_centroid); + num_records, num_runs, num_centroids * avg_sample_size_per_centroid, seed); is_row_in_samples = rowSums (sample_maps); X_samples = sample_maps %*% X; @@ -84,6 +84,8 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer min_distances = is_row_in_samples; # Pick the 1-st centroids uniformly at random + seed = ifelse(seed == -1, -1, seed + 1) + random_row = rand(rows = 1, cols = num_runs, seed = seed); for (i in 1 : num_centroids) { # "Matricize" and prefix-sum to compute the cumulative distribution function: @@ -92,7 +94,7 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer # Select the i-th centroid in each sample as a random sample row id with # probability ~ min_distances: - random_row = rand(rows = 1, cols = num_runs, seed = seed); + random_row = random_row 
+ rand(rows = 1, cols = 1, seed = seed + i) threshold_matrix = random_row * cdf_min_distances [sample_block_size, ]; centroid_ids = t(colSums (cdf_min_distances < threshold_matrix)) + 1; @@ -222,12 +224,10 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer runs = 10, Integer C = matrix(0, num_centroids, num_records) Y = matrix(-1, 1, num_records) } - - } -get_sample_maps = function (int num_records, int num_samples, int approx_sample_size) +get_sample_maps = function (int num_records, int num_samples, int approx_sample_size, int seed) return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size) { if (approx_sample_size < num_records) @@ -239,7 +239,7 @@ get_sample_maps = function (int num_records, int num_samples, int approx_sample_ # Generate all samples in parallel by converting uniform random values into random # integer skip-ahead intervals and prefix-summing them: - sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0); + sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0, seed = seed); sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5); # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one] sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one)