This is an automated email from the ASF dual-hosted git repository.

baunsgaard pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


The following commit(s) were added to refs/heads/master by this push:
     new 4898c95  [DOCS] Kmeans update docs
4898c95 is described below

commit 4898c955d54bdb44be32e1f5d228f737fffadc88
Author: baunsgaard <baunsga...@tugraz.at>
AuthorDate: Fri Oct 30 11:40:16 2020 +0100

    [DOCS] Kmeans update docs
---
 docs/site/builtins-reference.md                      | 8 +++++---
 scripts/builtin/kmeans.dml                           | 1 +
 src/test/scripts/functions/builtin/kmeansPredict.dml | 2 +-
 src/test/scripts/functions/builtin/smote.dml         | 2 +-
 4 files changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/site/builtins-reference.md b/docs/site/builtins-reference.md
index 66b2299..22c1491 100644
--- a/docs/site/builtins-reference.md
+++ b/docs/site/builtins-reference.md
@@ -557,7 +557,7 @@ imputeByFD(X, sourceAttribute, targetAttribute, threshold)
 ### Example
 
 ```r
-X = matrix("1 1 1 2 4 5 5 3 3 NaN 4 5 4 1", rows=7, cols=2) 
+X = matrix("1 1 1 2 4 5 5 3 3 NaN 4 5 4 1", rows=7, cols=2)
 imputeByFD(X = X, source = 1, target = 2, threshold = 0.6, verbose = FALSE)
 ```
 
@@ -568,7 +568,7 @@ The kmeans() implements the KMeans Clustering algorithm.
 ### Usage
 
 ```r
-kmeans(X = X, k = 20, runs = 10, max_iter = 5000, eps = 0.000001, is_verbose = 
FALSE, avg_sample_size_per_centroid = 50)
+kmeans(X = X, k = 20, runs = 10, max_iter = 5000, eps = 0.000001, is_verbose = 
FALSE, avg_sample_size_per_centroid = 50, seed = -1)
 ```
 
 ### Arguments
@@ -581,6 +581,8 @@ kmeans(X = X, k = 20, runs = 10, max_iter = 5000, eps = 
0.000001, is_verbose = F
 | max_iter   | Int             | `100`      |Max no. of iterations allowed |
 | eps        | Double          | `0.000001` | Tolerance (epsilon) for WCSS 
change ratio |
 | is_verbose | Boolean         |   FALSE    | do not print per-iteration stats 
|
+| avg_sample_size_per_centroid | Int         |   50    | Average number of records 
per centroid in data samples |
+| seed | Int         |   -1    | The seed used for initial sampling. If set to 
-1, random seeds are selected. |
 
 ### Returns
 
@@ -593,7 +595,7 @@ kmeans(X = X, k = 20, runs = 10, max_iter = 5000, eps = 
0.000001, is_verbose = F
 
 ```r
 X = rand (rows = 3972, cols = 972)
-kmeans(X = X, k = 20, runs = 10, max_iter = 5000, eps = 0.000001, is_verbose = 
FALSE, avg_sample_size_per_centroid = 50)
+kmeans(X = X, k = 20, runs = 10, max_iter = 5000, eps = 0.000001, is_verbose = 
FALSE, avg_sample_size_per_centroid = 50, seed = -1)
 ```
 
 ## `lm`-Function
diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml
index 646ec34..ee66e3d 100644
--- a/scripts/builtin/kmeans.dml
+++ b/scripts/builtin/kmeans.dml
@@ -32,6 +32,7 @@
 # eps                               Double    0.000001 Tolerance (epsilon) for 
WCSS change ratio
 # is_verbose                        Boolean   FALSE    do not print 
per-iteration stats
 # avg_sample_size_per_centroid      Int       50       Average number of 
records per centroid in data samples
+# seed                              Int       -1       The seed used for 
initial sampling. If set to -1, random seeds are selected.
 #
 #
 # RETURN VALUES
diff --git a/src/test/scripts/functions/builtin/kmeansPredict.dml 
b/src/test/scripts/functions/builtin/kmeansPredict.dml
index a96fc28..59dc726 100644
--- a/src/test/scripts/functions/builtin/kmeansPredict.dml
+++ b/src/test/scripts/functions/builtin/kmeansPredict.dml
@@ -21,7 +21,7 @@
 
 X = read($X)
 
-[C, Y] = kmeans(X,  $k, $runs, $max_iter, $eps, TRUE, 50)
+[C, Y] = kmeans(X,  $k, $runs, $max_iter, $eps, TRUE, 50, 1324)
 Y_1 = kmeansPredict(X, C)
 
 res = mean(Y==Y_1)
diff --git a/src/test/scripts/functions/builtin/smote.dml 
b/src/test/scripts/functions/builtin/smote.dml
index 3891c98..5a8d5d6 100644
--- a/src/test/scripts/functions/builtin/smote.dml
+++ b/src/test/scripts/functions/builtin/smote.dml
@@ -30,7 +30,7 @@ T = read($T);
 A_B = rbind(A, B)
 n = nrow(A_B)
 # group data into k=2 clusters
-[C, Y] = kmeans(rbind(A_B, T),  2, 10, 100, 0.000001, FALSE, 50)
+[C, Y] = kmeans(rbind(A_B, T),  2, 10, 100, 0.000001, FALSE, 50, 314)
 # check if the instances of A and B fall in same cluster
 check = matrix(as.scalar(Y[1,1]), n, 1)
 testSum = sum(check - Y[1:n,])

Reply via email to