[systemds] 05/05: [SYSTEMDS-2710] Fix K-Means and GMM builtin functions, robustness IPA

mboehm7 Sat, 31 Oct 2020 15:07:39 -0700

This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git


commit 3f8ff967da5e8b01d9e04604290646d1b748d20d
Author: Matthias Boehm <mboe...@gmail.com>
AuthorDate: Sat Oct 31 23:05:47 2020 +0100

    [SYSTEMDS-2710] Fix K-Means and GMM builtin functions, robustness IPA
    
    This patch fixes various issues related to the recent addition of seeds
    to the Kmeans builtin function.
    
    First, the GMM built-in function was failing in IPA because the GMM call
    to Kmeans used by-position arguments and thus missed the new seed
    argument. We now use named parameters to guard against future additions.
    
    Second, the error handling in such cases of mismatching numbers of
    arguments was horrible (failing in inter-procedural analysis index out
    of bounds exceptions). We now use a more robust handling in IPA such
    that the user get the intended, error message explaining the problem.
    
    Third, the kmeans addition of seeds used a uniform random matrix and
    incrementally added uniform random matrices per centroid. Adding
    multiple uniformly distributed random variables, gives a normally
    distributed random variable. For K-Means initialization this is not
    desirable and ultimately led to the flaky python test failures.
---
 scripts/builtin/gmm.dml                            | 70 +++++++++-------------
 scripts/builtin/kmeans.dml                         |  5 +-
 .../sysds/hops/ipa/InterProceduralAnalysis.java    |  4 +-
 3 files changed, 32 insertions(+), 47 deletions(-)

diff --git a/scripts/builtin/gmm.dml b/scripts/builtin/gmm.dml
index 2eafe13..7c327ca 100644
--- a/scripts/builtin/gmm.dml
+++ b/scripts/builtin/gmm.dml
@@ -54,8 +54,7 @@
 
 m_gmm = function(Matrix[Double] X, Integer n_components = 1, String model = 
"VVV", String init_params = "kmeans", Integer iter = 100, Double reg_covar = 
1e-6, Double tol = 0.000001, Boolean verbose = FALSE )
 return (Matrix[Double] weights, Matrix[Double] labels, Integer df, Double bic)
-{ 
-
+{
   # sanity checks
   if(model != "VVV" & model != "EEE" & model != "VVI" & model != "VII")
     stop("model not supported, should be in VVV, EEE, VVI, VII");
@@ -72,13 +71,14 @@ return (Matrix[Double] weight, Matrix[Double] mean, 
List[Unknown] sigma, List[Un
   resp = matrix(0, nrow(X), n_components)
   if(init_params == "kmeans")
   {
-    [C, Y] = kmeans(X, n_components, 10, 10, tol, FALSE, 25)
+    [C, Y] = kmeans(X=X, k=n_components, runs=10, max_iter=10, 
+                    eps=tol, is_verbose=FALSE, avg_sample_size_per_centroid=25)
     resp = resp + t(seq(1,n_components))
     resp = resp == Y
   }
   else if(init_params == "random")
   {
-    resp = Rand(rows = nrow(X), cols=n_components)   
+    resp = Rand(rows = nrow(X), cols=n_components)
     resp = resp/rowSums(resp)
   }
   else stop("invalid parameter value, expected kmeans or random found 
"+init_params) 
@@ -109,7 +109,7 @@ return (Matrix[Double] weight, Matrix[Double] mean, 
List[Unknown] sigma)
   mu = (t(resp) %*% X) / t(nk)
   sigma = list()
   if(model == "VVV")
-    sigma = covariances_VVV(X, resp, mu, nk, reg_covar) 
+    sigma = covariances_VVV(X, resp, mu, nk, reg_covar)
   else if(model == "EEE")
     sigma = covariances_EEE(X, resp, mu, nk, reg_covar)
   else if(model ==  "VVI")
@@ -118,7 +118,7 @@ return (Matrix[Double] weight, Matrix[Double] mean, 
List[Unknown] sigma)
     sigma = covariances_VII(X, resp, mu, nk, reg_covar)
     
   weight = nk
-  mean = mu # n_components * n_features     
+  mean = mu # n_components * n_features
 }
 
 # Matrix/Vector Parameters/List
@@ -128,13 +128,12 @@ covariances_VVV = function(Matrix[Double] X, 
Matrix[Double] resp, Matrix[Double]
 return(List[Unknown] sigma)
 {
   sigma = list()
-  for(k in 1:nrow(mu))
-  {
+  for(k in 1:nrow(mu)) {
     diff = X - mu[k,]
     cov = (t(diff * resp[, k]) %*% diff) / as.scalar(nk[1,k])
     cov = cov + diag(matrix(reg_covar, ncol(cov), 1))
     sigma = append(sigma, cov)
-  }       
+  }
 }
 
 # Matrix/Vector Parameters/List
@@ -192,12 +191,12 @@ return (List[Unknown] precision_chol)
       isSPD = checkSPD(cov)
       if(isSPD) {
         cov_chol = cholesky(cov)
-        pre_chol = t(inv(cov_chol))                      
+        pre_chol = t(inv(cov_chol))
         precision_chol = append(precision_chol, pre_chol)
       } else 
         stop("Fitting the mixture model failed because some components have 
ill-defined empirical covariance (i.e., singleton matrix or non-symmetric )."+ 
         "\nTry to decrease the number of components, or increase reg_covar")
-    }      
+    }
   }
   else if(model == "EEE") {
     cov = as.matrix(sigma[1])
@@ -217,7 +216,7 @@ return (List[Unknown] precision_chol)
       "\nTry to decrease the number of components, or increase reg_covar")
     else {
       precision_chol = append(precision_chol, 1.0/sqrt(cov))
-    }   
+    }
   }
 }
 
@@ -244,7 +243,7 @@ return (Matrix[Double] weight_log_pro)
 {
   weight_log_pro = estimate_log_prob(X, mu, precisions_cholesky, model) + 
estimate_log_weights(w)
 }
-  
+
 estimate_log_weights = function(Matrix[Double] w)
 return (Matrix[Double] log_weight)
 {
@@ -263,28 +262,23 @@ return(Matrix[Double] log_det_cholesky)
 {
   comp = length(mat_chol)
 
-  if(model == "VVV")
-  { 
-    log_det_chol = matrix(0, 1, comp) 
-    for(k in 1:comp)
-    {
+  if(model == "VVV") {
+    log_det_chol = matrix(0, 1, comp)
+    for(k in 1:comp) {
       mat = as.matrix(mat_chol[k])
       log_det = sum(log(diag(t(mat))))   # have to take the log of diag 
elements only
       log_det_chol[1,k] = log_det
-    }      
+    }
   }
-  else if(model == "EEE")
-  { 
+  else if(model == "EEE") {
     mat = as.matrix(mat_chol[1])
     log_det_chol = as.matrix(sum(log(diag(mat))))
   }
-  else if(model ==  "VVI")
-  {
+  else if(model ==  "VVI") {
     mat = as.matrix(mat_chol[1])
     log_det_chol = t(rowSums(log(mat)))
   }
-  else if (model == "VII")
-  {
+  else if (model == "VII") {
     mat = as.matrix(mat_chol[1])
     log_det_chol = t(d * log(mat))
   }
@@ -299,27 +293,23 @@ return(Matrix[Double] es_log_prob ) # nrow(X) * 
n_components
   n_components = nrow(mu)
 
   log_det = compute_log_det_cholesky(prec_chol, model, d)
-  if(model == "VVV")
-  { 
-    log_prob = matrix(0, n, n_components) 
-    for(k in 1:n_components)
-    {
+  if(model == "VVV") {
+    log_prob = matrix(0, n, n_components)
+    for(k in 1:n_components) {
       prec = as.matrix(prec_chol[k]) 
       y = X %*% prec - mu[k,] %*% prec  # changing here t intro:  y = X %*% 
prec - mu[k,] %*% prec 
       log_prob[, k] = rowSums(y*y)
-    }      
+    }
   }
-  else if(model == "EEE")
-  { 
-    log_prob = matrix(0, n, n_components) 
+  else if(model == "EEE") {
+    log_prob = matrix(0, n, n_components)
     prec = as.matrix(prec_chol[1])
     for(k in 1:n_components) {
       y = X %*% prec - mu[k,] %*% prec
       log_prob[, k] = rowSums(y*y) # TODO replace y*y with squared built-in
     }
   }
-  else if(model ==  "VVI")
-  {
+  else if(model ==  "VVI") {
     prec = as.matrix(prec_chol[1])
     precisions = prec^2
     bc_matrix = matrix(1,nrow(X), nrow(mu))
@@ -327,8 +317,7 @@ return(Matrix[Double] es_log_prob ) # nrow(X) * n_components
                     2. * (X %*% t(mu * precisions)) +
                     X^2 %*% t(precisions))
   }
-  else if (model == "VII")
-  {
+  else if (model == "VII") {
     prec = as.matrix(prec_chol[1])
     precisions = prec^ 2
     bc_matrix = matrix(1,nrow(X), nrow(mu))
@@ -367,13 +356,11 @@ return (Integer n_parameters)
   mean_param = n_features * n_components
   
   n_parameters = as.integer( cov_param + mean_param + n_components - 1 )
-
 }
 
 fit = function(Matrix[Double] X, Integer n_components, String model, String 
init_params, Integer iter , Double reg_covar, Double tol)
 return (Matrix[Double] label, Matrix[Double] predict_prob, Double 
log_prob_norm)
 {
-
   lower_bound = 0
   converged = FALSE
   n = nrow(X)
@@ -387,7 +374,7 @@ return (Matrix[Double] label, Matrix[Double] predict_prob, 
Double log_prob_norm)
     change = lower_bound - prev_lower_bound
     if(abs(change) < tol)
       converged = TRUE
-    i = i+1   
+    i = i+1
   }
   [log_prob_norm, log_resp] = e_step(X,weight, mean, precision_chol, model)
   label = rowIndexMax(log_resp)
@@ -415,4 +402,3 @@ return(Boolean isSPD)
   }
   else isSPD = FALSE
 }
-
diff --git a/scripts/builtin/kmeans.dml b/scripts/builtin/kmeans.dml
index ee66e3d..3567dc3 100644
--- a/scripts/builtin/kmeans.dml
+++ b/scripts/builtin/kmeans.dml
@@ -85,8 +85,6 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer 
runs = 10, Integer
 
   min_distances = is_row_in_samples;  # Pick the 1-st centroids uniformly at 
random
 
-  seed = ifelse(seed == -1, -1, seed + 1)
-  random_row = rand(rows = 1, cols = num_runs, seed = seed);
   for (i in 1 : num_centroids)
   {
     # "Matricize" and prefix-sum to compute the cumulative distribution 
function:
@@ -95,7 +93,8 @@ m_kmeans = function(Matrix[Double] X, Integer k = 10, Integer 
runs = 10, Integer
 
     # Select the i-th centroid in each sample as a random sample row id with
     # probability ~ min_distances:
-    random_row = random_row + rand(rows = 1, cols = 1, seed = seed + i)
+    lseed = ifelse(seed==-1, -1, seed + i);
+    random_row = rand(rows = 1, cols = num_runs, seed = lseed)
     threshold_matrix = random_row * cdf_min_distances [sample_block_size, ];
     centroid_ids = t(colSums (cdf_min_distances < threshold_matrix)) + 1;
 
diff --git 
a/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java 
b/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
index 579af77..14556cb 100644
--- a/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
+++ b/src/main/java/org/apache/sysds/hops/ipa/InterProceduralAnalysis.java
@@ -521,8 +521,8 @@ public class InterProceduralAnalysis
                ArrayList<Hop> inputOps = fop.getInput();
                String fkey = fop.getFunctionKey();
                
-               for( int i=0; i<funArgNames.length; i++ )
-               {
+               //iterate over all parameters (with robustness for missing 
parameters)
+               for( int i=0; i<Math.min(inputOps.size(), funArgNames.length); 
i++ ) {
                        //create mapping between input hops and vars
                        DataIdentifier dat = 
fstmt.getInputParam(funArgNames[i]);
                        Hop input = inputOps.get(i);

[systemds] 05/05: [SYSTEMDS-2710] Fix K-Means and GMM builtin functions, robustness IPA

Reply via email to