This is an automated email from the ASF dual-hosted git repository.

mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemml.git

commit 996f61281c45a428986a89bc14c982ad41af0382
Author: Matthias Boehm <[email protected]>
AuthorDate: Tue May 12 22:35:13 2020 +0200

    [SYSTEMDS] Fix and cleanup steplm feature selection built-in
    
    This patch makes several improvements to the existing steplm built-in
    function (correctness and performance):
    
    1) So far, the lm parameters were not correctly passed through to the
       actual lm call, which for example rendered the tol, reg, and icpt
       parameters ineffective (except for icpt=1, which was the only one
       tested).
    
    2) Cleanup of unnecessary operations and control flow.
    
    3) Converted the two main for loops of greedy model building to parfor
       loops (which required a slightly different analysis of the best
       model).
    
    On a scenario with a dense 10K x 1K input matrix (convergence after
    20 iterations -> ~21000 lm training calls), this patch improved
    performance from 103.9s to 14.4s due to much better utilization (with
    fewer barriers) of the available 24 virtual cores.
---
 scripts/builtin/steplm.dml | 254 ++++++++++++++++++++-------------------------
 1 file changed, 113 insertions(+), 141 deletions(-)
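The gist of change (3): instead of tracking a running best AIC inside the
loop (a cross-iteration dependency that forces sequential execution), each
pass now records all candidate AICs and selects the winner after the
parallel scan. A minimal sketch of the pattern, distilled from the patch
below (DML; names as in steplm.dml):

    AICs = matrix(0, 1, m_orig);
    parfor (i in 1:m_orig, check = 0) {
      # disjoint column-wise result writes; check=0 disables the
      # parfor dependency analysis for these accesses
      [AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, maxi, verbose);
      AICs[1, i] = AIC_1;
    }
    # post-hoc analysis of the best model (previously a running min
    # inside the sequential for loop)
    AIC_best = min(min(AICs), AIC_best_orig);
    column_best = ifelse(checkAIC(AIC_best, AIC_best_orig, thr),
      as.scalar(rowIndexMin(AICs)), 0);
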
diff --git a/scripts/builtin/steplm.dml b/scripts/builtin/steplm.dml
index 608a477..fd0018d 100644
--- a/scripts/builtin/steplm.dml
+++ b/scripts/builtin/steplm.dml
@@ -60,170 +60,138 @@
 # STDEV_TOT_Y  Standard Deviation of the response value Y
 # AVG_RES_Y    Average of the residual Y - pred(Y|X), i.e. residual bias
 
-m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0, Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
-return(Matrix[Double] C, Matrix[Double] S) {
-
-  # currently only the forward selection strategy in supported: start
-  # from one feature and iteratively add features until AIC improves
-  dir = "forward";
+m_steplm = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0,
+  Double reg = 1e-7, Double tol = 1e-7, Integer maxi = 0, Boolean verbose = TRUE)
+  return(Matrix[Double] B, Matrix[Double] S)
+{
+  if( icpt!=0 & icpt!=1 & icpt!=2 )
+    stop("Invalid steplm invocation with icpt="+icpt+" (valid values: 0,1,2).");
+
+  # NOTE: currently only the forward selection strategy in supported:
+  # start from one feature and iteratively add features until AIC improves
   thr = 0.001;
   print("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
-  print("Reading X and Y...");
   X_orig = X;
   n = nrow(X_orig);
   m_orig = ncol(X_orig);
   # BEGIN STEPWISE LINEAR REGRESSION
-  if (dir == "forward") {
+  columns_fixed = matrix(0, 1, m_orig);
+  columns_fixed_ordered = matrix(0, 1, 1);
+
+  # X_global stores the best model found at each step
+  X_global = matrix(0, n, 1);
+
+  if (icpt == 1 | icpt == 2) {
+    beta = mean(y);
+    AIC_best_orig = 2 + n * log(sum((beta - y) ^ 2) / n);
+  } else {
+    beta = 0.0;
+    AIC_best_orig = n * log(sum(y ^ 2) / n);
+  }
+  print("Best AIC without any features: " + AIC_best_orig);
+  boa_ncol = ncol(X_orig) + as.integer(icpt!=0);
+  beta_out_all = matrix(0, boa_ncol, m_orig);
+
+  # First pass to examine single features
+  AICs = matrix(0, 1, m_orig);
+  parfor (i in 1:m_orig, check = 0) {
+    [AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt, reg, tol, maxi, verbose);
+    AICs[1, i] = AIC_1;
+    beta_out_all[1:nrow(beta_out_i), i] = beta_out_i;
+  }
+  AIC_best = min(min(AICs), AIC_best_orig);
+  AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
+  column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), 0);
+
+  # beta best so far
+  beta_best = beta_out_all[, column_best];
+  if (column_best == 0) {
+    print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
+    B = matrix(0, m_orig, 1);
+    if (icpt != 0)
+      B = rbind(B, as.matrix(beta));
+    S = matrix(0, 1, 1);
+  }
+  else {
+
+    print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+
+    columns_fixed[1, column_best] = 1;
+    columns_fixed_ordered[1, 1] = column_best;
+    X_global = X_orig[, column_best];
 
     continue = TRUE
-    columns_fixed = matrix(0, 1, m_orig);
-    columns_fixed_ordered = matrix(0, 1, 1);
-
-    # X_global stores the best model found at each step
-    X_global = matrix(0, n, 1);
-
-    if (icpt == 1 | icpt == 2) {
-      beta = mean(y);
-      AIC_best = 2 + n * log(sum((beta - y) ^ 2) / n);
-    } else {
-      beta = 0.0;
-      AIC_best = n * log(sum(y ^ 2) / n);
-    }
-    AICs = matrix(AIC_best, 1, m_orig);
-    print("Best AIC without any features: " + AIC_best);
-    boa_ncol = ncol(X_orig);
-    if (icpt != 0) {
-      boa_ncol = boa_ncol + 1
-    }
-    beta_out_all = matrix(0, boa_ncol, m_orig * 1);
-    y_ncol = 1;
-    column_best = 0;
-
-    # First pass to examine single features
-    for (i in 1:m_orig, check = 0) {
-      columns_fixed_ordered_1 = as.matrix(i);
-      [AIC_1, beta_out_i] = linear_regression(X_orig[, i], y, icpt);
-
-      AICs[1, i] = AIC_1;
-      AIC_cur = as.scalar(AICs[1, i]);
-      if ((AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs(thr * AIC_best))) {
-        column_best = i;
-        AIC_best = AIC_cur;
-      }
-      beta_out_all[1:nrow(beta_out_i), ((i - 1) * y_ncol + 1):(i * y_ncol)] = beta_out_i[, 1];
-    }
-
-    # beta best so far
-    beta_best = beta_out_all[, (column_best - 1) * y_ncol + 1:column_best * y_ncol];
-    if (column_best == 0) {
-      print("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
-      Selected = matrix(0, 1, 1);
-      if (icpt == 0) {
-        B = matrix(beta, m_orig, 1);
-      } else {
-        B_tmp = matrix(0, m_orig + 1, 1);
-        B_tmp[m_orig + 1,] = beta;
-        B = B_tmp;
-      }
-      beta_out = B;
-      S = Selected;
-      C = beta_out;
-    } else {
-
-      print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
-
-      columns_fixed[1, column_best] = 1;
-      columns_fixed_ordered[1, 1] = column_best;
-      X_global = X_orig[, column_best];
-
-      while (continue) {
-        # Subsequent passes over the features
-        beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1);
-        for (i in 1:m_orig, check = 0) {
-          if (as.scalar(columns_fixed[1, i]) == 0) {
-            # Construct the feature matrix
-            X = cbind(X_global, X_orig[, i]);
-            tmp = as.matrix(0);
-            tmp[1, 1] = i;
-            columns_fixed_ordered_2 = append(columns_fixed_ordered, tmp);
-            [AIC_2, beta_out_i] = linear_regression(X, y, icpt);
-            beta_out_all_2[1:nrow(beta_out_i), (i - 1) * y_ncol + 1:i * y_ncol] = beta_out_i[, 1:1];
-            AICs[1, i] = AIC_2;
-            AIC_cur = as.scalar(AICs[1, i]);
-            if ((AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs(thr * AIC_best)) &
-              (as.scalar(columns_fixed[1, i]) == 0)) {
-              column_best = i;
-              AIC_best = as.scalar(AICs[1, i]);
-            }
-          }
-        }
+    while (continue) {
+      # Subsequent passes over the features
+      beta_out_all_2 = matrix(0, boa_ncol, m_orig * 1);
+      parfor (i in 1:m_orig, check = 0) {
+        if (as.scalar(columns_fixed[1, i]) == 0) {
+          # Construct the feature matrix
+          X = cbind(X_global, X_orig[, i]);
+          [AIC_2, beta_out_i] = linear_regression(X, y, icpt, reg, tol, maxi, verbose);
+          AICs[1, i] = AIC_2;
+          beta_out_all_2[1:nrow(beta_out_i), i] = beta_out_i;
         }
+        else {
+          AICs[1,i] = Inf;
+        }
+      }
 
-        # have the best beta store in the matrix
-        beta_best = beta_out_all_2[, (column_best - 1) * y_ncol + 1:column_best * y_ncol];
-
-        # Append best found features (i.e., columns) to X_global
-        if (as.scalar(columns_fixed[1, column_best]) == 0) {
-          # new best feature found
-          print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
-          columns_fixed[1, column_best] = 1;
-          columns_fixed_ordered = cbind(columns_fixed_ordered, as.matrix(column_best));
-          if (ncol(columns_fixed_ordered) == m_orig) {
-            # all features examined
-            X_global = cbind(X_global, X_orig[, column_best]);
-            continue = FALSE;
-          } else {
-            X_global = cbind(X_global, X_orig[, column_best]);
-          }
-        } else {
+      # Determine the best AIC
+      AIC_best_orig = AIC_best;
+      AIC_best = min(min(AICs), AIC_best_orig);
+      AIC_check = checkAIC(AIC_best, AIC_best_orig, thr);
+      column_best = ifelse(AIC_check, as.scalar(rowIndexMin(AICs)), column_best);
+
+      # have the best beta store in the matrix
+      beta_best = beta_out_all_2[, column_best];
+
+      # Append best found features (i.e., columns) to X_global
+      if (as.scalar(columns_fixed[1, column_best]) == 0) {
+        # new best feature found
+        print("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+        columns_fixed[1, column_best] = 1;
+        columns_fixed_ordered = cbind(columns_fixed_ordered, as.matrix(column_best));
+        if (ncol(columns_fixed_ordered) == m_orig) {
+          # all features examined
+          X_global = cbind(X_global, X_orig[, column_best]);
           continue = FALSE;
+        } else {
+          X_global = cbind(X_global, X_orig[, column_best]);
         }
+      } else {
+        continue = FALSE;
       }
-      # run linear regression with selected set of features
-      print("Running linear regression with selected features...");
-      [AIC, beta_out] = linear_regression(X_global, y, icpt);
-      Selected = columns_fixed_ordered;
-      if (icpt != 0) {
-        Selected = cbind(Selected, matrix(boa_ncol, 1, 1))
-      }
-      beta_out = reorder_matrix(boa_ncol, beta_out, Selected);
-      S = Selected;
-      C = beta_out;
     }
-  } else {
-    stop("Currently only forward selection strategy is supported!");
+    # run linear regression with selected set of features
+    print("Running linear regression with selected features...");
+    [AIC, beta_out] = linear_regression(X_global, y, icpt, reg, tol, maxi, verbose);
+    S = columns_fixed_ordered;
+    if (icpt != 0)
+      S = cbind(S, matrix(boa_ncol, 1, 1))
+    B = reorder_matrix(boa_ncol, beta_out, S);
   }
 }
 
 # Computes linear regression using lm and outputs AIC.
-linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt = 0)
-  return(Double AIC, Matrix[Double] beta) {
-  n = nrow(X);
-  m = ncol(X);
-
-  # Introduce the intercept, shift and rescale the columns of X if needed
-  if (icpt == 1 | icpt == 2) {
-    # add the intercept column
-    ones_n = matrix(1, n, 1);
-    X = cbind(X, ones_n);
-    m = m - 1;
-  }
-
-  m_ext = ncol(X);
-
+linear_regression = function(Matrix[Double] X, Matrix[Double] y, Integer icpt,
+  Double reg, Double tol, Integer maxi, Boolean verbose)
+  return(Double AIC, Matrix[Double] beta)
+{
   # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
-  beta = lm(X = X, y = y, verbose=FALSE);
-  m_ext = ncol(X);
+  beta = lm(X = X, y = y, icpt = icpt, reg=reg, tol=tol, maxi=maxi, verbose=FALSE);
+
+  # PREPARE X for SCORING
+  if( icpt != 0 )
+    X = cbind(X, matrix(1,nrow(X),1))
 
+  # COMPUTE AIC
+  n = nrow(X);
   y_residual = y - X %*% beta;
   ss_res = sum(y_residual ^ 2);
-  eq_deg_of_freedom = m_ext;
-  AIC = (2 * eq_deg_of_freedom) + n * log(ss_res / n);
-
-  # TODO IMP NOTE: with the fix in PR-22, we have not accounted for
-  # intercept=2 and # the code before # was not matching so we have removed it
-  # for now. Pl see the git revision history and diff to see the changes.
-  # in future we will have this feature. For now it is disabled
+  AIC = (2 * ncol(X)) + n * log(ss_res / n);
 }
 
 reorder_matrix = function(
@@ -250,3 +218,7 @@ reorder_matrix = function(
   P = table(seqS, S0, ncolX, ncolX);
   Y = t(P) %*% B;
 }
+
+checkAIC = function(Double AIC_cur, Double AIC_best, Double thr) return (Boolean R) {
+  R = (AIC_cur < AIC_best) & (AIC_best-AIC_cur > abs(thr * AIC_best))
+}
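
For completeness, a hypothetical invocation from a DML script showing the
parameters that now take effect (file names and values are illustrative
only, not part of the patch):

    X = read("X.csv");
    y = read("y.csv");
    # reg, tol, and maxi are now passed through to the inner lm calls,
    # and icpt=0/1/2 are all handled (previously only icpt=1 was tested)
    [B, S] = steplm(X=X, y=y, icpt=2, reg=1e-6, tol=1e-8, maxi=100, verbose=FALSE);
    write(B, "B.csv");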
