Repository: systemml
Updated Branches:
  refs/heads/master cf0877b43 -> 948943d17


[SYSTEMML-1647] Update StepLinearReg to work with MLContext

Address write statements and reordering of coefficients issues.

Closes #525.
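For context, the script's write statements are now gated by write_beta so that
results can also be retrieved in memory. Below is a minimal sketch of driving
the updated script through the Spark MLContext API; the input bindings,
variable names, and file path are illustrative assumptions, not part of this
commit:

    import org.apache.sysml.api.mlcontext._
    import org.apache.sysml.api.mlcontext.ScriptFactory._

    val ml = new MLContext(sc)  // sc: an existing SparkContext
    // Bind inputs and request outputs in memory instead of HDFS writes.
    val script = dmlFromFile("scripts/algorithms/StepLinearRegDS.dml")
      .in("X", xDf).in("y", yDf)              // assumed DataFrame inputs
      .in("$icpt", 1).in("$thr", 0.001)
      .out("beta_out").out("Selected")
    val results = ml.execute(script)
    val betas = results.getMatrix("beta_out") // coefficients, reordered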


Project: http://git-wip-us.apache.org/repos/asf/systemml/repo
Commit: http://git-wip-us.apache.org/repos/asf/systemml/commit/948943d1
Tree: http://git-wip-us.apache.org/repos/asf/systemml/tree/948943d1
Diff: http://git-wip-us.apache.org/repos/asf/systemml/diff/948943d1

Branch: refs/heads/master
Commit: 948943d17f5dd97d0678cc9bd36224b41b0b9d97
Parents: cf0877b
Author: Imran Younus <imranyou...@gmail.com>
Authored: Fri Jun 16 09:57:55 2017 -0700
Committer: Deron Eriksson <de...@us.ibm.com>
Committed: Fri Jun 16 09:57:55 2017 -0700

----------------------------------------------------------------------
 scripts/algorithms/StepLinearRegDS.dml | 630 ++++++++++++++--------------
 1 file changed, 325 insertions(+), 305 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/systemml/blob/948943d1/scripts/algorithms/StepLinearRegDS.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/StepLinearRegDS.dml b/scripts/algorithms/StepLinearRegDS.dml
index 00f50ee..05f4b26 100644
--- a/scripts/algorithms/StepLinearRegDS.dml
+++ b/scripts/algorithms/StepLinearRegDS.dml
@@ -25,20 +25,23 @@
 #
 # INPUT PARAMETERS:
 # --------------------------------------------------------------------------------------------
-# NAME    TYPE    DEFAULT    MEANING
+# NAME        TYPE    DEFAULT    MEANING
 # --------------------------------------------------------------------------------------------
-# X       String       ---      Location (on HDFS) to read the matrix X of feature vectors
-# Y       String       ---      Location (on HDFS) to read the 1-column matrix Y of response values
-# B       String       ---      Location to store estimated regression parameters (the betas)
-# S       String    ---      Location to write the selected features ordered as computed by the algorithm
-# O       String       " "      Location to write the printed statistics; by default is standard output
-# icpt    Int        0       Intercept presence, shifting and rescaling the columns of X:
-#                            0 = no intercept, no shifting, no rescaling;
-#                            1 = add intercept, but neither shift nor rescale X;
-#                            2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# thr     Double    0.01     Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
-#                            no further features are being checked and the algorithm stops
-# fmt     String   "text"    Matrix output format for B (the betas) only, usually "text" or "csv"
+# X           String     ---   Location (on HDFS) to read the matrix X of feature vectors
+# Y           String     ---   Location (on HDFS) to read the 1-column matrix Y of response values
+# B           String     ---   Location to store estimated regression parameters (the betas)
+# S           String     ---   Location to write the selected features ordered as computed by the algorithm
+# O           String     " "   Location to write the printed statistics; by default is standard output
+# icpt        Int        0     Intercept presence, shifting and rescaling the columns of X:
+#                              0 = no intercept, no shifting, no rescaling;
+#                              1 = add intercept, but neither shift nor rescale X;
+#                              2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# thr         Double    0.01   Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr
+#                              no further features are being checked and the algorithm stops
+# fmt         String   "text"  Matrix output format for B (the betas) only, usually "text" or "csv"
+# write_beta  Boolean   TRUE   Should the betas be returned?
+#                              0 = no
+#                              1 = yes
 # --------------------------------------------------------------------------------------------
 # OUTPUT: Matrix of regression parameters (the betas); its size depends on the icpt input value:
 #         OUTPUT SIZE:   OUTPUT CONTENTS:                HOW TO PREDICT Y FROM X AND B:
@@ -70,20 +73,22 @@
 #
 # HOW TO INVOKE THIS SCRIPT - EXAMPLE:
 # hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
-#     O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv
+#     O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv write_beta=TRUE
 
 fileX = $X;
 fileY = $Y;
 fileB = $B;
 fileS = $S;
 
+write_beta = ifdef($write_beta, TRUE);
+
 # currently only the forward selection strategy is supported: start from one feature and iteratively add
 # features until AIC improves
 dir = "forward";
 
 fmt  = ifdef ($fmt, "text");
-intercept_status = ifdef ($icpt, 0);   
-thr = ifdef ($thr, 0.01);  
+intercept_status = ifdef ($icpt, 1);
+thr = ifdef ($thr, 0.001);
 
 print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT");
 print ("Reading X and Y...");
@@ -95,295 +100,310 @@ m_orig = ncol (X_orig);
 
 # BEGIN STEPWISE LINEAR REGRESSION
 
-if (dir == "forward") {  
-       
-       continue = TRUE;
-       columns_fixed = matrix (0, rows = 1, cols = m_orig);
-       columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
-       
-       # X_global stores the best model found at each step 
-       X_global = matrix (0, rows = n, cols = 1);
-       
-       if (intercept_status == 1 | intercept_status == 2) {
-               beta = mean (y);
-               AIC_best = 2 + n * log(sum((beta - y)^2) / n);
-       } else {
-               beta = 0;
-               AIC_best = n * log(sum(y^2) / n);
-       }
-       AICs = matrix (AIC_best, rows = 1, cols = m_orig);
-       print ("Best AIC without any features: " + AIC_best);
-       
-       # First pass to examine single features
-       parfor (i in 1:m_orig) {        
-               [AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " ");
-               AICs[1,i] = AIC_1;
-       }
-
-       # Determine the best AIC 
-       column_best = 0;        
-       for (k in 1:m_orig) {
-               AIC_cur = as.scalar (AICs[1,k]);
-               if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
-                       column_best = k;
-                       AIC_best = as.scalar(AICs[1,k]);
-               }
-       }
-       
-       if (column_best == 0) {
-               print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!");
-               S = matrix (0, rows=1, cols=1);
-               if (intercept_status == 0) {
-                       B = matrix (beta, rows = m_orig, cols = 1);
-               } else {
-                       B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
-                       B_tmp[m_orig + 1,] = beta;
-                       B = B_tmp;
-               }
-               write (S, fileS, format=fmt);
-               write (B, fileB, format=fmt);
-               stop ("");
-       }
-       print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
-       columns_fixed[1,column_best] = 1;
-       columns_fixed_ordered[1,1] = column_best;
-       X_global = X_orig[,column_best];                
-               
-       while (continue) {
-               # Subsequent passes over the features
-               parfor (i in 1:m_orig) { 
-                       if (as.scalar(columns_fixed[1,i]) == 0) {       
-                       
-                               # Construct the feature matrix
-                               X = cbind (X_global, X_orig[,i]);
-                               
-                               [AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " ");
-                               AICs[1,i] = AIC_2;
-                       }       
-               }
-       
-               # Determine the best AIC
-               for (k in 1:m_orig) {
-                       AIC_cur = as.scalar (AICs[1,k]);
-                       if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
-                               column_best = k;
-                               AIC_best = as.scalar(AICs[1,k]);
-                       }
-               }
-                               
-               # cbind best found features (i.e., columns) to X_global
-               if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
-                       print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
-                       columns_fixed[1,column_best] = 1;
-                       columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best));
-                       if (ncol(columns_fixed_ordered) == m_orig) { # all features examined
-                               X_global = cbind (X_global, X_orig[,column_best]);
-                               continue = FALSE;
-                       } else {
-                               X_global = cbind (X_global, X_orig[,column_best]);
-                       }
-               } else {
-                       continue = FALSE;
-               }
-       }
-       
-       # run linear regression with selected set of features
-       print ("Running linear regression with selected features...");
-       [AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB);
-       
+if (dir == "forward") {
+  continue = TRUE;
+  columns_fixed = matrix (0, rows = 1, cols = m_orig);
+  columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
+
+  # X_global stores the best model found at each step
+  X_global = matrix (0, rows = n, cols = 1);
+
+  if (intercept_status == 1 | intercept_status == 2) {
+    beta = mean (y);
+    AIC_best = 2 + n * log(sum((beta - y)^2) / n);
+  } else {
+    beta = 0.0;
+    AIC_best = n * log(sum(y^2) / n);
+  }
+
+  AICs = matrix (AIC_best, rows = 1, cols = m_orig);
+  print ("Best AIC without any features: " + AIC_best);
+
+  boa_ncol = ncol(X_orig);
+  if (intercept_status != 0) {
+    boa_ncol = boa_ncol + 1;
+  }
+
+  beta_out_all = matrix(0, rows = boa_ncol, cols = m_orig * 1);
+
+  y_ncol = 1;
+
+  # First pass to examine single features
+  parfor (i in 1:m_orig, check = 0) {
+    columns_fixed_ordered_1 = matrix(i, rows=1, cols=1);
+
+    [AIC_1, beta_out_i] = linear_regression (X_orig[, i], y, m_orig, columns_fixed_ordered_1,
+                                             write_beta, 0);
+
+    AICs[1, i] = AIC_1;
+
+    beta_out_all[, (i - 1) * y_ncol + 1 : i * y_ncol] = beta_out_i[, 1:1];
+
+  }
+
+  # Determine the best AIC
+  column_best = 0;
+  for (k in 1:m_orig) {
+    AIC_cur = as.scalar (AICs[1, k]);
+    if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
+      column_best = k;
+      AIC_best = as.scalar(AICs[1, k]);
+    }
+  }
+
+  if (column_best == 0) {
+    print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " +
+           (thr * 100) + "% decrease in AIC!");
+    Selected = matrix (0, rows = 1, cols = 1);
+    if (intercept_status == 0) {
+      B = matrix (beta, rows = m_orig, cols = 1);
+    } else {
+      B_tmp = matrix (0, rows = m_orig + 1, cols = 1);
+      B_tmp[m_orig + 1, ] = beta;
+      B = B_tmp;
+    }
+
+    beta_out = B;
+
+    write(Selected, fileS, format=fmt);
+    write(beta_out, fileB, format=fmt);
+
+    stop ("");
+  }
+
+  # best beta so far (indexed only after the empty-model check above, since
+  # column_best = 0 would make this indexing invalid)
+  beta_best = beta_out_all[, (column_best - 1) * y_ncol + 1 : column_best * y_ncol];
+  print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+  columns_fixed[1, column_best] = 1;
+  columns_fixed_ordered[1, 1] = column_best;
+  X_global = X_orig[, column_best];
+
+  while (continue) {
+    # Subsequent passes over the features
+    beta_out_all_2 = matrix(0, rows = boa_ncol, cols = m_orig * 1);
+
+    parfor (i in 1:m_orig, check = 0) {
+      if (as.scalar(columns_fixed[1, i]) == 0) {
+
+        # Construct the feature matrix
+        X = cbind (X_global, X_orig[, i]);
+
+        tmp = matrix(0, rows=1, cols=1);
+        tmp[1, 1] = i;
+        columns_fixed_ordered_2 = append(columns_fixed_ordered, tmp);
+        [AIC_2, beta_out_i] = linear_regression (X, y, m_orig, columns_fixed_ordered_2, write_beta, 0);
+        beta_out_all_2[, (i - 1) * y_ncol + 1 : i * y_ncol] = beta_out_i[, 1:1];
+
+        AICs[1, i] = AIC_2;
+      }
+    }
+
+    # Determine the best AIC
+    for (k in 1:m_orig) {
+      AIC_cur = as.scalar (AICs[1, k]);
+      if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) &
+            (as.scalar(columns_fixed[1, k]) == 0) ) {
+        column_best = k;
+        AIC_best = as.scalar(AICs[1, k]);
+      }
+    }
+
+    # store the best beta found so far
+    beta_best = beta_out_all_2[, (column_best - 1) * y_ncol + 1 : column_best * y_ncol];
+
+    # Append best found features (i.e., columns) to X_global
+    if (as.scalar(columns_fixed[1, column_best]) == 0) { # new best feature found
+      print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+      columns_fixed[1, column_best] = 1;
+      columns_fixed_ordered = cbind (columns_fixed_ordered, as.matrix(column_best));
+
+      if (ncol(columns_fixed_ordered) == m_orig) { # all features examined
+        X_global = cbind (X_global, X_orig[, column_best]);
+        continue = FALSE;
+      } else {
+        X_global = cbind (X_global, X_orig[, column_best]);
+      }
+    } else {
+      continue = FALSE;
+    }
+
+  }
+
+  # run linear regression with selected set of features
+  print ("Running linear regression with selected features...");
+  [AIC, beta_out] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, write_beta, 1);
+
+  Selected = columns_fixed_ordered;
+  if (intercept_status != 0) {
+    Selected = cbind(Selected, matrix(boa_ncol, rows=1, cols=1));
+  }
+
+  beta_out = reorder_matrix(boa_ncol, beta_out, Selected);
+
+  write(Selected, fileS, format=fmt);
+  write(beta_out, fileB, format=fmt);
+
 } else {
-       stop ("Currently only forward selection strategy is supported!");
-} 
-
-
-/*
-* Computes linear regression using a direct solver for (X^T X) beta = X^T y.
-* It also outputs the AIC of the computed model.  
-*/
-linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
-               
-       intercept_status = ifdef ($icpt, 0);
-       fmt  = ifdef ($fmt, "text");                    
-       n = nrow (X);   
-       m = ncol (X);
-       
-       # Introduce the intercept, shift and rescale the columns of X if needed
-       if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
-               ones_n = matrix (1, rows = n, cols = 1);
-               X = cbind (X, ones_n);
-               m = m - 1;
-       }
-       m_ext = ncol(X);
-       
-       if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
-                                                            # Important assumption: X [, m_ext] = ones_n
-               avg_X_cols = t(colSums(X)) / n;
-               var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
-               is_unsafe = (var_X_cols <= 0);
-               scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
-               scale_X [m_ext, 1] = 1;
-               shift_X = - avg_X_cols * scale_X;
-               shift_X [m_ext, 1] = 0;
-       } else {
-               scale_X = matrix (1, rows = m_ext, cols = 1);
-               shift_X = matrix (0, rows = m_ext, cols = 1);
-       }
-
-       # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
-
-       A = t(X) %*% X;
-       b = t(X) %*% y;
-       if (intercept_status == 2) {
-               A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
-               A =   diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
-               b =   diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
-       }
-
-       beta_unscaled = solve (A, b);
-       
-       # END THE DIRECT SOLVE ALGORITHM
-
-       if (intercept_status == 2) {
-               beta = scale_X * beta_unscaled;
-               beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
-       } else {
-               beta = beta_unscaled;
-       }
-       
-       # COMPUTE AIC
-       y_residual = y - X %*% beta;
-       ss_res = sum (y_residual ^ 2);
-       eq_deg_of_freedom = m_ext;
-       AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
-       
-       if (fileB != " ") {
-       
-               fileO = ifdef ($O, " ");
-               fileS = $S;
-               
-               print ("Computing the statistics...");
-               avg_tot = sum (y) / n;
-               ss_tot = sum (y ^ 2);
-               ss_avg_tot = ss_tot - n * avg_tot ^ 2;
-               var_tot = ss_avg_tot / (n - 1);
-#              y_residual = y - X %*% beta;
-               avg_res = sum (y_residual) / n;
-#              ss_res = sum (y_residual ^ 2);
-               ss_avg_res = ss_res - n * avg_res ^ 2;
-
-               R2 = 1 - ss_res / ss_avg_tot;
-               if (n > m_ext) {
-                       dispersion  = ss_res / (n - m_ext);
-                       adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
-               } else {
-                       dispersion  = 0.0 / 0.0;
-                       adjusted_R2 = 0.0 / 0.0;
-               }
-
-               R2_nobias = 1 - ss_avg_res / ss_avg_tot;
-               deg_freedom = n - m - 1;
-               if (deg_freedom > 0) {
-                       var_res = ss_avg_res / deg_freedom;
-                       adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
-               } else {
-                       var_res = 0.0 / 0.0;
-                       adjusted_R2_nobias = 0.0 / 0.0;
-                       print ("Warning: zero or negative number of degrees of freedom.");
-               }
-
-               R2_vs_0 = 1 - ss_res / ss_tot;
-               if (n > m) {
-                       adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
-               } else {
-                       adjusted_R2_vs_0 = 0.0 / 0.0;
-               }
-
-               str = "AVG_TOT_Y," + avg_tot;                                     #  Average of the response value Y
-               str = append (str, "STDEV_TOT_Y," + sqrt (var_tot));              #  Standard Deviation of the response value Y
-               str = append (str, "AVG_RES_Y," + avg_res);                       #  Average of the residual Y - pred(Y|X), i.e. residual bias
-               str = append (str, "STDEV_RES_Y," + sqrt (var_res));              #  Standard Deviation of the residual Y - pred(Y|X)
-               str = append (str, "DISPERSION," + dispersion);                   #  GLM-style dispersion, i.e. residual sum of squares / # d.f.
-               str = append (str, "R2," + R2);                                   #  R^2 of residual with bias included vs. total average
-               str = append (str, "ADJUSTED_R2," + adjusted_R2);                 #  Adjusted R^2 of residual with bias included vs. total average
-               str = append (str, "R2_NOBIAS," + R2_nobias);                     #  R^2 of residual with bias subtracted vs. total average
-               str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias);   #  Adjusted R^2 of residual with bias subtracted vs. total average
-               if (intercept_status == 0) {
-                       str = append (str, "R2_VS_0," + R2_vs_0);                    #  R^2 of residual with bias included vs. zero constant
-                       str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0);  #  Adjusted R^2 of residual with bias included vs. zero constant
-               }
-
-               if (fileO != " ") {
-                       write (str, fileO);
-               } else {
-                       print (str);
-               }
-
-               # Prepare the output matrix
-               print ("Writing the output matrix...");
-               if (intercept_status == 2) {
-                       beta_out = cbind (beta, beta_unscaled);
-               } else {
-                       beta_out = beta;
-               }
-               
-               # Output which features give the best AIC and are being used for linear regression
-               write (Selected, fileS, format=fmt);
-               
-               no_selected = ncol (Selected);
-               max_selected = max (Selected);
-               last = max_selected + 1;        
-               
-               if (intercept_status != 0) {
-               
-                       Selected_ext = cbind (Selected, as.matrix (last));
-                       P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext));
-
-                       if (intercept_status == 2) {
-                       
-                               P1_beta = P1 * beta;
-                               P2_beta = colSums (P1_beta);
-                               P1_beta_unscaled = P1 * beta_unscaled;
-                               P2_beta_unscaled = colSums(P1_beta_unscaled);
-                               
-                               if (max_selected < m_orig) {
-                                       P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
-                                       P2_beta_unscaled = cbind (P2_beta_unscaled, matrix (0, rows=1, cols=(m_orig - max_selected)));
-
-                                       P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1];
-                                       P2_beta[1, max_selected + 1] = 0;
-
-                                       P2_beta_unscaled[1, m_orig+1] = P2_beta_unscaled[1, max_selected + 1];
-                                       P2_beta_unscaled[1, max_selected + 1] = 0;
-                               }
-                               beta_out = cbind (t(P2_beta), t(P2_beta_unscaled));
-                               
-                       } else {
-                       
-                               P1_beta = P1 * beta;
-                               P2_beta = colSums (P1_beta);
-                               
-                               if (max_selected < m_orig) {
-                                       P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
-                                       P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1];
-                                       P2_beta[1, max_selected + 1] = 0;
-                               }
-                               beta_out = t(P2_beta);
-                               
-                       }
-               } else {
-               
-                       P1 = table (seq (1, no_selected), t(Selected)); 
-                       P1_beta = P1 * beta;
-                       P2_beta = colSums (P1_beta);    
-
-                       if (max_selected < m_orig) {
-                               P2_beta = cbind (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected)));
-                       }               
-
-                       beta_out = t(P2_beta);  
-               }
-               
-               write ( beta_out, fileB, format=fmt );          
-       }
+  stop ("Currently only forward selection strategy is supported!");
 }
 
+# Computes linear regression using a direct solver for (X^T X) beta = X^T y.
+# It also outputs the AIC of the computed model.
+
+linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig,
+  Matrix[Double] Selected, Boolean write_beta, Boolean writeStats)
+  return (Double AIC, Matrix[Double] beta) {
+
+    intercept_status = ifdef ($icpt, 0);
+    fmt = ifdef ($fmt, "text");
+    n = nrow (X);
+    m = ncol (X);
+
+    # Introduce the intercept, shift and rescale the columns of X if needed
+    if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
+      ones_n = matrix (1, rows = n, cols = 1);
+      X = cbind (X, ones_n);
+      m = m - 1;
+    }
+
+    m_ext = ncol(X);
+
+    if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1
+      # Important assumption: X [, m_ext] = ones_n
+      avg_X_cols = t(colSums(X)) / n;
+      var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1);
+      is_unsafe = (var_X_cols <= 0);
+      scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+      scale_X [m_ext, 1] = 1;
+      shift_X = - avg_X_cols * scale_X;
+      shift_X [m_ext, 1] = 0;
+    } else {
+      scale_X = matrix (1, rows = m_ext, cols = 1);
+      shift_X = matrix (0, rows = m_ext, cols = 1);
+    }
+
+    # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL)
+
+    A = t(X) %*% X;
+    b = t(X) %*% y;
+    if (intercept_status == 2) {
+      A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]);
+      A =   diag (scale_X) %*% A + shift_X %*% A [m_ext, ];
+      b =   diag (scale_X) %*% b + shift_X %*% b [m_ext, ];
+    }
+
+    beta_unscaled = solve (A, b);
+
+    # END THE DIRECT SOLVE ALGORITHM
+
+    if (intercept_status == 2) {
+      beta = scale_X * beta_unscaled;
+      beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled;
+    } else {
+      beta = beta_unscaled;
+    }
+
+    # COMPUTE AIC
+    y_residual = y - X %*% beta;
+    ss_res = sum (y_residual ^ 2);
+    eq_deg_of_freedom = m_ext;
+    AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n);
+
+    if(write_beta == 1) {
+      fileO = ifdef ($O, " ");
+      fileS = $S;
+
+      print ("Computing the statistics...");
+      avg_tot = sum (y) / n;
+      ss_tot = sum (y ^ 2);
+      ss_avg_tot = ss_tot - n * avg_tot ^ 2;
+      var_tot = ss_avg_tot / (n - 1);
+      # y_residual = y - X %*% beta;
+      avg_res = sum (y_residual) / n;
+      # ss_res = sum (y_residual ^ 2);
+      ss_avg_res = ss_res - n * avg_res ^ 2;
+
+      R2 = 1 - ss_res / ss_avg_tot;
+      if (n > m_ext) {
+        dispersion  = ss_res / (n - m_ext);
+        adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1));
+      } else {
+        dispersion  = 0.0 / 0.0;
+        adjusted_R2 = 0.0 / 0.0;
+      }
+
+      R2_nobias = 1 - ss_avg_res / ss_avg_tot;
+      deg_freedom = n - m - 1;
+      if (deg_freedom > 0) {
+        var_res = ss_avg_res / deg_freedom;
+        adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1));
+      } else {
+        var_res = 0.0 / 0.0;
+        adjusted_R2_nobias = 0.0 / 0.0;
+        print ("Warning: zero or negative number of degrees of freedom.");
+      }
+
+      R2_vs_0 = 1 - ss_res / ss_tot;
+      if (n > m) {
+        adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n);
+      } else {
+        adjusted_R2_vs_0 = 0.0 / 0.0;
+      }
+
+      str = "AVG_TOT_Y," + avg_tot;                                    #  
Average of the response value Y
+      str = append (str, "STDEV_TOT_Y," + sqrt (var_tot));             #  
Standard Deviation of the response value Y
+      str = append (str, "AVG_RES_Y," + avg_res);                      #  
Average of the residual Y - pred(Y|X), i.e. residual bias
+      str = append (str, "STDEV_RES_Y," + sqrt (var_res));             #  
Standard Deviation of the residual Y - pred(Y|X)
+      str = append (str, "DISPERSION," + dispersion);                  #  
GLM-style dispersion, i.e. residual sum of squares / # d.f.
+      str = append (str, "R2," + R2);                                  #  R^2 
of residual with bias included vs. total average
+      str = append (str, "ADJUSTED_R2," + adjusted_R2);                #  
Adjusted R^2 of residual with bias included vs. total average
+      str = append (str, "R2_NOBIAS," + R2_nobias);                    #  R^2 
of residual with bias subtracted vs. total average
+      str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias);  #  
Adjusted R^2 of residual with bias subtracted vs. total average
+      if (intercept_status == 0) {
+        str = append (str, "R2_VS_0," + R2_vs_0);                      #  R^2 
of residual with bias included vs. zero constant
+        str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0);    #  
Adjusted R^2 of residual with bias included vs. zero constant
+      }
+
+      if (fileO != " " & writeStats != 0) {
+        write(str, fileO);
+      } else {
+        print (str);
+        print ("");
+      }
+
+      # TODO: with the fix in PR-22, the intercept=2 case has not been
+      # accounted for; the previous code for it did not match, so it has
+      # been removed for now. See the git revision history and diff for the
+      # changes. This feature is disabled until it is added back.
+    }
+  }
+
+
+reorder_matrix = function(
+  double ncolX, # number of columns in X, including the intercept column
+  matrix[double] B, # beta
+  matrix[double] S  # Selected
+) return (matrix[double] Y) {
+  # This function assumes that B and S have the same number of elements.
+  # If the intercept is included in the model, all inputs should be adjusted
+  # appropriately before calling this function.
+
+  S = t(S);
+  num_empty_B = ncolX - nrow(B);
+  if (num_empty_B < 0) {
+    stop("Error: unable to re-order the matrix. Reason: B more than matrix X");
+  }
+
+  if (num_empty_B > 0) {
+    pad_zeros = matrix(0, rows = num_empty_B, cols=1);
+    B = rbind(B, pad_zeros);
+    S = rbind(S, pad_zeros);
+  }
+
+  # table() does not accept zeros as indices, so map zeros to an out-of-range value.
+  S0 = replace(target = S, pattern = 0, replacement = ncolX+1);
+  seqS = seq(1, nrow(S0));
+  P = table(seqS, S0, ncolX, ncolX);
+
+  Y = t(P) %*% B;
+}
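As an aside, reorder_matrix scatters the betas, which linear_regression
returns in selection order, back to their original column positions via a
permutation matrix built with table(). A tiny standalone sketch of the same
reordering idea (toy values; plain Scala arrays instead of the DML
permutation-table trick):

    // Toy example: X has 4 columns; features 3 and 1 were selected, in order.
    val ncolX = 4
    val b = Array(0.7, 0.3)   // betas in the order the features were selected
    val s = Array(3, 1)       // selected 1-based column indices
    val y = new Array[Double](ncolX)
    for (i <- b.indices) y(s(i) - 1) = b(i)
    // y == Array(0.3, 0.0, 0.7, 0.0): each beta lands at its original column.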
