http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/StepLinearRegDS.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/StepLinearRegDS.dml b/scripts/algorithms/StepLinearRegDS.dml index afd94ed..953402f 100644 --- a/scripts/algorithms/StepLinearRegDS.dml +++ b/scripts/algorithms/StepLinearRegDS.dml @@ -1,388 +1,388 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGIRITHM USING AIC -# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y -# -# INPUT PARAMETERS: -# -------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# -------------------------------------------------------------------------------------------- -# X String --- Location (on HDFS) to read the matrix X of feature vectors -# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values -# B String --- Location to store estimated regression parameters (the betas) -# S String --- Location to write the selected features ordered as computed by the algorithm -# O String " " Location to write the printed statistics; by default is standard output -# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: -# 0 = no intercept, no shifting, no rescaling; -# 1 = add intercept, but neither shift nor rescale X; -# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr -# no further features are being checked and the algorithm stops -# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" -# -------------------------------------------------------------------------------------------- -# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value: -# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: -# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B -# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] -# Col.2: betas for shifted/rescaled X and intercept -# -# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated -# name-value pair per each line, as follows: -# -# NAME MEANING -# ------------------------------------------------------------------------------------- -# AVG_TOT_Y Average of the response value Y -# STDEV_TOT_Y Standard Deviation of the response value Y -# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias -# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) -# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. -# PLAIN_R2 Plain R^2 of residual with bias included vs. total average -# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average -# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average -# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average -# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant -# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant -# ------------------------------------------------------------------------------------- -# * The last two statistics are only printed if there is no intercept (icpt=0) -# If the best AIC is achieved without any features the matrix of selected features contains 0. -# Moreover, in this case no further statistics will be produced -# -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas -# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv - -fileX = $X; -fileY = $Y; -fileB = $B; -fileS = $S; - -# currently only the forward selection strategy in supported: start from one feature and iteratively add -# features until AIC improves -dir = "forward"; - -fmt = ifdef ($fmt, "text"); -intercept_status = ifdef ($icpt, 0); -thr = ifdef ($thr, 0.01); - -print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT"); -print ("Reading X and Y..."); -X_orig = read (fileX); -y = read (fileY); - -n = nrow (X_orig); -m_orig = ncol (X_orig); - -# BEGIN STEPWISE LINEAR REGRESSION - -if (dir == "forward") { - - continue = TRUE; - columns_fixed = matrix (0, rows = 1, cols = m_orig); - columns_fixed_ordered = matrix (0, rows = 1, cols = 1); - - # X_global stores the best model found at each step - X_global = matrix (0, rows = n, cols = 1); - - if (intercept_status == 1 | intercept_status == 2) { - beta = mean (y); - AIC_best = 2 + n * log(sum((beta - y)^2) / n); - } else { - beta = 0; - AIC_best = n * log(sum(y^2) / n); - } - AICs = matrix (AIC_best, rows = 1, cols = m_orig); - print ("Best AIC without any features: " + AIC_best); - - # First pass to examine single features - parfor (i in 1:m_orig) { - [AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " "); - AICs[1,i] = AIC_1; - } - - # Determine the best AIC - column_best = 0; - for (k in 1:m_orig) { - AIC_cur = as.scalar (AICs[1,k]); - if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) { - column_best = k; - AIC_best = as.scalar(AICs[1,k]); - } - } - - if (column_best == 0) { - print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); - S = matrix (0, rows=1, cols=1); - if (intercept_status == 0) { - B = matrix (beta, rows = m_orig, cols = 1); - } else { - B_tmp = matrix (0, rows = m_orig + 1, cols = 1); - B_tmp[m_orig + 1,] = beta; - B = B_tmp; - } - write (S, fileS, format=fmt); - write (B, fileB, format=fmt); - stop (""); - } - print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); - columns_fixed[1,column_best] = 1; - columns_fixed_ordered[1,1] = column_best; - X_global = X_orig[,column_best]; - - while (continue) { - # Subsequent passes over the features - parfor (i in 1:m_orig) { - if (as.scalar(columns_fixed[1,i]) == 0) { - - # Construct the feature matrix - X = append (X_global, X_orig[,i]); - - [AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " "); - AICs[1,i] = AIC_2; - } - } - - # Determine the best AIC - for (k in 1:m_orig) { - AIC_cur = as.scalar (AICs[1,k]); - if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) { - column_best = k; - AIC_best = as.scalar(AICs[1,k]); - } - } - - # Append best found features (i.e., columns) to X_global - if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found - print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); - columns_fixed[1,column_best] = 1; - columns_fixed_ordered = append (columns_fixed_ordered, as.matrix(column_best)); - if (ncol(columns_fixed_ordered) == m_orig) { # all features examined - X_global = append (X_global, X_orig[,column_best]); - continue = FALSE; - } else { - X_global = append (X_global, X_orig[,column_best]); - } - } else { - continue = FALSE; - } - } - - # run linear regression with selected set of features - print ("Running linear regression with selected features..."); - [AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB); - -} else { - stop ("Currently only forward selection strategy is supported!"); -} - - -/* -* Computes linear regression using a direct solver for (X^T X) beta = X^T y. -* It also outputs the AIC of the computed model. -*/ -linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) { - - intercept_status = ifdef ($icpt, 0); - n = nrow (X); - m = ncol (X); - - # Introduce the intercept, shift and rescale the columns of X if needed - if (intercept_status == 1 | intercept_status == 2) { # add the intercept column - ones_n = matrix (1, rows = n, cols = 1); - X = append (X, ones_n); - m = m - 1; - } - m_ext = ncol(X); - - if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1 - # Important assumption: X [, m_ext] = ones_n - avg_X_cols = t(colSums(X)) / n; - var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); - is_unsafe = ppred (var_X_cols, 0.0, "<="); - scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); - scale_X [m_ext, 1] = 1; - shift_X = - avg_X_cols * scale_X; - shift_X [m_ext, 1] = 0; - } else { - scale_X = matrix (1, rows = m_ext, cols = 1); - shift_X = matrix (0, rows = m_ext, cols = 1); - } - - # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL) - - A = t(X) %*% X; - b = t(X) %*% y; - if (intercept_status == 2) { - A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]); - A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; - b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ]; - } - - beta_unscaled = solve (A, b); - - # END THE DIRECT SOLVE ALGORITHM - - if (intercept_status == 2) { - beta = scale_X * beta_unscaled; - beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; - } else { - beta = beta_unscaled; - } - - # COMPUTE AIC - y_residual = y - X %*% beta; - ss_res = sum (y_residual ^ 2); - eq_deg_of_freedom = m_ext; - AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n); - - if (fileB != " ") { - - fileO = ifdef ($O, " "); - fileS = $S; - - print ("Computing the statistics..."); - avg_tot = sum (y) / n; - ss_tot = sum (y ^ 2); - ss_avg_tot = ss_tot - n * avg_tot ^ 2; - var_tot = ss_avg_tot / (n - 1); -# y_residual = y - X %*% beta; - avg_res = sum (y_residual) / n; -# ss_res = sum (y_residual ^ 2); - ss_avg_res = ss_res - n * avg_res ^ 2; - - plain_R2 = 1 - ss_res / ss_avg_tot; - if (n > m_ext) { - dispersion = ss_res / (n - m_ext); - adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)); - } else { - dispersion = 0.0 / 0.0; - adjusted_R2 = 0.0 / 0.0; - } - - plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot; - deg_freedom = n - m - 1; - if (deg_freedom > 0) { - var_res = ss_avg_res / deg_freedom; - adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); - } else { - var_res = 0.0 / 0.0; - adjusted_R2_nobias = 0.0 / 0.0; - print ("Warning: zero or negative number of degrees of freedom."); - } - - plain_R2_vs_0 = 1 - ss_res / ss_tot; - if (n > m) { - adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n); - } else { - adjusted_R2_vs_0 = 0.0 / 0.0; - } - - str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y - str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y - str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias - str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) - str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. - str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average - str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average - str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average - str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average - if (intercept_status == 0) { - str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant - str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant - } - - if (fileO != " ") { - write (str, fileO); - } else { - print (str); - } - - # Prepare the output matrix - print ("Writing the output matrix..."); - if (intercept_status == 2) { - beta_out = append (beta, beta_unscaled); - } else { - beta_out = beta; - } - - # Output which features give the best AIC and are being used for linear regression - write (Selected, fileS, format=fmt); - - no_selected = ncol (Selected); - max_selected = max (Selected); - last = max_selected + 1; - - if (intercept_status != 0) { - - Selected_ext = append (Selected, as.matrix (last)); - P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); - - if (intercept_status == 2) { - - P1_beta = P1 * beta; - P2_beta = colSums (P1_beta); - P1_beta_unscaled = P1 * beta_unscaled; - P2_beta_unscaled = colSums(P1_beta_unscaled); - - if (max_selected < m_orig) { - P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected))); - P2_beta_unscaled = append (P2_beta_unscaled, matrix (0, rows=1, cols=(m_orig - max_selected))); - - P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1]; - P2_beta[1, max_selected + 1] = 0; - - P2_beta_unscaled[1, m_orig+1] = P2_beta_unscaled[1, max_selected + 1]; - P2_beta_unscaled[1, max_selected + 1] = 0; - } - beta_out = append (t(P2_beta), t(P2_beta_unscaled)); - - } else { - - P1_beta = P1 * beta; - P2_beta = colSums (P1_beta); - - if (max_selected < m_orig) { - P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected))); - P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1] ; - P2_beta[1, max_selected + 1] = 0; - } - beta_out = t(P2_beta); - - } - } else { - - P1 = table (seq (1, no_selected), t(Selected)); - P1_beta = P1 * beta; - P2_beta = colSums (P1_beta); - - if (max_selected < m_orig) { - P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected))); - } - - beta_out = t(P2_beta); - } - - write ( beta_out, fileB, format=fmt ); - } -} - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT CHOOSES A LINEAR MODEL IN A STEPWISE ALGIRITHM USING AIC +# EACH LINEAR REGRESSION USES A DIRECT SOLVER FOR (X^T X) beta = X^T y +# +# INPUT PARAMETERS: +# -------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# -------------------------------------------------------------------------------------------- +# X String --- Location (on HDFS) to read the matrix X of feature vectors +# Y String --- Location (on HDFS) to read the 1-column matrix Y of response values +# B String --- Location to store estimated regression parameters (the betas) +# S String --- Location to write the selected features ordered as computed by the algorithm +# O String " " Location to write the printed statistics; by default is standard output +# icpt Int 0 Intercept presence, shifting and rescaling the columns of X: +# 0 = no intercept, no shifting, no rescaling; +# 1 = add intercept, but neither shift nor rescale X; +# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 +# thr Double 0.01 Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr +# no further features are being checked and the algorithm stops +# fmt String "text" Matrix output format for B (the betas) only, usually "text" or "csv" +# -------------------------------------------------------------------------------------------- +# OUTPUT: Matrix of regression parameters (the betas) and its size depend on icpt input value: +# OUTPUT SIZE: OUTPUT CONTENTS: HOW TO PREDICT Y FROM X AND B: +# icpt=0: ncol(X) x 1 Betas for X only Y ~ X %*% B[1:ncol(X), 1], or just X %*% B +# icpt=1: ncol(X)+1 x 1 Betas for X and intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] +# icpt=2: ncol(X)+1 x 2 Col.1: betas for X & intercept Y ~ X %*% B[1:ncol(X), 1] + B[ncol(X)+1, 1] +# Col.2: betas for shifted/rescaled X and intercept +# +# In addition, in the last run of linear regression some statistics are provided in CSV format, one comma-separated +# name-value pair per each line, as follows: +# +# NAME MEANING +# ------------------------------------------------------------------------------------- +# AVG_TOT_Y Average of the response value Y +# STDEV_TOT_Y Standard Deviation of the response value Y +# AVG_RES_Y Average of the residual Y - pred(Y|X), i.e. residual bias +# STDEV_RES_Y Standard Deviation of the residual Y - pred(Y|X) +# DISPERSION GLM-style dispersion, i.e. residual sum of squares / # deg. fr. +# PLAIN_R2 Plain R^2 of residual with bias included vs. total average +# ADJUSTED_R2 Adjusted R^2 of residual with bias included vs. total average +# PLAIN_R2_NOBIAS Plain R^2 of residual with bias subtracted vs. total average +# ADJUSTED_R2_NOBIAS Adjusted R^2 of residual with bias subtracted vs. total average +# PLAIN_R2_VS_0 * Plain R^2 of residual with bias included vs. zero constant +# ADJUSTED_R2_VS_0 * Adjusted R^2 of residual with bias included vs. zero constant +# ------------------------------------------------------------------------------------- +# * The last two statistics are only printed if there is no intercept (icpt=0) +# If the best AIC is achieved without any features the matrix of selected features contains 0. +# Moreover, in this case no further statistics will be produced +# +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f StepLinearRegDS.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas +# O=OUTPUT_DIR/stats S=OUTPUT_DIR/selected icpt=2 thr=0.01 fmt=csv + +fileX = $X; +fileY = $Y; +fileB = $B; +fileS = $S; + +# currently only the forward selection strategy in supported: start from one feature and iteratively add +# features until AIC improves +dir = "forward"; + +fmt = ifdef ($fmt, "text"); +intercept_status = ifdef ($icpt, 0); +thr = ifdef ($thr, 0.01); + +print ("BEGIN STEPWISE LINEAR REGRESSION SCRIPT"); +print ("Reading X and Y..."); +X_orig = read (fileX); +y = read (fileY); + +n = nrow (X_orig); +m_orig = ncol (X_orig); + +# BEGIN STEPWISE LINEAR REGRESSION + +if (dir == "forward") { + + continue = TRUE; + columns_fixed = matrix (0, rows = 1, cols = m_orig); + columns_fixed_ordered = matrix (0, rows = 1, cols = 1); + + # X_global stores the best model found at each step + X_global = matrix (0, rows = n, cols = 1); + + if (intercept_status == 1 | intercept_status == 2) { + beta = mean (y); + AIC_best = 2 + n * log(sum((beta - y)^2) / n); + } else { + beta = 0; + AIC_best = n * log(sum(y^2) / n); + } + AICs = matrix (AIC_best, rows = 1, cols = m_orig); + print ("Best AIC without any features: " + AIC_best); + + # First pass to examine single features + parfor (i in 1:m_orig) { + [AIC_1] = linear_regression (X_orig[,i], y, m_orig, columns_fixed_ordered, " "); + AICs[1,i] = AIC_1; + } + + # Determine the best AIC + column_best = 0; + for (k in 1:m_orig) { + AIC_cur = as.scalar (AICs[1,k]); + if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) { + column_best = k; + AIC_best = as.scalar(AICs[1,k]); + } + } + + if (column_best == 0) { + print ("AIC of an empty model is " + AIC_best + " and adding no feature achieves more than " + (thr * 100) + "% decrease in AIC!"); + S = matrix (0, rows=1, cols=1); + if (intercept_status == 0) { + B = matrix (beta, rows = m_orig, cols = 1); + } else { + B_tmp = matrix (0, rows = m_orig + 1, cols = 1); + B_tmp[m_orig + 1,] = beta; + B = B_tmp; + } + write (S, fileS, format=fmt); + write (B, fileB, format=fmt); + stop (""); + } + print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); + columns_fixed[1,column_best] = 1; + columns_fixed_ordered[1,1] = column_best; + X_global = X_orig[,column_best]; + + while (continue) { + # Subsequent passes over the features + parfor (i in 1:m_orig) { + if (as.scalar(columns_fixed[1,i]) == 0) { + + # Construct the feature matrix + X = append (X_global, X_orig[,i]); + + [AIC_2] = linear_regression (X, y, m_orig, columns_fixed_ordered, " "); + AICs[1,i] = AIC_2; + } + } + + # Determine the best AIC + for (k in 1:m_orig) { + AIC_cur = as.scalar (AICs[1,k]); + if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) { + column_best = k; + AIC_best = as.scalar(AICs[1,k]); + } + } + + # Append best found features (i.e., columns) to X_global + if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found + print ("Best AIC " + AIC_best + " achieved with feature: " + column_best); + columns_fixed[1,column_best] = 1; + columns_fixed_ordered = append (columns_fixed_ordered, as.matrix(column_best)); + if (ncol(columns_fixed_ordered) == m_orig) { # all features examined + X_global = append (X_global, X_orig[,column_best]); + continue = FALSE; + } else { + X_global = append (X_global, X_orig[,column_best]); + } + } else { + continue = FALSE; + } + } + + # run linear regression with selected set of features + print ("Running linear regression with selected features..."); + [AIC] = linear_regression (X_global, y, m_orig, columns_fixed_ordered, fileB); + +} else { + stop ("Currently only forward selection strategy is supported!"); +} + + +/* +* Computes linear regression using a direct solver for (X^T X) beta = X^T y. +* It also outputs the AIC of the computed model. +*/ +linear_regression = function (Matrix[Double] X, Matrix[Double] y, Double m_orig, Matrix[Double] Selected, String fileB) return (Double AIC) { + + intercept_status = ifdef ($icpt, 0); + n = nrow (X); + m = ncol (X); + + # Introduce the intercept, shift and rescale the columns of X if needed + if (intercept_status == 1 | intercept_status == 2) { # add the intercept column + ones_n = matrix (1, rows = n, cols = 1); + X = append (X, ones_n); + m = m - 1; + } + m_ext = ncol(X); + + if (intercept_status == 2) { # scale-&-shift X columns to mean 0, variance 1 + # Important assumption: X [, m_ext] = ones_n + avg_X_cols = t(colSums(X)) / n; + var_X_cols = (t(colSums (X ^ 2)) - n * (avg_X_cols ^ 2)) / (n - 1); + is_unsafe = ppred (var_X_cols, 0.0, "<="); + scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X [m_ext, 1] = 1; + shift_X = - avg_X_cols * scale_X; + shift_X [m_ext, 1] = 0; + } else { + scale_X = matrix (1, rows = m_ext, cols = 1); + shift_X = matrix (0, rows = m_ext, cols = 1); + } + + # BEGIN THE DIRECT SOLVE ALGORITHM (EXTERNAL CALL) + + A = t(X) %*% X; + b = t(X) %*% y; + if (intercept_status == 2) { + A = t(diag (scale_X) %*% A + shift_X %*% A [m_ext, ]); + A = diag (scale_X) %*% A + shift_X %*% A [m_ext, ]; + b = diag (scale_X) %*% b + shift_X %*% b [m_ext, ]; + } + + beta_unscaled = solve (A, b); + + # END THE DIRECT SOLVE ALGORITHM + + if (intercept_status == 2) { + beta = scale_X * beta_unscaled; + beta [m_ext, ] = beta [m_ext, ] + t(shift_X) %*% beta_unscaled; + } else { + beta = beta_unscaled; + } + + # COMPUTE AIC + y_residual = y - X %*% beta; + ss_res = sum (y_residual ^ 2); + eq_deg_of_freedom = m_ext; + AIC = (2 * eq_deg_of_freedom) + n * log (ss_res / n); + + if (fileB != " ") { + + fileO = ifdef ($O, " "); + fileS = $S; + + print ("Computing the statistics..."); + avg_tot = sum (y) / n; + ss_tot = sum (y ^ 2); + ss_avg_tot = ss_tot - n * avg_tot ^ 2; + var_tot = ss_avg_tot / (n - 1); +# y_residual = y - X %*% beta; + avg_res = sum (y_residual) / n; +# ss_res = sum (y_residual ^ 2); + ss_avg_res = ss_res - n * avg_res ^ 2; + + plain_R2 = 1 - ss_res / ss_avg_tot; + if (n > m_ext) { + dispersion = ss_res / (n - m_ext); + adjusted_R2 = 1 - dispersion / (ss_avg_tot / (n - 1)); + } else { + dispersion = 0.0 / 0.0; + adjusted_R2 = 0.0 / 0.0; + } + + plain_R2_nobias = 1 - ss_avg_res / ss_avg_tot; + deg_freedom = n - m - 1; + if (deg_freedom > 0) { + var_res = ss_avg_res / deg_freedom; + adjusted_R2_nobias = 1 - var_res / (ss_avg_tot / (n - 1)); + } else { + var_res = 0.0 / 0.0; + adjusted_R2_nobias = 0.0 / 0.0; + print ("Warning: zero or negative number of degrees of freedom."); + } + + plain_R2_vs_0 = 1 - ss_res / ss_tot; + if (n > m) { + adjusted_R2_vs_0 = 1 - (ss_res / (n - m)) / (ss_tot / n); + } else { + adjusted_R2_vs_0 = 0.0 / 0.0; + } + + str = "AVG_TOT_Y," + avg_tot; # Average of the response value Y + str = append (str, "STDEV_TOT_Y," + sqrt (var_tot)); # Standard Deviation of the response value Y + str = append (str, "AVG_RES_Y," + avg_res); # Average of the residual Y - pred(Y|X), i.e. residual bias + str = append (str, "STDEV_RES_Y," + sqrt (var_res)); # Standard Deviation of the residual Y - pred(Y|X) + str = append (str, "DISPERSION," + dispersion); # GLM-style dispersion, i.e. residual sum of squares / # d.f. + str = append (str, "PLAIN_R2," + plain_R2); # Plain R^2 of residual with bias included vs. total average + str = append (str, "ADJUSTED_R2," + adjusted_R2); # Adjusted R^2 of residual with bias included vs. total average + str = append (str, "PLAIN_R2_NOBIAS," + plain_R2_nobias); # Plain R^2 of residual with bias subtracted vs. total average + str = append (str, "ADJUSTED_R2_NOBIAS," + adjusted_R2_nobias); # Adjusted R^2 of residual with bias subtracted vs. total average + if (intercept_status == 0) { + str = append (str, "PLAIN_R2_VS_0," + plain_R2_vs_0); # Plain R^2 of residual with bias included vs. zero constant + str = append (str, "ADJUSTED_R2_VS_0," + adjusted_R2_vs_0); # Adjusted R^2 of residual with bias included vs. zero constant + } + + if (fileO != " ") { + write (str, fileO); + } else { + print (str); + } + + # Prepare the output matrix + print ("Writing the output matrix..."); + if (intercept_status == 2) { + beta_out = append (beta, beta_unscaled); + } else { + beta_out = beta; + } + + # Output which features give the best AIC and are being used for linear regression + write (Selected, fileS, format=fmt); + + no_selected = ncol (Selected); + max_selected = max (Selected); + last = max_selected + 1; + + if (intercept_status != 0) { + + Selected_ext = append (Selected, as.matrix (last)); + P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); + + if (intercept_status == 2) { + + P1_beta = P1 * beta; + P2_beta = colSums (P1_beta); + P1_beta_unscaled = P1 * beta_unscaled; + P2_beta_unscaled = colSums(P1_beta_unscaled); + + if (max_selected < m_orig) { + P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected))); + P2_beta_unscaled = append (P2_beta_unscaled, matrix (0, rows=1, cols=(m_orig - max_selected))); + + P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1]; + P2_beta[1, max_selected + 1] = 0; + + P2_beta_unscaled[1, m_orig+1] = P2_beta_unscaled[1, max_selected + 1]; + P2_beta_unscaled[1, max_selected + 1] = 0; + } + beta_out = append (t(P2_beta), t(P2_beta_unscaled)); + + } else { + + P1_beta = P1 * beta; + P2_beta = colSums (P1_beta); + + if (max_selected < m_orig) { + P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected))); + P2_beta[1, m_orig+1] = P2_beta[1, max_selected + 1] ; + P2_beta[1, max_selected + 1] = 0; + } + beta_out = t(P2_beta); + + } + } else { + + P1 = table (seq (1, no_selected), t(Selected)); + P1_beta = P1 * beta; + P2_beta = colSums (P1_beta); + + if (max_selected < m_orig) { + P2_beta = append (P2_beta, matrix (0, rows=1, cols=(m_orig - max_selected))); + } + + beta_out = t(P2_beta); + } + + write ( beta_out, fileB, format=fmt ); + } +} +
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Univar-Stats.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/Univar-Stats.dml b/scripts/algorithms/Univar-Stats.dml index abb3fea..62d6a28 100644 --- a/scripts/algorithms/Univar-Stats.dml +++ b/scripts/algorithms/Univar-Stats.dml @@ -1,150 +1,150 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# DML Script to compute univariate statistics for all attributes -# in a given data set -# -# Three inputs: -# $1) X - input data -# $2) TYPES - row matrix that denotes the "kind"/"type" of all attributes -# kind=1 for scale, -# kind=2 for nominal, -# kind=3 for ordinal -# -# One output: -# $STATS) output directory in which following three statistics -# files are created -# + base.stats - matrix with all 17 statistics (14 scale, -# 3 categorical) computed for all attributes -# + categorical.counts - matrix in which each column -# gives the category-wise counts for all categories in -# that attribute -# -# - -A = read($X); # data file -K = read($TYPES); # attribute kind file - -# number of features/attributes -n = ncol(A); - -# number of data records -m = nrow(A); - -# number of statistics -numBaseStats = 17; # (14 scale stats, 3 categorical stats) - -max_kind = max(K); - -# matrices to store computed statistics -baseStats = matrix(0, rows=numBaseStats, cols=n); - -# Compute max domain size among all categorical attributes -maxs = colMaxs(A); -maxDomainSize = max( ppred(K, 1, ">") * maxs ); -maxDomain = as.integer(maxDomainSize); - - -parfor(i in 1:n, check=0) { - - # project out the i^th column - F = A[,i]; - - kind = castAsScalar(K[1,i]); - - if ( kind == 1 ) { - #print("[" + i + "] Scale"); - # compute SCALE statistics on the projected column - minimum = min(F); - maximum = max(F); - rng = maximum - minimum; - - mu = mean(F); - m2 = moment(F, 2); - m3 = moment(F, 3); - m4 = moment(F, 4); - - var = m/(m-1.0)*m2; - std_dev = sqrt(var); - se = std_dev/sqrt(m); - cv = std_dev/mu; - - g1 = m3/(std_dev^3); - g2 = m4/(std_dev^4) - 3; - #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); - se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); - - #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); - se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); - - md = median(F); #quantile(F, 0.5); - iqm = interQuartileMean(F); - - # place the computed statistics in output matrices - baseStats[1,i] = minimum; - baseStats[2,i] = maximum; - baseStats[3,i] = rng; - - baseStats[4,i] = mu; - baseStats[5,i] = var; - baseStats[6,i] = std_dev; - baseStats[7,i] = se; - baseStats[8,i] = cv; - - baseStats[9,i] = g1; - baseStats[10,i] = g2; - baseStats[11,i] = se_g1; - baseStats[12,i] = se_g2; - - baseStats[13,i] = md; - baseStats[14,i] = iqm; - } - else { - if (kind == 2 | kind == 3) { - #print("[" + i + "] Categorical"); - - # check if the categorical column has valid values - minF = min(F); - if (minF <=0) { - print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); - } - else { - # compute CATEGORICAL statistics on the projected column - num_cat = max(F); # number of categories - cat_counts = table(F,1, maxDomain, 1); # counts for each category - - mode = rowIndexMax(t(cat_counts)); - mx = max(cat_counts) - modeArr = ppred(cat_counts, mx, "==") - numModes = sum(modeArr); - - # place the computed statistics in output matrices - baseStats[15,i] = num_cat; - baseStats[16,i] = mode; - baseStats[17,i] = numModes; - } - } - } -} - -write(baseStats, $STATS); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# DML Script to compute univariate statistics for all attributes +# in a given data set +# +# Three inputs: +# $1) X - input data +# $2) TYPES - row matrix that denotes the "kind"/"type" of all attributes +# kind=1 for scale, +# kind=2 for nominal, +# kind=3 for ordinal +# +# One output: +# $STATS) output directory in which following three statistics +# files are created +# + base.stats - matrix with all 17 statistics (14 scale, +# 3 categorical) computed for all attributes +# + categorical.counts - matrix in which each column +# gives the category-wise counts for all categories in +# that attribute +# +# + +A = read($X); # data file +K = read($TYPES); # attribute kind file + +# number of features/attributes +n = ncol(A); + +# number of data records +m = nrow(A); + +# number of statistics +numBaseStats = 17; # (14 scale stats, 3 categorical stats) + +max_kind = max(K); + +# matrices to store computed statistics +baseStats = matrix(0, rows=numBaseStats, cols=n); + +# Compute max domain size among all categorical attributes +maxs = colMaxs(A); +maxDomainSize = max( ppred(K, 1, ">") * maxs ); +maxDomain = as.integer(maxDomainSize); + + +parfor(i in 1:n, check=0) { + + # project out the i^th column + F = A[,i]; + + kind = castAsScalar(K[1,i]); + + if ( kind == 1 ) { + #print("[" + i + "] Scale"); + # compute SCALE statistics on the projected column + minimum = min(F); + maximum = max(F); + rng = maximum - minimum; + + mu = mean(F); + m2 = moment(F, 2); + m3 = moment(F, 3); + m4 = moment(F, 4); + + var = m/(m-1.0)*m2; + std_dev = sqrt(var); + se = std_dev/sqrt(m); + cv = std_dev/mu; + + g1 = m3/(std_dev^3); + g2 = m4/(std_dev^4) - 3; + #se_g1=sqrt( 6*m*(m-1.0) / ((m-2.0)*(m+1.0)*(m+3.0)) ); + se_g1=sqrt( (6/(m-2.0)) * (m/(m+1.0)) * ((m-1.0)/(m+3.0)) ); + + #se_g2= sqrt( (4*(m^2-1)*se_g1^2)/((m+5.0)*(m-3.0)) ); + se_g2=sqrt( (4/(m+5.0)) * ((m^2-1)/(m-3.0)) * se_g1^2 ); + + md = median(F); #quantile(F, 0.5); + iqm = interQuartileMean(F); + + # place the computed statistics in output matrices + baseStats[1,i] = minimum; + baseStats[2,i] = maximum; + baseStats[3,i] = rng; + + baseStats[4,i] = mu; + baseStats[5,i] = var; + baseStats[6,i] = std_dev; + baseStats[7,i] = se; + baseStats[8,i] = cv; + + baseStats[9,i] = g1; + baseStats[10,i] = g2; + baseStats[11,i] = se_g1; + baseStats[12,i] = se_g2; + + baseStats[13,i] = md; + baseStats[14,i] = iqm; + } + else { + if (kind == 2 | kind == 3) { + #print("[" + i + "] Categorical"); + + # check if the categorical column has valid values + minF = min(F); + if (minF <=0) { + print("ERROR: Categorical attributes can only take values starting from 1. Encountered a value " + minF + " in attribute " + i); + } + else { + # compute CATEGORICAL statistics on the projected column + num_cat = max(F); # number of categories + cat_counts = table(F,1, maxDomain, 1); # counts for each category + + mode = rowIndexMax(t(cat_counts)); + mx = max(cat_counts) + modeArr = ppred(cat_counts, mx, "==") + numModes = sum(modeArr); + + # place the computed statistics in output matrices + baseStats[15,i] = num_cat; + baseStats[16,i] = mode; + baseStats[17,i] = numModes; + } + } + } +} + +write(baseStats, $STATS); + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/bivar-stats.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/bivar-stats.dml b/scripts/algorithms/bivar-stats.dml index 4846f56..99549dc 100644 --- a/scripts/algorithms/bivar-stats.dml +++ b/scripts/algorithms/bivar-stats.dml @@ -1,398 +1,398 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# -# For a given pair of attribute sets, compute bivariate statistics between all attribute pairs -# Given, index1 = {A_11, A_12, ... A_1m} and index2 = {A_21, A_22, ... A_2n} -# compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n) -# -# Six inputs: -# 1) X - input data -# 2) index1 - First attribute set {A_11, A_12, ... A_1m} -# 3) index2 - Second attribute set {A_21, A_22, ... A_2n} -# 4) types1 - kind for attributes in S1 -# 5) types2 - kind for attributes in S2 -# kind=1 for scale, kind=2 for nominal, kind=3 for ordinal -# -# One output: -# 6) output directory in which following (maximum of) four statistics files are created -# + bivar.scale.scale.stats - matrix containing scale-scale correlations -# + bivar.nominal.nominal.stats - -# + bivar.nominal.scale.stats - -# + bivar.ordinal.ordinal.stats - -# -# hadoop jar SystemML.jar -f bivar-stats.dml -nvargs X=<Data> -# index1=<Feature Index Set 1> -# index2=<Feature Index Set 2> -# types1=<Feature Types 1> -# types2=<Feature Types 2> -# OUTDIR=<Output Location> - -D = read($X); # input data set -S1 = read($index1); # attribute set 1 -S2 = read($index2); # attribute set 2 -K1 = read($types1); # kind for attributes in S1 -K2 = read($types2); # kind for attributes in S2 - -s1size = ncol(S1); -s2size = ncol(S2); -numPairs = s1size * s2size; - -#test: 1 is Pearson'R, 2 is F-test, 3 is chi-squared, 4 is Spearman'sRho -# R, (chisq, df, pval, cramersv,) spearman, eta, anovaf, feature_col_index1, feature_col_index2, test - -num_scale_scale_tests = 0 -num_nominal_nominal_tests = 0 -num_ordinal_ordinal_tests = 0 -num_nominal_scale_tests = 0 - -pair2row = matrix(0, rows=numPairs, cols=2) -for( i in 1:s1size, check=0) { - pre_a1 = castAsScalar(S1[1,i]); - pre_k1 = castAsScalar(K1[1,i]); - - for( j in 1:s2size, check=0) { - pre_pairID = (i-1)*s2size+j; - pre_a2 = castAsScalar(S2[1,j]); - pre_k2 = castAsScalar(K2[1,j]); - - if (pre_k1 == pre_k2) { - if (pre_k1 == 1) { - num_scale_scale_tests = num_scale_scale_tests + 1 - pair2row[pre_pairID,1] = num_scale_scale_tests - } else { - num_nominal_nominal_tests = num_nominal_nominal_tests + 1 - pair2row[pre_pairID,1] = num_nominal_nominal_tests - - if ( pre_k1 == 3 ) { - num_ordinal_ordinal_tests = num_ordinal_ordinal_tests + 1 - pair2row[pre_pairID, 2] = num_ordinal_ordinal_tests - } - } - } - else { - if (pre_k1 == 1 | pre_k2 == 1) { - num_nominal_scale_tests = num_nominal_scale_tests + 1 - pair2row[pre_pairID,1] = num_nominal_scale_tests - } else { - num_nominal_nominal_tests = num_nominal_nominal_tests + 1 - pair2row[pre_pairID,1] = num_nominal_nominal_tests - } - } - } -} - -size_scale_scale_tests = max(num_scale_scale_tests, 1); -size_nominal_nominal_tests = max(num_nominal_nominal_tests, 1) -size_ordinal_ordinal_tests = max(num_ordinal_ordinal_tests, 1); -size_nominal_scale_tests = max(num_nominal_scale_tests, 1); - -basestats = matrix(0, rows=11, cols=numPairs); -basestats_scale_scale = matrix(0, rows=6, cols=size_scale_scale_tests) -basestats_nominal_nominal = matrix(0, rows=6, cols=size_nominal_nominal_tests) -basestats_ordinal_ordinal = matrix(0, rows=3, cols=size_ordinal_ordinal_tests) -basestats_nominal_scale = matrix(0, rows=11, cols=size_nominal_scale_tests) - - -# Compute max domain size among all categorical attributes -# and check if these cols have been recoded - -debug_str = "Stopping execution of DML script due to invalid input"; - -error_flag = FALSE; - -maxs = colMaxs(D); -mins = colMins(D) -maxDomainSize = -1.0; -for(k in 1:ncol(K1) ) { - type = as.scalar(K1[1,k]); - - if ( type > 1) { - colID = as.scalar(S1[1,k]); - - colMaximum = as.scalar(maxs[1,colID]); - if(maxDomainSize < colMaximum) maxDomainSize = colMaximum; - - colMinimum = as.scalar(mins[1,colID]); - if(colMinimum < 1){ - if(type == 2) - debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum) - else - debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum) - error_flag = TRUE; - } - } -} - -for(k in 1:ncol(K2) ) { - type = as.scalar(K2[1,k]); - - if ( type > 1) { - colID = as.scalar(S2[1,k]); - - colMaximum = as.scalar(maxs[1,colID]); - if(maxDomainSize < colMaximum) maxDomainSize = colMaximum; - - colMinimum = as.scalar(mins[1,colID]); - if(colMinimum < 1){ - if(type == 2) - debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum) - else - debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum) - error_flag = TRUE; - } - } -} -maxDomain = as.integer(maxDomainSize); - -if(error_flag) stop(debug_str); - -parfor( i in 1:s1size, check=0) { - a1 = castAsScalar(S1[1,i]); - k1 = castAsScalar(K1[1,i]); - A1 = D[,a1]; - - parfor( j in 1:s2size, check=0) { - pairID = (i-1)*s2size+j; - a2 = castAsScalar(S2[1,j]); - k2 = castAsScalar(K2[1,j]); - A2 = D[,a2]; - - rowid1 = castAsScalar(pair2row[pairID, 1]) - rowid2 = castAsScalar(pair2row[pairID, 2]) - - if (k1 == k2) { - if (k1 == 1) { - # scale-scale - print("[" + i + "," + j + "] scale-scale"); - [r, cov, sigma1, sigma2] = bivar_ss(A1,A2); - - basestats_scale_scale[1,rowid1] = a1; - basestats_scale_scale[2,rowid1] = a2; - basestats_scale_scale[3,rowid1] = r; - basestats_scale_scale[4,rowid1] = cov; - basestats_scale_scale[5,rowid1] = sigma1; - basestats_scale_scale[6,rowid1] = sigma2; - } else { - # nominal-nominal or ordinal-ordinal - print("[" + i + "," + j + "] categorical-categorical"); - [chisq, df, pval, cramersv] = bivar_cc(A1, A2, maxDomain); - - basestats_nominal_nominal[1,rowid1] = a1; - basestats_nominal_nominal[2,rowid1] = a2; - basestats_nominal_nominal[3,rowid1] = chisq; - basestats_nominal_nominal[4,rowid1] = df; - basestats_nominal_nominal[5,rowid1] = pval; - basestats_nominal_nominal[6,rowid1] = cramersv; - - if ( k1 == 3 ) { - # ordinal-ordinal - print("[" + i + "," + j + "] ordinal-ordinal"); - sp = bivar_oo(A1, A2, maxDomain); - - basestats_ordinal_ordinal[1,rowid2] = a1; - basestats_ordinal_ordinal[2,rowid2] = a2; - basestats_ordinal_ordinal[3,rowid2] = sp; - } - } - } else { - if (k1 == 1 | k2 == 1) { - # Scale-nominal/ordinal - print("[" + i + "," + j + "] scale-categorical"); - - if ( k1 == 1 ) { - [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A1, A2, maxDomain); - } else { - [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A2, A1, maxDomain); - } - - basestats_nominal_scale[1,rowid1] = a1; - basestats_nominal_scale[2,rowid1] = a2; - basestats_nominal_scale[3,rowid1] = eta; - basestats_nominal_scale[4,rowid1] = f; - basestats_nominal_scale[5,rowid1] = pval; - basestats_nominal_scale[6,rowid1] = bw_ss; - basestats_nominal_scale[7,rowid1] = within_ss; - basestats_nominal_scale[8,rowid1] = bw_df; - basestats_nominal_scale[9,rowid1] = within_df; - basestats_nominal_scale[10,rowid1] = bw_mean_square; - basestats_nominal_scale[11,rowid1] = within_mean_square; - } else { - # nominal-ordinal or ordinal-nominal - print("[" + i + "," + j + "] categorical-categorical"); - [chisq, df, pval, cramersv] = bivar_cc(A1, A2, maxDomain); - - basestats_nominal_nominal[1,rowid1] = a1; - basestats_nominal_nominal[2,rowid1] = a2; - basestats_nominal_nominal[3,rowid1] = chisq; - basestats_nominal_nominal[4,rowid1] = df; - basestats_nominal_nominal[5,rowid1] = pval; - basestats_nominal_nominal[6,rowid1] = cramersv; - } - } - } -} - -if(num_scale_scale_tests == size_scale_scale_tests){ - write(basestats_scale_scale, $OUTDIR + "/bivar.scale.scale.stats"); -} - -if(num_nominal_scale_tests == size_nominal_scale_tests){ - write(basestats_nominal_scale, $OUTDIR + "/bivar.nominal.scale.stats"); -} - -if(num_nominal_nominal_tests == size_nominal_nominal_tests){ - write(basestats_nominal_nominal, $OUTDIR + "/bivar.nominal.nominal.stats"); -} - -if(num_ordinal_ordinal_tests == size_ordinal_ordinal_tests){ - write(basestats_ordinal_ordinal, $OUTDIR + "/bivar.ordinal.ordinal.stats"); -} - -# ----------------------------------------------------------------------------------------------------------- - -bivar_cc = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double chisq, Double df, Double pval, Double cramersv) { - - # Contingency Table - F = table(A, B, maxDomain, maxDomain); - F = F[1:max(A), 1:max(B)]; - - # Chi-Squared - W = sum(F); - r = rowSums(F); - c = colSums(F); - E = (r %*% c)/W; - T = (F-E)^2/E; - chi_squared = sum(T); - - # compute p-value - degFreedom = (nrow(F)-1)*(ncol(F)-1); - pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); - - # Cramer's V - R = nrow(F); - C = ncol(F); - q = min(R,C); - cramers_v = sqrt(chi_squared/(W*(q-1))); - - # Assign return values - chisq = chi_squared; - df = as.double(degFreedom); - pval = pValue; - cramersv = cramers_v; -} - -# ----------------------------------------------------------------------------------------------------------- - -bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) { - - # Unweighted co-variance - covXY = cov(X,Y); - - # compute standard deviations for both X and Y by computing 2^nd central moment - W = nrow(X); - m2X = moment(X,2); - m2Y = moment(Y,2); - sigmaX = sqrt(m2X * (W/(W-1.0)) ); - sigmaY = sqrt(m2Y * (W/(W-1.0)) ); - - # Pearson's R - R = covXY / (sigmaX*sigmaY); -} - -# ----------------------------------------------------------------------------------------------------------- - -# Y points to SCALE variable -# A points to CATEGORICAL variable -bivar_sc = function(Matrix[Double] Y, Matrix[Double] A, Double maxDomain) - return (Double Eta, Double AnovaF, Double pval, Double bw_ss, Double within_ss, Double bw_df, Double within_df, Double bw_mean_square, Double within_mean_square) { - - # mean and variance in target variable - W = nrow(A); - my = mean(Y); - varY = moment(Y,2) * W/(W-1.0) - - # category-wise (frequencies, means, variances) - CFreqs = aggregate(target=Y, groups=A, fn="count", ngroups=maxDomain); - CMeans = aggregate(target=Y, groups=A, fn="mean", ngroups=maxDomain); - CVars = aggregate(target=Y, groups=A, fn="variance", ngroups=maxDomain); - - # number of categories - R = nrow(CFreqs); - - Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); - - bw_ss = sum( (CFreqs*(CMeans-my)^2) ); - bw_df = as.double(R-1); - bw_mean_square = bw_ss/bw_df; - - within_ss = sum( (CFreqs-1)*CVars ); - within_df = as.double(W-R); - within_mean_square = within_ss/within_df; - - AnovaF = bw_mean_square/within_mean_square; - - pval = pf(target=AnovaF, df1=bw_df, df2=within_df, lower.tail=FALSE) -} - - -# ----------------------------------------------------------------------------------------------------------- -# Function to compute ranks -# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category -computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { - Ranks = cumsum(X) - X/2 + 1/2; -} - -#------------------------------------------------------------------------- - -bivar_oo = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double sp) { - - # compute contingency table - F = table(A, B, maxDomain, maxDomain); - F = F[1:max(A), 1:max(B)]; - - catA = nrow(F); # number of categories in A - catB = ncol(F); # number of categories in B - - # compute category-wise counts for both the attributes - R = rowSums(F); - S = colSums(F); - - # compute scores, both are column vectors - [C] = computeRanks(R); - meanX = mean(C,R); - - columnS = t(S); - [D] = computeRanks(columnS); - - # scores (C,D) are individual values, and counts (R,S) act as weights - meanY = mean(D,columnS); - - W = sum(F); # total weight, or total #cases - varX = moment(C,R,2)*(W/(W-1.0)); - varY = moment(D,columnS,2)*(W/(W-1.0)); - covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); - - sp = covXY/(sqrt(varX)*sqrt(varY)); -} - -# ----------------------------------------------------------------------------------------------------------- +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# +# For a given pair of attribute sets, compute bivariate statistics between all attribute pairs +# Given, index1 = {A_11, A_12, ... A_1m} and index2 = {A_21, A_22, ... A_2n} +# compute bivariate stats for m*n pairs (A_1i, A_2j), (1<= i <=m) and (1<= j <=n) +# +# Six inputs: +# 1) X - input data +# 2) index1 - First attribute set {A_11, A_12, ... A_1m} +# 3) index2 - Second attribute set {A_21, A_22, ... A_2n} +# 4) types1 - kind for attributes in S1 +# 5) types2 - kind for attributes in S2 +# kind=1 for scale, kind=2 for nominal, kind=3 for ordinal +# +# One output: +# 6) output directory in which following (maximum of) four statistics files are created +# + bivar.scale.scale.stats - matrix containing scale-scale correlations +# + bivar.nominal.nominal.stats - +# + bivar.nominal.scale.stats - +# + bivar.ordinal.ordinal.stats - +# +# hadoop jar SystemML.jar -f bivar-stats.dml -nvargs X=<Data> +# index1=<Feature Index Set 1> +# index2=<Feature Index Set 2> +# types1=<Feature Types 1> +# types2=<Feature Types 2> +# OUTDIR=<Output Location> + +D = read($X); # input data set +S1 = read($index1); # attribute set 1 +S2 = read($index2); # attribute set 2 +K1 = read($types1); # kind for attributes in S1 +K2 = read($types2); # kind for attributes in S2 + +s1size = ncol(S1); +s2size = ncol(S2); +numPairs = s1size * s2size; + +#test: 1 is Pearson'R, 2 is F-test, 3 is chi-squared, 4 is Spearman'sRho +# R, (chisq, df, pval, cramersv,) spearman, eta, anovaf, feature_col_index1, feature_col_index2, test + +num_scale_scale_tests = 0 +num_nominal_nominal_tests = 0 +num_ordinal_ordinal_tests = 0 +num_nominal_scale_tests = 0 + +pair2row = matrix(0, rows=numPairs, cols=2) +for( i in 1:s1size, check=0) { + pre_a1 = castAsScalar(S1[1,i]); + pre_k1 = castAsScalar(K1[1,i]); + + for( j in 1:s2size, check=0) { + pre_pairID = (i-1)*s2size+j; + pre_a2 = castAsScalar(S2[1,j]); + pre_k2 = castAsScalar(K2[1,j]); + + if (pre_k1 == pre_k2) { + if (pre_k1 == 1) { + num_scale_scale_tests = num_scale_scale_tests + 1 + pair2row[pre_pairID,1] = num_scale_scale_tests + } else { + num_nominal_nominal_tests = num_nominal_nominal_tests + 1 + pair2row[pre_pairID,1] = num_nominal_nominal_tests + + if ( pre_k1 == 3 ) { + num_ordinal_ordinal_tests = num_ordinal_ordinal_tests + 1 + pair2row[pre_pairID, 2] = num_ordinal_ordinal_tests + } + } + } + else { + if (pre_k1 == 1 | pre_k2 == 1) { + num_nominal_scale_tests = num_nominal_scale_tests + 1 + pair2row[pre_pairID,1] = num_nominal_scale_tests + } else { + num_nominal_nominal_tests = num_nominal_nominal_tests + 1 + pair2row[pre_pairID,1] = num_nominal_nominal_tests + } + } + } +} + +size_scale_scale_tests = max(num_scale_scale_tests, 1); +size_nominal_nominal_tests = max(num_nominal_nominal_tests, 1) +size_ordinal_ordinal_tests = max(num_ordinal_ordinal_tests, 1); +size_nominal_scale_tests = max(num_nominal_scale_tests, 1); + +basestats = matrix(0, rows=11, cols=numPairs); +basestats_scale_scale = matrix(0, rows=6, cols=size_scale_scale_tests) +basestats_nominal_nominal = matrix(0, rows=6, cols=size_nominal_nominal_tests) +basestats_ordinal_ordinal = matrix(0, rows=3, cols=size_ordinal_ordinal_tests) +basestats_nominal_scale = matrix(0, rows=11, cols=size_nominal_scale_tests) + + +# Compute max domain size among all categorical attributes +# and check if these cols have been recoded + +debug_str = "Stopping execution of DML script due to invalid input"; + +error_flag = FALSE; + +maxs = colMaxs(D); +mins = colMins(D) +maxDomainSize = -1.0; +for(k in 1:ncol(K1) ) { + type = as.scalar(K1[1,k]); + + if ( type > 1) { + colID = as.scalar(S1[1,k]); + + colMaximum = as.scalar(maxs[1,colID]); + if(maxDomainSize < colMaximum) maxDomainSize = colMaximum; + + colMinimum = as.scalar(mins[1,colID]); + if(colMinimum < 1){ + if(type == 2) + debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum) + else + debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum) + error_flag = TRUE; + } + } +} + +for(k in 1:ncol(K2) ) { + type = as.scalar(K2[1,k]); + + if ( type > 1) { + colID = as.scalar(S2[1,k]); + + colMaximum = as.scalar(maxs[1,colID]); + if(maxDomainSize < colMaximum) maxDomainSize = colMaximum; + + colMinimum = as.scalar(mins[1,colID]); + if(colMinimum < 1){ + if(type == 2) + debug_str = append(debug_str, "Column " + colID + " was declared as nominal but its minimum value is " + colMinimum) + else + debug_str = append(debug_str, "Column " + colID + " was declared as ordinal but its minimum value is " + colMinimum) + error_flag = TRUE; + } + } +} +maxDomain = as.integer(maxDomainSize); + +if(error_flag) stop(debug_str); + +parfor( i in 1:s1size, check=0) { + a1 = castAsScalar(S1[1,i]); + k1 = castAsScalar(K1[1,i]); + A1 = D[,a1]; + + parfor( j in 1:s2size, check=0) { + pairID = (i-1)*s2size+j; + a2 = castAsScalar(S2[1,j]); + k2 = castAsScalar(K2[1,j]); + A2 = D[,a2]; + + rowid1 = castAsScalar(pair2row[pairID, 1]) + rowid2 = castAsScalar(pair2row[pairID, 2]) + + if (k1 == k2) { + if (k1 == 1) { + # scale-scale + print("[" + i + "," + j + "] scale-scale"); + [r, cov, sigma1, sigma2] = bivar_ss(A1,A2); + + basestats_scale_scale[1,rowid1] = a1; + basestats_scale_scale[2,rowid1] = a2; + basestats_scale_scale[3,rowid1] = r; + basestats_scale_scale[4,rowid1] = cov; + basestats_scale_scale[5,rowid1] = sigma1; + basestats_scale_scale[6,rowid1] = sigma2; + } else { + # nominal-nominal or ordinal-ordinal + print("[" + i + "," + j + "] categorical-categorical"); + [chisq, df, pval, cramersv] = bivar_cc(A1, A2, maxDomain); + + basestats_nominal_nominal[1,rowid1] = a1; + basestats_nominal_nominal[2,rowid1] = a2; + basestats_nominal_nominal[3,rowid1] = chisq; + basestats_nominal_nominal[4,rowid1] = df; + basestats_nominal_nominal[5,rowid1] = pval; + basestats_nominal_nominal[6,rowid1] = cramersv; + + if ( k1 == 3 ) { + # ordinal-ordinal + print("[" + i + "," + j + "] ordinal-ordinal"); + sp = bivar_oo(A1, A2, maxDomain); + + basestats_ordinal_ordinal[1,rowid2] = a1; + basestats_ordinal_ordinal[2,rowid2] = a2; + basestats_ordinal_ordinal[3,rowid2] = sp; + } + } + } else { + if (k1 == 1 | k2 == 1) { + # Scale-nominal/ordinal + print("[" + i + "," + j + "] scale-categorical"); + + if ( k1 == 1 ) { + [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A1, A2, maxDomain); + } else { + [eta, f, pval, bw_ss, within_ss, bw_df, within_df, bw_mean_square, within_mean_square] = bivar_sc(A2, A1, maxDomain); + } + + basestats_nominal_scale[1,rowid1] = a1; + basestats_nominal_scale[2,rowid1] = a2; + basestats_nominal_scale[3,rowid1] = eta; + basestats_nominal_scale[4,rowid1] = f; + basestats_nominal_scale[5,rowid1] = pval; + basestats_nominal_scale[6,rowid1] = bw_ss; + basestats_nominal_scale[7,rowid1] = within_ss; + basestats_nominal_scale[8,rowid1] = bw_df; + basestats_nominal_scale[9,rowid1] = within_df; + basestats_nominal_scale[10,rowid1] = bw_mean_square; + basestats_nominal_scale[11,rowid1] = within_mean_square; + } else { + # nominal-ordinal or ordinal-nominal + print("[" + i + "," + j + "] categorical-categorical"); + [chisq, df, pval, cramersv] = bivar_cc(A1, A2, maxDomain); + + basestats_nominal_nominal[1,rowid1] = a1; + basestats_nominal_nominal[2,rowid1] = a2; + basestats_nominal_nominal[3,rowid1] = chisq; + basestats_nominal_nominal[4,rowid1] = df; + basestats_nominal_nominal[5,rowid1] = pval; + basestats_nominal_nominal[6,rowid1] = cramersv; + } + } + } +} + +if(num_scale_scale_tests == size_scale_scale_tests){ + write(basestats_scale_scale, $OUTDIR + "/bivar.scale.scale.stats"); +} + +if(num_nominal_scale_tests == size_nominal_scale_tests){ + write(basestats_nominal_scale, $OUTDIR + "/bivar.nominal.scale.stats"); +} + +if(num_nominal_nominal_tests == size_nominal_nominal_tests){ + write(basestats_nominal_nominal, $OUTDIR + "/bivar.nominal.nominal.stats"); +} + +if(num_ordinal_ordinal_tests == size_ordinal_ordinal_tests){ + write(basestats_ordinal_ordinal, $OUTDIR + "/bivar.ordinal.ordinal.stats"); +} + +# ----------------------------------------------------------------------------------------------------------- + +bivar_cc = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double chisq, Double df, Double pval, Double cramersv) { + + # Contingency Table + F = table(A, B, maxDomain, maxDomain); + F = F[1:max(A), 1:max(B)]; + + # Chi-Squared + W = sum(F); + r = rowSums(F); + c = colSums(F); + E = (r %*% c)/W; + T = (F-E)^2/E; + chi_squared = sum(T); + + # compute p-value + degFreedom = (nrow(F)-1)*(ncol(F)-1); + pValue = pchisq(target=chi_squared, df=degFreedom, lower.tail=FALSE); + + # Cramer's V + R = nrow(F); + C = ncol(F); + q = min(R,C); + cramers_v = sqrt(chi_squared/(W*(q-1))); + + # Assign return values + chisq = chi_squared; + df = as.double(degFreedom); + pval = pValue; + cramersv = cramers_v; +} + +# ----------------------------------------------------------------------------------------------------------- + +bivar_ss = function(Matrix[Double] X, Matrix[Double] Y) return (Double R, Double covXY, Double sigmaX, Double sigmaY) { + + # Unweighted co-variance + covXY = cov(X,Y); + + # compute standard deviations for both X and Y by computing 2^nd central moment + W = nrow(X); + m2X = moment(X,2); + m2Y = moment(Y,2); + sigmaX = sqrt(m2X * (W/(W-1.0)) ); + sigmaY = sqrt(m2Y * (W/(W-1.0)) ); + + # Pearson's R + R = covXY / (sigmaX*sigmaY); +} + +# ----------------------------------------------------------------------------------------------------------- + +# Y points to SCALE variable +# A points to CATEGORICAL variable +bivar_sc = function(Matrix[Double] Y, Matrix[Double] A, Double maxDomain) + return (Double Eta, Double AnovaF, Double pval, Double bw_ss, Double within_ss, Double bw_df, Double within_df, Double bw_mean_square, Double within_mean_square) { + + # mean and variance in target variable + W = nrow(A); + my = mean(Y); + varY = moment(Y,2) * W/(W-1.0) + + # category-wise (frequencies, means, variances) + CFreqs = aggregate(target=Y, groups=A, fn="count", ngroups=maxDomain); + CMeans = aggregate(target=Y, groups=A, fn="mean", ngroups=maxDomain); + CVars = aggregate(target=Y, groups=A, fn="variance", ngroups=maxDomain); + + # number of categories + R = nrow(CFreqs); + + Eta = sqrt(1 - ( sum((CFreqs-1)*CVars) / ((W-1)*varY) )); + + bw_ss = sum( (CFreqs*(CMeans-my)^2) ); + bw_df = as.double(R-1); + bw_mean_square = bw_ss/bw_df; + + within_ss = sum( (CFreqs-1)*CVars ); + within_df = as.double(W-R); + within_mean_square = within_ss/within_df; + + AnovaF = bw_mean_square/within_mean_square; + + pval = pf(target=AnovaF, df1=bw_df, df2=within_df, lower.tail=FALSE) +} + + +# ----------------------------------------------------------------------------------------------------------- +# Function to compute ranks +# takes a column vector as input, and produces a vector of same size in which each cell denotes to the computed score for that category +computeRanks = function(Matrix[Double] X) return (Matrix[Double] Ranks) { + Ranks = cumsum(X) - X/2 + 1/2; +} + +#------------------------------------------------------------------------- + +bivar_oo = function(Matrix[Double] A, Matrix[Double] B, Double maxDomain) return (Double sp) { + + # compute contingency table + F = table(A, B, maxDomain, maxDomain); + F = F[1:max(A), 1:max(B)]; + + catA = nrow(F); # number of categories in A + catB = ncol(F); # number of categories in B + + # compute category-wise counts for both the attributes + R = rowSums(F); + S = colSums(F); + + # compute scores, both are column vectors + [C] = computeRanks(R); + meanX = mean(C,R); + + columnS = t(S); + [D] = computeRanks(columnS); + + # scores (C,D) are individual values, and counts (R,S) act as weights + meanY = mean(D,columnS); + + W = sum(F); # total weight, or total #cases + varX = moment(C,R,2)*(W/(W-1.0)); + varY = moment(D,columnS,2)*(W/(W-1.0)); + covXY = sum( t(F/(W-1) * (C-meanX)) * (D-meanY) ); + + sp = covXY/(sqrt(varX)*sqrt(varY)); +} + +# ----------------------------------------------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/decision-tree-predict.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/decision-tree-predict.dml b/scripts/algorithms/decision-tree-predict.dml index 9e01adb..3447da6 100644 --- a/scripts/algorithms/decision-tree-predict.dml +++ b/scripts/algorithms/decision-tree-predict.dml @@ -1,142 +1,142 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A DECISION TREE MODEL ON A HELD OUT TEST SET. -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- Location to read the test feature matrix X; note that X needs to be both recoded and dummy coded -# Y String " " Location to read the true label matrix Y if requested; note that Y needs to be both recoded and dummy coded -# R String " " Location to read matrix R which for each feature in X contains the following information -# - R[,1]: column ids -# - R[,2]: start indices -# - R[,3]: end indices -# If R is not provided by default all variables are assumed to be scale -# M String --- Location to read matrix M containing the learned tree in the following format -# - M[1,j]: id of node j (in a complete binary tree) -# - M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 -# - M[3,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 -# - M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, -# otherwise the label that leaf node j is supposed to predict -# - M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values -# stored in rows 6,7,... if j is categorical -# If j is a leaf node: number of misclassified samples reaching at node j -# - M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] -# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 6,7,... -# depict the value subset chosen for j -# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 -# P String --- Location to store the label predictions for X -# A String " " Location to write the test accuracy (%) for the prediction if requested -# CM String " " Location to write the confusion matrix if requested -# fmt String "text" The output format of the output, such as "text" or "csv" -# --------------------------------------------------------------------------------------------- -# OUTPUT: -# 1- Matrix Y containing the predicted labels for X -# 2- Test accuracy if requested -# 3- Confusion matrix C if requested -# ------------------------------------------------------------------------------------------- -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f decision-tree-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions -# A=OUTPUT_DIR/accuracy CM=OUTPUT_DIR/confusion fmt=csv - -fileX = $X; -fileM = $M; -fileP = $P; -fileY = ifdef ($Y, " "); -fileR = ifdef ($R, " "); -fileCM = ifdef ($CM, " "); -fileA = ifdef ($A, " "); -fmtO = ifdef ($fmt, "text"); -X_test = read (fileX); -M = read (fileM); - -num_records = nrow (X_test); -Y_predicted = matrix (0, rows = num_records, cols = 1); - -R_cat = matrix (0, rows = 1, cols = 1); -R_scale = matrix (0, rows = 1, cols = 1); - -if (fileR != " ") { - R = read (fileR); - dummy_coded = ppred (R[,2], R[,3], "!="); - R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows"); - R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows"); -} else { # only scale features available - R_scale = seq (1, ncol (X_test)); -} - -parfor (i in 1:num_records, check = 0) { - cur_sample = X_test[i,]; - cur_node_pos = 1; - label_found = FALSE; - while (!label_found) { - cur_feature = as.scalar (M[3,cur_node_pos]); - type_label = as.scalar (M[4,cur_node_pos]); - if (cur_feature == 0) { # leaf node - label_found = TRUE; - Y_predicted[i,] = type_label; - } else { - # determine type: 1 for scale, 2 for categorical - if (type_label == 1) { # scale feature - cur_start_ind = as.scalar (R_scale[cur_feature,]); - cur_value = as.scalar (cur_sample[,cur_start_ind]); - cur_split = as.scalar (M[6,cur_node_pos]); - if (cur_value < cur_split) { # go to left branch - cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]); - } else { # go to right branch - cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1; - } - } else if (type_label == 2) { # categorical feature - cur_start_ind = as.scalar (R_cat[cur_feature,1]); - cur_end_ind = as.scalar (R_cat[cur_feature,2]); - cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); # as.scalar (cur_sample[,cur_feature]); - cur_offset = as.scalar (M[5,cur_node_pos]); - value_found = sum (ppred (M[6:(6 + cur_offset - 1),cur_node_pos], cur_value, "==")); - if (value_found) { # go to left branch - cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]); - } else { # go to right branch - cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1; - } -}}}} - -write (Y_predicted, fileP, format = fmtO); - -if (fileY != " ") { - Y_test = read (fileY); - num_classes = ncol (Y_test); - Y_test = rowSums (Y_test * t (seq (1, num_classes))); - result = ppred (Y_test, Y_predicted, "=="); - result = sum (result); - accuracy = result / num_records * 100; - acc_str = "Accuracy (%): " + accuracy; - if (fileA != " ") { - write (acc_str, fileA, format = fmtO); - } else { - print (acc_str); - } - if (fileCM != " ") { - confusion_mat = table(Y_predicted, Y_test, num_classes, num_classes) - write(confusion_mat, fileCM, format = fmtO) - } -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A DECISION TREE MODEL ON A HELD OUT TEST SET. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read the test feature matrix X; note that X needs to be both recoded and dummy coded +# Y String " " Location to read the true label matrix Y if requested; note that Y needs to be both recoded and dummy coded +# R String " " Location to read matrix R which for each feature in X contains the following information +# - R[,1]: column ids +# - R[,2]: start indices +# - R[,3]: end indices +# If R is not provided by default all variables are assumed to be scale +# M String --- Location to read matrix M containing the learned tree in the following format +# - M[1,j]: id of node j (in a complete binary tree) +# - M[2,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 +# - M[3,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 +# - M[4,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, +# otherwise the label that leaf node j is supposed to predict +# - M[5,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values +# stored in rows 6,7,... if j is categorical +# If j is a leaf node: number of misclassified samples reaching at node j +# - M[6:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[6,j] +# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 6,7,... +# depict the value subset chosen for j +# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 +# P String --- Location to store the label predictions for X +# A String " " Location to write the test accuracy (%) for the prediction if requested +# CM String " " Location to write the confusion matrix if requested +# fmt String "text" The output format of the output, such as "text" or "csv" +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# 1- Matrix Y containing the predicted labels for X +# 2- Test accuracy if requested +# 3- Confusion matrix C if requested +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f decision-tree-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions +# A=OUTPUT_DIR/accuracy CM=OUTPUT_DIR/confusion fmt=csv + +fileX = $X; +fileM = $M; +fileP = $P; +fileY = ifdef ($Y, " "); +fileR = ifdef ($R, " "); +fileCM = ifdef ($CM, " "); +fileA = ifdef ($A, " "); +fmtO = ifdef ($fmt, "text"); +X_test = read (fileX); +M = read (fileM); + +num_records = nrow (X_test); +Y_predicted = matrix (0, rows = num_records, cols = 1); + +R_cat = matrix (0, rows = 1, cols = 1); +R_scale = matrix (0, rows = 1, cols = 1); + +if (fileR != " ") { + R = read (fileR); + dummy_coded = ppred (R[,2], R[,3], "!="); + R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows"); + R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows"); +} else { # only scale features available + R_scale = seq (1, ncol (X_test)); +} + +parfor (i in 1:num_records, check = 0) { + cur_sample = X_test[i,]; + cur_node_pos = 1; + label_found = FALSE; + while (!label_found) { + cur_feature = as.scalar (M[3,cur_node_pos]); + type_label = as.scalar (M[4,cur_node_pos]); + if (cur_feature == 0) { # leaf node + label_found = TRUE; + Y_predicted[i,] = type_label; + } else { + # determine type: 1 for scale, 2 for categorical + if (type_label == 1) { # scale feature + cur_start_ind = as.scalar (R_scale[cur_feature,]); + cur_value = as.scalar (cur_sample[,cur_start_ind]); + cur_split = as.scalar (M[6,cur_node_pos]); + if (cur_value < cur_split) { # go to left branch + cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]); + } else { # go to right branch + cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1; + } + } else if (type_label == 2) { # categorical feature + cur_start_ind = as.scalar (R_cat[cur_feature,1]); + cur_end_ind = as.scalar (R_cat[cur_feature,2]); + cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); # as.scalar (cur_sample[,cur_feature]); + cur_offset = as.scalar (M[5,cur_node_pos]); + value_found = sum (ppred (M[6:(6 + cur_offset - 1),cur_node_pos], cur_value, "==")); + if (value_found) { # go to left branch + cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]); + } else { # go to right branch + cur_node_pos = cur_node_pos + as.scalar (M[2,cur_node_pos]) + 1; + } +}}}} + +write (Y_predicted, fileP, format = fmtO); + +if (fileY != " ") { + Y_test = read (fileY); + num_classes = ncol (Y_test); + Y_test = rowSums (Y_test * t (seq (1, num_classes))); + result = ppred (Y_test, Y_predicted, "=="); + result = sum (result); + accuracy = result / num_records * 100; + acc_str = "Accuracy (%): " + accuracy; + if (fileA != " ") { + write (acc_str, fileA, format = fmtO); + } else { + print (acc_str); + } + if (fileCM != " ") { + confusion_mat = table(Y_predicted, Y_test, num_classes, num_classes) + write(confusion_mat, fileCM, format = fmtO) + } +}
