http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_GLM.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/codegen/Algorithm_GLM.dml b/src/test/scripts/functions/codegen/Algorithm_GLM.dml new file mode 100644 index 0000000..d8eb966 --- /dev/null +++ b/src/test/scripts/functions/codegen/Algorithm_GLM.dml @@ -0,0 +1,1053 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read ($1); +Y = read ($2); + +fileO = " "; +fileLog = " "; + +intercept_status = $3 +eps = $4 +max_iteration_IRLS = $5; +max_iteration_CG = $5; + +distribution_type = $6; +variance_as_power_of_the_mean = $7; +link_type = $8; + +if( distribution_type != 1 ) { + link_as_power_of_the_mean = $9; + bernoulli_No_label = 0.0; +} else { + link_as_power_of_the_mean = 1.0; + bernoulli_No_label = $9; +} + +dispersion = 0.0; +regularization = 0.001; + + +variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean); +link_as_power_of_the_mean = as.double (link_as_power_of_the_mean); +bernoulli_No_label = as.double (bernoulli_No_label); +dispersion = as.double (dispersion); +eps = as.double (eps); + + +# Default values for output statistics: + +termination_code = 0; +min_beta = 0.0 / 0.0; +i_min_beta = 0.0 / 0.0; +max_beta = 0.0 / 0.0; +i_max_beta = 0.0 / 0.0; +intercept_value = 0.0 / 0.0; +dispersion = 0.0 / 0.0; +estimated_dispersion = 0.0 / 0.0; +deviance_nodisp = 0.0 / 0.0; +deviance = 0.0 / 0.0; + +print("BEGIN GLM SCRIPT"); + +num_records = nrow (X); +num_features = ncol (X); +zeros_r = matrix (0, rows = num_records, cols = 1); +ones_r = 1 + zeros_r; + +# Introduce the intercept, shift and rescale the columns of X if needed + +if (intercept_status == 1 | intercept_status == 2) # add the intercept column +{ + X = append (X, ones_r); + num_features = ncol (X); +} + +scale_lambda = matrix (1, rows = num_features, cols = 1); +if (intercept_status == 1 | intercept_status == 2) +{ + scale_lambda [num_features, 1] = 0; +} + +if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 +{ # Important assumption: X [, num_features] = ones_r + avg_X_cols = t(colSums(X)) / num_records; + var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1); + is_unsafe = ppred (var_X_cols, 0.0, "<="); + scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X [num_features, 1] = 1; + shift_X = - avg_X_cols * scale_X; + shift_X [num_features, 1] = 0; + rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2); +} else { + scale_X = matrix (1, rows = 
num_features, cols = 1); + shift_X = matrix (0, rows = num_features, cols = 1); + rowSums_X_sq = rowSums (X ^ 2); +} + +# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2) +# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale. +# The transform is then associatively applied to the other side of the expression, +# and is rewritten via "scale_X" and "shift_X" as follows: +# +# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# ssX_A = diag (scale_X) %*% A; +# ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A; +# +# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ]; + +# Initialize other input-dependent parameters + +lambda = scale_lambda * regularization; +if (max_iteration_CG == 0) { + max_iteration_CG = num_features; +} + +# In Bernoulli case, convert one-column "Y" into two-column + +if (distribution_type == 2 & ncol(Y) == 1) +{ + is_Y_negative = ppred (Y, bernoulli_No_label, "=="); + Y = append (1 - is_Y_negative, is_Y_negative); + count_Y_negative = sum (is_Y_negative); + if (count_Y_negative == 0) { + stop ("GLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label"); + } + if (count_Y_negative == nrow(Y)) { + stop ("GLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label"); + } +} + +# Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const] + +if (link_type == 0) +{ + if (distribution_type == 1) { + link_type = 1; + link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean; + } else { if (distribution_type == 2) { + link_type = 2; +} } } + +# For power distributions and/or links, we use two constants, +# "variance as power of the mean" and "link_as_power_of_the_mean", +# to specify the variance and the link as arbitrary powers of the +# mean. However, the variance-powers of 1.0 (Poisson family) and +# 2.0 (Gamma family) have to be treated as special cases, because +# these values integrate into logarithms. The link-power of 0.0 +# is also special as it represents the logarithm link. 
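To make the SHIFT/SCALE TRANSFORM rewrite noted above concrete: assuming the last column of X is all ones, the transform is the matrix S with X %*% S equal to the shifted and rescaled X, i.e. a diagonal scaling plus a rank-1 term that routes the shifts through the intercept row. The short R sketch below (an editorial illustration only; S, A, m and the sample values are local to the sketch and not part of the committed scripts) builds S explicitly and checks both rewrites stated in the comment:

    set.seed(7)
    m <- 4                               # num_features, including the intercept column
    scale_X <- c(0.5, 2.0, 1.5, 1.0)     # last entry is fixed to 1
    shift_X <- c(-1.0, 0.3, 2.0, 0.0)    # last entry is fixed to 0

    # The transform in matrix form: S = diag(scale_X) + e_m %*% t(shift_X)
    S <- diag(scale_X)
    S[m, ] <- S[m, ] + shift_X

    A <- matrix(rnorm(m * 3), m, 3)

    # Rewrite 1:  ssX_A = (SHIFT/SCALE TRANSFORM) %*% A
    ssX_A <- diag(scale_X) %*% A
    ssX_A[m, ] <- ssX_A[m, ] + t(shift_X) %*% A
    stopifnot(all.equal(ssX_A, S %*% A))

    # Rewrite 2:  tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A
    tssX_A <- diag(scale_X) %*% A + shift_X %*% A[m, , drop = FALSE]
    stopifnot(all.equal(tssX_A, t(S) %*% A))

Both identities hold because the transform only rescales each column and adds a multiple of the intercept column; applying it associatively to the other operand is what lets X itself, and hence its sparsity, stay untouched.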
+ +num_response_columns = ncol (Y); + +is_supported = check_if_supported (num_response_columns, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); +if (is_supported == 1) +{ + +##### INITIALIZE THE BETAS ##### + +[beta, saturated_log_l, isNaN] = + glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG); +if (isNaN == 0) +{ + +##### START OF THE MAIN PART ##### + +sum_X_sq = sum (rowSums_X_sq); +trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq)); +### max_trust_delta = trust_delta * 10000.0; +log_l = 0.0; +deviance_nodisp = 0.0; +new_deviance_nodisp = 0.0; +isNaN_log_l = 2; +newbeta = beta; +g = matrix (0.0, rows = num_features, cols = 1); +g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); +accept_new_beta = 1; +reached_trust_boundary = 0; +neg_log_l_change_predicted = 0.0; +i_IRLS = 0; + +print ("BEGIN IRLS ITERATIONS..."); + +ssX_newbeta = diag (scale_X) %*% newbeta; +ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; +all_linear_terms = X %*% ssX_newbeta; + +[new_log_l, isNaN_new_log_l] = glm_log_likelihood_part + (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + +if (isNaN_new_log_l == 0) { + new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); + new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); +} + +if (fileLog != " ") { + log_str = "POINT_STEP_NORM," + i_IRLS + "," + sqrt (sum (beta ^ 2)); + log_str = append (log_str, "OBJECTIVE," + i_IRLS + "," + (- new_log_l)); + log_str = append (log_str, "LINEAR_TERM_MIN," + i_IRLS + "," + min (all_linear_terms)); + log_str = append (log_str, "LINEAR_TERM_MAX," + i_IRLS + "," + max (all_linear_terms)); +} else { + log_str = " "; +} + +# set w to avoid 'Initialization of w depends on if-else/while execution' warnings +w = matrix (0.0, rows=1, cols=1); +while (termination_code == 0) +{ + accept_new_beta = 1; + + if (i_IRLS > 0) + { + if (isNaN_log_l == 0) { + accept_new_beta = 0; + } + +# Decide whether to accept a new iteration point and update the trust region +# See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. 
by Nocedal and Wright + + rho = (- new_log_l + log_l) / neg_log_l_change_predicted; + if (rho < 0.25 | isNaN_new_log_l == 1) { + trust_delta = 0.25 * trust_delta; + } + if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) { + trust_delta = 2 * trust_delta; + +### if (trust_delta > max_trust_delta) { +### trust_delta = max_trust_delta; +### } + + } + if (rho > 0.1 & isNaN_new_log_l == 0) { + accept_new_beta = 1; + } + } + + if (fileLog != " ") { + log_str = append (log_str, "IS_POINT_UPDATED," + i_IRLS + "," + accept_new_beta); + log_str = append (log_str, "TRUST_DELTA," + i_IRLS + "," + trust_delta); + } + if (accept_new_beta == 1) + { + beta = newbeta; log_l = new_log_l; deviance_nodisp = new_deviance_nodisp; isNaN_log_l = isNaN_new_log_l; + + [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + + # We introduced these variables to avoid roundoff errors: + # g_Y = y_residual / (y_var * link_grad); + # w = 1.0 / (y_var * link_grad * link_grad); + + gXY = - t(X) %*% g_Y; + g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ]; + g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); + + if (fileLog != " ") { + log_str = append (log_str, "GRADIENT_NORM," + i_IRLS + "," + g_norm); + } + } + + [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = + get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG); + + newbeta = beta + z; + + ssX_newbeta = diag (scale_X) %*% newbeta; + ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; + all_linear_terms = X %*% ssX_newbeta; + + [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part + (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + + if (isNaN_new_log_l == 0) { + new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); + new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); + } + + log_l_change = new_log_l - log_l; # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps + + if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & + (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) + { + termination_code = 1; + } + rho = - log_l_change / neg_log_l_change_predicted; + z_norm = sqrt (sum (z * z)); + + [z_norm_m, z_norm_e] = round_to_print (z_norm); + [trust_delta_m, trust_delta_e] = round_to_print (trust_delta); + [rho_m, rho_e] = round_to_print (rho); + [new_log_l_m, new_log_l_e] = round_to_print (new_log_l); + [log_l_change_m, log_l_change_e] = round_to_print (log_l_change); + [g_norm_m, g_norm_e] = round_to_print (g_norm); + + i_IRLS = i_IRLS + 1; + print ("Iter #" + i_IRLS + " completed" + + ", ||z|| = " + z_norm_m + "E" + z_norm_e + + ", trust_delta = " + trust_delta_m + "E" + trust_delta_e + + ", reached = " + reached_trust_boundary + + ", ||g|| = " + g_norm_m + "E" + g_norm_e + + ", new_log_l = " + new_log_l_m + "E" + new_log_l_e + + ", log_l_change = " + log_l_change_m + "E" + log_l_change_e + + ", rho = " + rho_m + "E" + rho_e); + + if (fileLog != " ") { + log_str = append (log_str, "NUM_CG_ITERS," + i_IRLS + "," + num_CG_iters); + log_str = append (log_str, "IS_TRUST_REACHED," + i_IRLS + "," + reached_trust_boundary); + log_str = append (log_str, "POINT_STEP_NORM," + i_IRLS + "," + z_norm); + log_str = append (log_str, "OBJECTIVE," + i_IRLS + "," + (- new_log_l)); + log_str = append (log_str, 
"OBJ_DROP_REAL," + i_IRLS + "," + log_l_change); + log_str = append (log_str, "OBJ_DROP_PRED," + i_IRLS + "," + (- neg_log_l_change_predicted)); + log_str = append (log_str, "OBJ_DROP_RATIO," + i_IRLS + "," + rho); + log_str = append (log_str, "LINEAR_TERM_MIN," + i_IRLS + "," + min (all_linear_terms)); + log_str = append (log_str, "LINEAR_TERM_MAX," + i_IRLS + "," + max (all_linear_terms)); + } + + if (i_IRLS == max_iteration_IRLS) { + termination_code = 2; + } +} + +beta = newbeta; +log_l = new_log_l; +deviance_nodisp = new_deviance_nodisp; + +if (termination_code == 1) { + print ("Converged in " + i_IRLS + " steps."); +} else { + print ("Did not converge."); +} + +ssX_beta = diag (scale_X) %*% beta; +ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta; +if (intercept_status == 2) { + beta_out = append (ssX_beta, beta); +} else { + beta_out = ssX_beta; +} + +write (beta_out, $10); + +if (intercept_status == 1 | intercept_status == 2) { + intercept_value = as.scalar (beta_out [num_features, 1]); + beta_noicept = beta_out [1 : (num_features - 1), 1]; +} else { + beta_noicept = beta_out [1 : num_features, 1]; +} +min_beta = min (beta_noicept); +max_beta = max (beta_noicept); +tmp_i_min_beta = rowIndexMin (t(beta_noicept)) +i_min_beta = as.scalar (tmp_i_min_beta [1, 1]); +tmp_i_max_beta = rowIndexMax (t(beta_noicept)) +i_max_beta = as.scalar (tmp_i_max_beta [1, 1]); + +##### OVER-DISPERSION PART ##### + +all_linear_terms = X %*% ssX_beta; +[g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + +pearson_residual_sq = g_Y ^ 2 / w; +pearson_residual_sq = replace (target = pearson_residual_sq, pattern = 0.0/0.0, replacement = 0); +# pearson_residual_sq = (y_residual ^ 2) / y_var; + +if (num_records > num_features) { + estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features); +} +if (dispersion <= 0.0) { + dispersion = estimated_dispersion; +} +deviance = deviance_nodisp / dispersion; + +##### END OF THE MAIN PART ##### + +} else { print ("Input matrices are out of range. Terminating the DML."); termination_code = 3; } +} else { print ("Distribution/Link not supported. 
Terminating the DML."); termination_code = 4; } + +str = "TERMINATION_CODE," + termination_code; +str = append (str, "BETA_MIN," + min_beta); +str = append (str, "BETA_MIN_INDEX," + i_min_beta); +str = append (str, "BETA_MAX," + max_beta); +str = append (str, "BETA_MAX_INDEX," + i_max_beta); +str = append (str, "INTERCEPT," + intercept_value); +str = append (str, "DISPERSION," + dispersion); +str = append (str, "DISPERSION_EST," + estimated_dispersion); +str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp); +str = append (str, "DEVIANCE_SCALED," + deviance); +print (str); + + + + +check_if_supported = + function (int ncol_y, int dist_type, double var_power, int link_type, double link_power) + return (int is_supported) +{ + is_supported = 0; + if (ncol_y == 1 & dist_type == 1 & link_type == 1) + { # POWER DISTRIBUTION + is_supported = 1; + if (var_power == 0.0 & link_power == -1.0) {print ("Gaussian.inverse"); } else { + if (var_power == 0.0 & link_power == 0.0) {print ("Gaussian.log"); } else { + if (var_power == 0.0 & link_power == 0.5) {print ("Gaussian.sqrt"); } else { + if (var_power == 0.0 & link_power == 1.0) {print ("Gaussian.id"); } else { + if (var_power == 0.0 ) {print ("Gaussian.power_nonlog"); } else { + if (var_power == 1.0 & link_power == -1.0) {print ("Poisson.inverse"); } else { + if (var_power == 1.0 & link_power == 0.0) {print ("Poisson.log"); } else { + if (var_power == 1.0 & link_power == 0.5) {print ("Poisson.sqrt"); } else { + if (var_power == 1.0 & link_power == 1.0) {print ("Poisson.id"); } else { + if (var_power == 1.0 ) {print ("Poisson.power_nonlog"); } else { + if (var_power == 2.0 & link_power == -1.0) {print ("Gamma.inverse"); } else { + if (var_power == 2.0 & link_power == 0.0) {print ("Gamma.log"); } else { + if (var_power == 2.0 & link_power == 0.5) {print ("Gamma.sqrt"); } else { + if (var_power == 2.0 & link_power == 1.0) {print ("Gamma.id"); } else { + if (var_power == 2.0 ) {print ("Gamma.power_nonlog"); } else { + if (var_power == 3.0 & link_power == -2.0) {print ("InvGaussian.1/mu^2"); } else { + if (var_power == 3.0 & link_power == -1.0) {print ("InvGaussian.inverse"); } else { + if (var_power == 3.0 & link_power == 0.0) {print ("InvGaussian.log"); } else { + if (var_power == 3.0 & link_power == 0.5) {print ("InvGaussian.sqrt"); } else { + if (var_power == 3.0 & link_power == 1.0) {print ("InvGaussian.id"); } else { + if (var_power == 3.0 ) {print ("InvGaussian.power_nonlog");}else{ + if ( link_power == 0.0) {print ("PowerDist.log"); } else { + print ("PowerDist.power_nonlog"); + } }}}}} }}}}} }}}}} }}}}} }} + if (ncol_y == 1 & dist_type == 2) + { + print ("Error: Bernoulli response matrix has not been converted into two-column format."); + } + if (ncol_y == 2 & dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + is_supported = 1; + if (link_type == 1 & link_power == -1.0) {print ("Binomial.inverse"); } else { + if (link_type == 1 & link_power == 0.0) {print ("Binomial.log"); } else { + if (link_type == 1 & link_power == 0.5) {print ("Binomial.sqrt"); } else { + if (link_type == 1 & link_power == 1.0) {print ("Binomial.id"); } else { + if (link_type == 1) {print ("Binomial.power_nonlog"); } else { + if (link_type == 2) {print ("Binomial.logit"); } else { + if (link_type == 3) {print ("Binomial.probit"); } else { + if (link_type == 4) {print ("Binomial.cloglog"); } else { + if (link_type == 5) {print ("Binomial.cauchit"); } + } }}}}} }}} + if (is_supported == 0) { + print ("Response matrix with " + ncol_y + 
" columns, distribution family (" + dist_type + ", " + var_power + + ") and link family (" + link_type + ", " + link_power + ") are NOT supported together."); + } +} + +glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG) +return (Matrix[double] beta, double saturated_log_l, int isNaN) +{ + saturated_log_l = 0.0; + isNaN = 0; + y_corr = Y [, 1]; + if (dist_type == 2) { + n_corr = rowSums (Y); + is_n_zero = ppred (n_corr, 0.0, "=="); + y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero; + } + linear_terms = y_corr; + if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION + if (link_power == 0.0) { + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + is_zero_y_corr = ppred (y_corr, 0.0, "=="); + linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { isNaN = 1; } + } else { if (link_power == 1.0) { + linear_terms = y_corr; + } else { if (link_power == -1.0) { + linear_terms = 1.0 / y_corr; + } else { if (link_power == 0.5) { + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + linear_terms = sqrt (y_corr); + } else { isNaN = 1; } + } else { if (link_power > 0.0) { + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + is_zero_y_corr = ppred (y_corr, 0.0, "=="); + linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; + } else { isNaN = 1; } + } else { + if (sum (ppred (y_corr, 0.0, "<=")) == 0) { + linear_terms = y_corr ^ link_power; + } else { isNaN = 1; } + }}}}} + } + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + if (link_type == 1 & link_power == 0.0) { # Binomial.log + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + is_zero_y_corr = ppred (y_corr, 0.0, "=="); + linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { isNaN = 1; } + } else { if (link_type == 1 & link_power > 0.0) { # Binomial.power_nonlog pos + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + is_zero_y_corr = ppred (y_corr, 0.0, "=="); + linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; + } else { isNaN = 1; } + } else { if (link_type == 1) { # Binomial.power_nonlog neg + if (sum (ppred (y_corr, 0.0, "<=")) == 0) { + linear_terms = y_corr ^ link_power; + } else { isNaN = 1; } + } else { + is_zero_y_corr = ppred (y_corr, 0.0, "<="); + is_one_y_corr = ppred (y_corr, 1.0, ">="); + y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr); + if (link_type == 2) { # Binomial.logit + linear_terms = log (y_corr / (1.0 - y_corr)) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 3) { # Binomial.probit + y_below_half = y_corr + (1.0 - 2.0 * y_corr) * ppred (y_corr, 0.5, ">"); + t = sqrt (- 2.0 * log (y_below_half)); + approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308))); + linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * ppred (y_corr, 0.5, ">")) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 4) { # Binomial.cloglog + linear_terms = log (- log (1.0 - y_corr)) + - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 5) { # Binomial.cauchit + linear_terms = tan ((y_corr - 0.5) * 
3.1415926535897932384626433832795) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + }} }}}}} + } + + if (isNaN == 0) { + [saturated_log_l, isNaN] = + glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power); + } + + if ((dist_type == 1 & link_type == 1 & link_power == 0.0) | + (dist_type == 2 & link_type >= 2)) + { + desired_eta = 0.0; + } else { if (link_type == 1 & link_power == 0.0) { + desired_eta = log (0.5); + } else { if (link_type == 1) { + desired_eta = 0.5 ^ link_power; + } else { + desired_eta = 0.5; + }}} + + beta = matrix (0.0, rows = ncol(X), cols = 1); + + if (desired_eta != 0.0) { + if (icept_status == 1 | icept_status == 2) { + beta [nrow(beta), 1] = desired_eta; + } else { + # We want: avg (X %*% ssX_transform %*% beta) = desired_eta + # Note that "ssX_transform" is trivial here, hence ignored + + beta = straightenX (X, 0.000001, max_iter_CG); + beta = beta * desired_eta; +} } } + + +glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, + int dist_type, double var_power, int link_type, double link_power) + return (Matrix[double] g_Y, Matrix[double] w) + # ORIGINALLY we returned more meaningful vectors, namely: + # Matrix[double] y_residual : y - y_mean, i.e. y observed - y predicted + # Matrix[double] link_gradient : derivative of the link function + # Matrix[double] var_function : variance without dispersion, i.e. the V(mu) function + # BUT, this caused roundoff errors, so we had to compute "directly useful" vectors + # and skip over the "meaningful intermediaries". Now we output these two variables: + # g_Y = y_residual / (var_function * link_gradient); + # w = 1.0 / (var_function * link_gradient ^ 2); +{ + num_records = nrow (linear_terms); + zeros_r = matrix (0.0, rows = num_records, cols = 1); + ones_r = 1 + zeros_r; + g_Y = zeros_r; + w = zeros_r; + + # Some constants + + one_over_sqrt_two_pi = 0.39894228040143267793994605993438; + ones_2 = matrix (1.0, rows = 1, cols = 2); + p_one_m_one = ones_2; + p_one_m_one [1, 2] = -1.0; + m_one_p_one = ones_2; + m_one_p_one [1, 1] = -1.0; + zero_one = ones_2; + zero_one [1, 1] = 0.0; + one_zero = ones_2; + one_zero [1, 2] = 0.0; + flip_pos = matrix (0, rows = 2, cols = 2); + flip_neg = flip_pos; + flip_pos [1, 2] = 1; + flip_pos [2, 1] = 1; + flip_neg [1, 2] = -1; + flip_neg [2, 1] = 1; + + if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION + y_mean = zeros_r; + if (link_power == 0.0) { + y_mean = exp (linear_terms); + y_mean_pow = y_mean ^ (1 - var_power); + w = y_mean_pow * y_mean; + g_Y = y_mean_pow * (Y - y_mean); + } else { if (link_power == 1.0) { + y_mean = linear_terms; + w = y_mean ^ (- var_power); + g_Y = w * (Y - y_mean); + } else { + y_mean = linear_terms ^ (1.0 / link_power); + c1 = (1 - var_power) / link_power - 1; + c2 = (2 - var_power) / link_power - 2; + g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power; + w = (linear_terms ^ c2) / (link_power ^ 2); + } }} + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + if (link_type == 1) { # BINOMIAL.POWER LINKS + if (link_power == 0.0) { # Binomial.log + vec1 = 1 / (exp (- linear_terms) - 1); + g_Y = Y [, 1] - Y [, 2] * vec1; + w = rowSums (Y) * vec1; + } else { # Binomial.nonlog + vec1 = zeros_r; + if (link_power == 0.5) { + vec1 = 1 / (1 - linear_terms ^ 2); + } else { if (sum (ppred (linear_terms, 0.0, "<")) == 0) { + vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power)); + } else {isNaN = 1;}} + # 
We want a "zero-protected" version of + # vec2 = Y [, 1] / linear_terms; + is_y_0 = ppred (Y [, 1], 0.0, "=="); + vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0; + g_Y = (vec2 - Y [, 2] * vec1 * linear_terms) / link_power; + w = rowSums (Y) * vec1 / link_power ^ 2; + } + } else { + is_LT_pos_infinite = ppred (linear_terms, 1.0/0.0, "=="); + is_LT_neg_infinite = ppred (linear_terms, -1.0/0.0, "=="); + is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; + finite_linear_terms = replace (target = linear_terms, pattern = 1.0/0.0, replacement = 0); + finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0); + if (link_type == 2) { # Binomial.logit + Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; + Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; + g_Y = rowSums (Y * (Y_prob %*% flip_neg)); ### = y_residual; + w = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob); ### = y_variance; + } else { if (link_type == 3) { # Binomial.probit + is_lt_pos = ppred (linear_terms, 0.0, ">="); + t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) + pt_gp = t_gp * ( 0.254829592 + + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 + + t_gp * (-1.453152027 + + t_gp * 1.061405429)))); + the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0); + vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp); + vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5); + w = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1; + g_Y = one_over_sqrt_two_pi * vec2 / vec1; + } else { if (link_type == 4) { # Binomial.cloglog + the_exp = exp (linear_terms) + the_exp_exp = exp (- the_exp); + is_too_small = ppred (10000000 + the_exp, 10000000, "=="); + the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2); + g_Y = (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio; + w = the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio; + } else { if (link_type == 5) { # Binomial.cauchit + Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795; + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; + y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1]; + var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2]; + link_gradient_normalized = (1 + linear_terms ^ 2) * 3.1415926535897932384626433832795; + g_Y = rowSums (Y) * y_residual / (var_function * link_gradient_normalized); + w = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2); + }}}} + } + } +} + + +glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y, + int dist_type, double var_power, int link_type, double link_power) + return (double log_l, int isNaN) +{ + isNaN = 0; + log_l = 0.0; + num_records = nrow (Y); + zeros_r = matrix (0.0, rows = num_records, cols = 1); + + if (dist_type == 1 & link_type == 1) + { # POWER DISTRIBUTION + b_cumulant = zeros_r; + natural_parameters = zeros_r; + is_natural_parameter_log_zero = zeros_r; + if (var_power == 1.0 & link_power == 0.0) { # Poisson.log + b_cumulant = exp (linear_terms); + is_natural_parameter_log_zero = ppred 
(linear_terms, -1.0/0.0, "=="); + natural_parameters = replace (target = linear_terms, pattern = -1.0/0.0, replacement = 0); + } else { if (var_power == 1.0 & link_power == 1.0) { # Poisson.id + if (sum (ppred (linear_terms, 0.0, "<")) == 0) { + b_cumulant = linear_terms; + is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "=="); + natural_parameters = log (linear_terms + is_natural_parameter_log_zero); + } else {isNaN = 1;} + } else { if (var_power == 1.0 & link_power == 0.5) { # Poisson.sqrt + if (sum (ppred (linear_terms, 0.0, "<")) == 0) { + b_cumulant = linear_terms ^ 2; + is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "=="); + natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero); + } else {isNaN = 1;} + } else { if (var_power == 1.0 & link_power > 0.0) { # Poisson.power_nonlog, pos + if (sum (ppred (linear_terms, 0.0, "<")) == 0) { + is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "=="); + b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero; + natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power; + } else {isNaN = 1;} + } else { if (var_power == 1.0) { # Poisson.power_nonlog, neg + if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { + b_cumulant = linear_terms ^ (1.0 / link_power); + natural_parameters = log (linear_terms) / link_power; + } else {isNaN = 1;} + } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse + if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { + b_cumulant = - log (linear_terms); + natural_parameters = - linear_terms; + } else {isNaN = 1;} + } else { if (var_power == 2.0 & link_power == 1.0) { # Gamma.id + if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { + b_cumulant = log (linear_terms); + natural_parameters = - 1.0 / linear_terms; + } else {isNaN = 1;} + } else { if (var_power == 2.0 & link_power == 0.0) { # Gamma.log + b_cumulant = linear_terms; + natural_parameters = - exp (- linear_terms); + } else { if (var_power == 2.0) { # Gamma.power_nonlog + if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { + b_cumulant = log (linear_terms) / link_power; + natural_parameters = - linear_terms ^ (- 1.0 / link_power); + } else {isNaN = 1;} + } else { if (link_power == 0.0) { # PowerDist.log + natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power); + b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power); + } else { # PowerDist.power_nonlog + if (-2 * link_power == 1.0 - var_power) { + natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power); + } else { if (-1 * link_power == 1.0 - var_power) { + natural_parameters = 1.0 / linear_terms / (1.0 - var_power); + } else { if ( link_power == 1.0 - var_power) { + natural_parameters = linear_terms / (1.0 - var_power); + } else { if ( 2 * link_power == 1.0 - var_power) { + natural_parameters = linear_terms ^ 2 / (1.0 - var_power); + } else { + if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { + power = (1.0 - var_power) / link_power; + natural_parameters = (linear_terms ^ power) / (1.0 - var_power); + } else {isNaN = 1;} + }}}} + if (-2 * link_power == 2.0 - var_power) { + b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power); + } else { if (-1 * link_power == 2.0 - var_power) { + b_cumulant = 1.0 / linear_terms / (2.0 - var_power); + } else { if ( link_power == 2.0 - var_power) { + b_cumulant = linear_terms / (2.0 - var_power); + } else { if ( 2 * link_power == 2.0 - var_power) { + b_cumulant = linear_terms ^ 2 / (2.0 
- var_power); + } else { + if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { + power = (2.0 - var_power) / link_power; + b_cumulant = (linear_terms ^ power) / (2.0 - var_power); + } else {isNaN = 1;} + }}}} + }}}}} }}}}} + if (sum (is_natural_parameter_log_zero * abs (Y)) > 0.0) { + log_l = -1.0 / 0.0; + isNaN = 1; + } + if (isNaN == 0) + { + log_l = sum (Y * natural_parameters - b_cumulant); + if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { + log_l = -1.0 / 0.0; + isNaN = 1; + } } } + + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + + [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power); + + if (isNaN == 0) { + does_prob_contradict = ppred (Y_prob, 0.0, "<="); + if (sum (does_prob_contradict * abs (Y)) == 0.0) { + log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict)); + if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { + isNaN = 1; + } + } else { + log_l = -1.0 / 0.0; + isNaN = 1; + } } } + + if (isNaN == 1) { + log_l = - 1.0 / 0.0; + } +} + + + +binomial_probability_two_column = + function (Matrix[double] linear_terms, int link_type, double link_power) + return (Matrix[double] Y_prob, int isNaN) +{ + isNaN = 0; + num_records = nrow (linear_terms); + + # Define some auxiliary matrices + + ones_2 = matrix (1.0, rows = 1, cols = 2); + p_one_m_one = ones_2; + p_one_m_one [1, 2] = -1.0; + m_one_p_one = ones_2; + m_one_p_one [1, 1] = -1.0; + zero_one = ones_2; + zero_one [1, 1] = 0.0; + one_zero = ones_2; + one_zero [1, 2] = 0.0; + + zeros_r = matrix (0.0, rows = num_records, cols = 1); + ones_r = 1.0 + zeros_r; + + # Begin the function body + + Y_prob = zeros_r %*% ones_2; + if (link_type == 1) { # Binomial.power + if (link_power == 0.0) { # Binomial.log + Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one; + } else { if (link_power == 0.5) { # Binomial.sqrt + Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one; + } else { # Binomial.power_nonlog + if (sum (ppred (linear_terms, 0.0, "<")) == 0) { + Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one; + } else {isNaN = 1;} + }} + } else { # Binomial.non_power + is_LT_pos_infinite = ppred (linear_terms, 1.0/0.0, "=="); + is_LT_neg_infinite = ppred (linear_terms, -1.0/0.0, "=="); + is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; + finite_linear_terms = replace (target = linear_terms, pattern = 1.0/0.0, replacement = 0); + finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0); + if (link_type == 2) { # Binomial.logit + Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; + Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); + } else { if (link_type == 3) { # Binomial.probit + lt_pos_neg = ppred (finite_linear_terms, 0.0, ">=") %*% p_one_m_one + ones_r %*% zero_one; + t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) + pt_gp = t_gp * ( 0.254829592 + + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, + + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 
299 + + t_gp * (-1.453152027 + + t_gp * 1.061405429)))); + the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0); + Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg); + } else { if (link_type == 4) { # Binomial.cloglog + the_exp = exp (finite_linear_terms); + the_exp_exp = exp (- the_exp); + is_too_small = ppred (10000000 + the_exp, 10000000, "=="); + Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2); + Y_prob [, 2] = the_exp_exp; + } else { if (link_type == 5) { # Binomial.cauchit + Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795; + } else { + isNaN = 1; + }}}} + Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; +} } + + +# THE CG-STEIHAUG PROCEDURE SCRIPT + +# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize +# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z +# under constraint: ||z|| <= trust_delta. +# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright +# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform +# is given separately because sparse "X" may become dense after applying the transform. +# +get_CG_Steihaug_point = + function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w, + Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG) + return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary) +{ + trust_delta_sq = trust_delta ^ 2; + size_CG = nrow (g); + z = matrix (0.0, rows = size_CG, cols = 1); + neg_log_l_change = 0.0; + reached_trust_boundary = 0; + g_reg = g + lambda * beta; + r_CG = g_reg; + p_CG = -r_CG; + rr_CG = sum(r_CG * r_CG); + eps_CG = rr_CG * min (0.25, sqrt (rr_CG)); + converged_CG = 0; + if (rr_CG < eps_CG) { + converged_CG = 1; + } + + max_iteration_CG = max_iter_CG; + if (max_iteration_CG <= 0) { + max_iteration_CG = size_CG; + } + i_CG = 0; + while (converged_CG == 0) + { + i_CG = i_CG + 1; + ssX_p_CG = diag (scale_X) %*% p_CG; + ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG; + temp_CG = t(X) %*% (w * (X %*% ssX_p_CG)); + q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ]; + pq_CG = sum (p_CG * q_CG); + if (pq_CG <= 0) { + pp_CG = sum (p_CG * p_CG); + if (pp_CG > 0) { + [z, neg_log_l_change] = + get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); + reached_trust_boundary = 1; + } else { + neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); + } + converged_CG = 1; + } + if (converged_CG == 0) { + alpha_CG = rr_CG / pq_CG; + new_z = z + alpha_CG * p_CG; + if (sum(new_z * new_z) >= trust_delta_sq) { + pp_CG = sum (p_CG * p_CG); + [z, neg_log_l_change] = + get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); + reached_trust_boundary = 1; + converged_CG = 1; + } + if (converged_CG == 0) { + z = new_z; + old_rr_CG = rr_CG; + r_CG = r_CG + alpha_CG * q_CG; + rr_CG = sum(r_CG * r_CG); + if (i_CG == max_iteration_CG | rr_CG < eps_CG) { + neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); + reached_trust_boundary = 0; + converged_CG = 1; + } + if (converged_CG == 0) { + p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG; +} } } } } + + +# An auxiliary function used twice inside the CG-STEIHAUG loop: +get_trust_boundary_point = + function (Matrix[double] g, Matrix[double] z, 
Matrix[double] p, + Matrix[double] q, Matrix[double] r, double pp, double pq, + double trust_delta_sq) + return (Matrix[double] new_z, double f_change) +{ + zz = sum (z * z); pz = sum (p * z); + sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq)); + tau_1 = (- pz + sq_root_d) / pp; + tau_2 = (- pz - sq_root_d) / pp; + zq = sum (z * q); gp = sum (g * p); + f_extra = 0.5 * sum (z * (r + g)); + f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1; + f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2; + if (f_change_1 < f_change_2) { + new_z = z + (tau_1 * p); + f_change = f_change_1; + } + else { + new_z = z + (tau_2 * p); + f_change = f_change_2; + } +} + + +# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 +# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale +# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). +straightenX = + function (Matrix[double] X, double eps, int max_iter_CG) + return (Matrix[double] w) +{ + w_X = t(colSums(X)); + lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); + eps_LS = eps * nrow(X); + + # BEGIN LEAST SQUARES + + r_LS = - w_X; + z_LS = matrix (0.0, rows = ncol(X), cols = 1); + p_LS = - r_LS; + norm_r2_LS = sum (r_LS ^ 2); + i_LS = 0; + while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS) + { + q_LS = t(X) %*% X %*% p_LS; + q_LS = q_LS + lambda_LS * p_LS; + alpha_LS = norm_r2_LS / sum (p_LS * q_LS); + z_LS = z_LS + alpha_LS * p_LS; + old_norm_r2_LS = norm_r2_LS; + r_LS = r_LS + alpha_LS * q_LS; + norm_r2_LS = sum (r_LS ^ 2); + p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; + i_LS = i_LS + 1; + } + + # END LEAST SQUARES + + w = (nrow(X) / sum (w_X * z_LS)) * z_LS; +} + + +round_to_print = function (double x_to_truncate) +return (double mantissa, int eee) +{ + mantissa = 1.0; + eee = 0; + positive_infinity = 1.0 / 0.0; + x = abs (x_to_truncate); + if (x != x / 2.0) { + log_ten = log (10.0); + d_eee = round (log (x) / log_ten - 0.5); + mantissa = round (x * exp (log_ten * (4.0 - d_eee))) / 10000; + if (mantissa == 10.0) { + mantissa = 1.0; + d_eee = d_eee + 1; + } + if (x_to_truncate < 0.0) { + mantissa = - mantissa; + } + eee = 0; + pow_two = 1; + res_eee = abs (d_eee); + while (res_eee != 0.0) { + new_res_eee = round (res_eee / 2.0 - 0.3); + if (new_res_eee * 2.0 < res_eee) { + eee = eee + pow_two; + } + res_eee = new_res_eee; + pow_two = 2 * pow_two; + } + if (d_eee < 0.0) { + eee = - eee; + } + } else { mantissa = x_to_truncate; } +}
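The CG-Steihaug procedure in the GLM script above is easier to follow once the shift/scale bookkeeping is stripped away. The following R sketch (an illustration under the assumption of a small explicit Hessian H; it is not the committed implementation) runs the same truncated-CG iteration for minimizing 0.5 * t(z) %*% H %*% z + t(g) %*% z subject to ||z|| <= delta. For brevity it always takes the positive root of the boundary equation, whereas get_trust_boundary_point compares the objective value at both roots:

    cg_steihaug <- function(H, g, delta, max_iter = nrow(H)) {
      z <- rep(0, length(g))
      r <- g                                 # residual r = H %*% z + g; z = 0 initially
      p <- -r                                # first direction: steepest descent
      rr <- sum(r * r)
      boundary_tau <- function(z, p) {       # positive tau with ||z + tau * p|| = delta
        pp <- sum(p * p); pz <- sum(p * z); zz <- sum(z * z)
        (-pz + sqrt(pz^2 - pp * (zz - delta^2))) / pp
      }
      for (i in seq_len(max_iter)) {
        Hp <- as.vector(H %*% p)
        pq <- sum(p * Hp)
        if (pq <= 0) {                       # negative curvature: walk to the boundary
          return(z + boundary_tau(z, p) * p)
        }
        alpha <- rr / pq
        z_new <- z + alpha * p
        if (sum(z_new * z_new) >= delta^2) { # full CG step would leave the trust region
          return(z + boundary_tau(z, p) * p)
        }
        z <- z_new
        r <- r + alpha * Hp
        rr_new <- sum(r * r)
        if (sqrt(rr_new) < 1e-12) break      # converged strictly inside the region
        p <- -r + (rr_new / rr) * p
        rr <- rr_new
      }
      z
    }

    # Tiny usage example: the unconstrained minimizer of this quadratic lies
    # outside delta = 0.3, so the returned step lands exactly on the boundary.
    H <- matrix(c(4, 1, 1, 3), 2, 2)
    g <- c(-1, -2)
    z <- cg_steihaug(H, g, delta = 0.3)
    stopifnot(abs(sqrt(sum(z * z)) - 0.3) < 1e-12)

The two early returns correspond to the pq_CG <= 0 and ||new_z|| >= trust_delta branches of get_CG_Steihaug_point; the committed script additionally folds the shift/scale transform into every Hessian-vector product so that X is never materialized in transformed form.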
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_Kmeans.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/codegen/Algorithm_Kmeans.dml b/src/test/scripts/functions/codegen/Algorithm_Kmeans.dml new file mode 100644 index 0000000..9bdfab0 --- /dev/null +++ b/src/test/scripts/functions/codegen/Algorithm_Kmeans.dml @@ -0,0 +1,243 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +X = read($1) +num_centroids = $2 +num_runs = $3 +eps = $4; +max_iter = $5; + +is_write_Y = 0; +is_verbose = 0; +avg_sample_size_per_centroid = 50; + +print ("BEGIN K-MEANS SCRIPT"); +print ("Reading X..."); +num_records = nrow (X); +num_features = ncol (X); + +sumXsq = sum (X ^ 2); +# Remark - A useful rewrite: sum (A %*% B) = sum (t(colSums(A)) * rowSums(B)) + +# STEP 1: INITIALIZE CENTROIDS FOR ALL RUNS FROM DATA SAMPLES: + +print ("Taking data samples for initialization..."); + +[sample_maps, samples_vs_runs_map, sample_block_size] = + get_sample_maps (num_records, num_runs, num_centroids * avg_sample_size_per_centroid); + +is_row_in_samples = rowSums (sample_maps); +X_samples = sample_maps %*% X; +X_samples_sq_norms = rowSums (X_samples ^ 2); + +print ("Initializing the centroids for all runs..."); +All_Centroids = matrix (0, rows = (num_runs * num_centroids), cols = num_features); + +# We select centroids according to the k-Means++ heuristic applied to a sample of X +# Loop invariant: min_distances ~ sq.distances from X_sample rows to nearest centroids, +# with the out-of-range X_sample positions in min_distances set to 0.0 + +min_distances = is_row_in_samples; # Pick the 1-st centroids uniformly at random + +for (i in 1 : num_centroids) +{ + # "Matricize" and prefix-sum to compute the cumulative distribution function: + min_distances_matrix_form = + matrix (min_distances, rows = sample_block_size, cols = num_runs, byrow = FALSE); + cdf_min_distances = cumsum (min_distances_matrix_form); + + # Select the i-th centroid in each sample as a random sample row id with + # probability ~ min_distances: + random_row = Rand (rows = 1, cols = num_runs, min = 0.0, max = 1.0); + threshold_matrix = random_row * cdf_min_distances [sample_block_size, ]; + centroid_ids = t(colSums (cdf_min_distances < threshold_matrix)) + 1; + + # Place the selected centroids together, one per run, into a matrix: + centroid_placer = matrix (0, rows = num_runs, cols = (sample_block_size * num_runs)); + centroid_placer_raw = + table (seq (1, num_runs, 1), sample_block_size * seq (0, num_runs - 1, 1) + centroid_ids); + centroid_placer [, 1 : ncol 
(centroid_placer_raw)] = centroid_placer_raw; + centroids = centroid_placer %*% X_samples; + + # Place the selected centroids into their appropriate slots in All_Centroids: + centroid_placer = matrix (0, rows = nrow (All_Centroids), cols = num_runs); + centroid_placer_raw = + table (seq (i, num_centroids * (num_runs - 1) + i, num_centroids), seq (1, num_runs, 1)); + centroid_placer [1 : nrow (centroid_placer_raw), ] = centroid_placer_raw; + All_Centroids = All_Centroids + centroid_placer %*% centroids; + + # Update min_distances to preserve the loop invariant: + distances = X_samples_sq_norms + samples_vs_runs_map %*% rowSums (centroids ^ 2) + - 2 * rowSums (X_samples * (samples_vs_runs_map %*% centroids)); + if (i == 1) { + min_distances = is_row_in_samples * distances; + } else { + min_distances = min (min_distances, distances); +} } + +# STEP 2: PERFORM K-MEANS ITERATIONS FOR ALL RUNS: + +termination_code = matrix (0, rows = num_runs, cols = 1); +final_wcss = matrix (0, rows = num_runs, cols = 1); +num_iterations = matrix (0, rows = num_runs, cols = 1); + +print ("Performing k-means iterations for all runs..."); + +parfor (run_index in 1 : num_runs, check = 0) +{ + C = All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ]; + C_old = C; + iter_count = 0; + term_code = 0; + wcss = 0; + + while (term_code == 0) + { + # Compute Euclidean squared distances from records (X rows) to centroids (C rows) + # without the C-independent term, then take the minimum for each record + D = -2 * (X %*% t(C)) + t(rowSums (C ^ 2)); + minD = rowMins (D); + # Compute the current centroid-based within-cluster sum of squares (WCSS) + wcss_old = wcss; + wcss = sumXsq + sum (minD); + if (is_verbose == 1) { + if (iter_count == 0) { + print ("Run " + run_index + ", At Start-Up: Centroid WCSS = " + wcss); + } else { + print ("Run " + run_index + ", Iteration " + iter_count + ": Centroid WCSS = " + wcss + + "; Centroid change (avg.sq.dist.) 
= " + (sum ((C - C_old) ^ 2) / num_centroids)); + } } + # Check if convergence or maximum iteration has been reached + if (wcss_old - wcss < eps * wcss & iter_count > 0) { + term_code = 1; # Convergence is reached + } else { + if (iter_count >= max_iter) { + term_code = 2; # Maximum iteration is reached + } else { + iter_count = iter_count + 1; + # Find the closest centroid for each record + P = (D <= minD); + # If some records belong to multiple centroids, share them equally + P = P / rowSums (P); + # Compute the column normalization factor for P + P_denom = colSums (P); + if (sum (P_denom <= 0.0) > 0) { + term_code = 3; # There is a "runaway" centroid with 0.0 denominator + } else { + C_old = C; + # Compute new centroids as weighted averages over the records + C = (t(P) %*% X) / t(P_denom); + } } } } + print ("Run " + run_index + ", Iteration " + iter_count + ": Terminated with code = " + term_code + ", Centroid WCSS = " + wcss); + All_Centroids [(num_centroids * (run_index - 1) + 1) : (num_centroids * run_index), ] = C; + final_wcss [run_index, 1] = wcss; + termination_code [run_index, 1] = term_code; + num_iterations [run_index, 1] = iter_count; +} + +# STEP 3: SELECT THE RUN WITH BEST CENTROID-WCSS AND OUTPUT ITS CENTROIDS: + +termination_bitmap = matrix (0, rows = num_runs, cols = 3); +termination_bitmap_raw = table (seq (1, num_runs, 1), termination_code); +termination_bitmap [, 1 : ncol(termination_bitmap_raw)] = termination_bitmap_raw; +termination_stats = colSums (termination_bitmap); +print ("Number of successful runs = " + as.integer (as.scalar (termination_stats [1, 1]))); +print ("Number of incomplete runs = " + as.integer (as.scalar (termination_stats [1, 2]))); +print ("Number of failed runs (with lost centroids) = " + as.integer (as.scalar (termination_stats [1, 3]))); + +num_successful_runs = as.scalar (termination_stats [1, 1]); +if (num_successful_runs > 0) { + final_wcss_successful = final_wcss * termination_bitmap [, 1]; + worst_wcss = max (final_wcss_successful); + best_wcss = min (final_wcss_successful + (10 * worst_wcss + 10) * (1 - termination_bitmap [, 1])); + avg_wcss = sum (final_wcss_successful) / num_successful_runs; + best_index_vector = (final_wcss_successful == best_wcss); + aggr_best_index_vector = cumsum (best_index_vector); + best_index = as.integer (sum (aggr_best_index_vector == 0) + 1); + print ("Successful runs: Best run is " + best_index + " with Centroid WCSS = " + best_wcss + + "; Avg WCSS = " + avg_wcss + "; Worst WCSS = " + worst_wcss); + C = All_Centroids [(num_centroids * (best_index - 1) + 1) : (num_centroids * best_index), ]; + print ("Writing out the best-WCSS centroids..."); + write (C, $6, format="text"); + print ("DONE."); +} else { + stop ("No output is produced. 
Try increasing the number of iterations and/or runs."); +} + + + +get_sample_maps = function (int num_records, int num_samples, int approx_sample_size) + return (Matrix[double] sample_maps, Matrix[double] sample_col_map, int sample_block_size) +{ + if (approx_sample_size < num_records) { + # Input value "approx_sample_size" is the average sample size; increase it by ~10 std.dev's + # to get the sample block size (to allocate space): + sample_block_size = as.integer (approx_sample_size + round (10 * sqrt (approx_sample_size))); + num_rows = sample_block_size * num_samples; + + # Generate all samples in parallel by converting uniform random values into random + # integer skip-ahead intervals and prefix-summing them: + sample_rec_ids = Rand (rows = sample_block_size, cols = num_samples, min = 0.0, max = 1.0); + sample_rec_ids = round (log (sample_rec_ids) / log (1.0 - approx_sample_size / num_records) + 0.5); + # Prob [k-1 < log(uniform)/log(1-p) < k] = p*(1-p)^(k-1) = Prob [k-1 zeros before a one] + sample_rec_ids = cumsum (sample_rec_ids); # (skip to next one) --> (skip to i-th one) + + # Replace all sample record ids over "num_records" (i.e. out of range) by "num_records + 1": + is_sample_rec_id_within_range = (sample_rec_ids <= num_records); + sample_rec_ids = sample_rec_ids * is_sample_rec_id_within_range + + (num_records + 1) * (1 - is_sample_rec_id_within_range); + + # Rearrange all samples (and their out-of-range indicators) into one column-vector: + sample_rec_ids = + matrix (sample_rec_ids, rows = num_rows, cols = 1, byrow = FALSE); + is_row_in_samples = + matrix (is_sample_rec_id_within_range, rows = num_rows, cols = 1, byrow = FALSE); + + # Use contingency table to create the "sample_maps" matrix that is a vertical concatenation + # of 0-1-matrices, one per sample, each with 1s at (i, sample_record[i]) and 0s elsewhere: + sample_maps_raw = table (seq (1, num_rows), sample_rec_ids); + max_rec_id = ncol (sample_maps_raw); + if (max_rec_id >= num_records) { + sample_maps = sample_maps_raw [, 1 : num_records]; + } else { + sample_maps = matrix (0, rows = num_rows, cols = num_records); + sample_maps [, 1 : max_rec_id] = sample_maps_raw; + } + + # Create a 0-1-matrix that maps each sample column ID into all row positions of the + # corresponding sample; map out-of-sample-range positions to row id = num_rows + 1: + sample_positions = (num_rows + 1) - is_row_in_samples * seq (num_rows, 1, -1); + # Column ID positions = 1, 1, ..., 1, 2, 2, ..., 2, . . . 
, n_c, n_c, ..., n_c: + col_positions = round (0.5 + seq (0, num_rows - 1, 1) / sample_block_size); + sample_col_map = table (sample_positions, col_positions); + # Remove the out-of-sample-range positions by cutting off the last row: + sample_col_map = sample_col_map [1 : (num_rows), ]; + + } else { + one_per_record = matrix (1, rows = num_records, cols = 1); + sample_block_size = num_records; + sample_maps = matrix (0, rows = (num_records * num_samples), cols = num_records); + sample_col_map = matrix (0, rows = (num_records * num_samples), cols = num_samples); + for (i in 1:num_samples) { + sample_maps [(num_records * (i - 1) + 1) : (num_records * i), ] = diag (one_per_record); + sample_col_map [(num_records * (i - 1) + 1) : (num_records * i), i] = one_per_record; +} } } + http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_L2SVM.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/codegen/Algorithm_L2SVM.R b/src/test/scripts/functions/codegen/Algorithm_L2SVM.R new file mode 100644 index 0000000..36e844e --- /dev/null +++ b/src/test/scripts/functions/codegen/Algorithm_L2SVM.R @@ -0,0 +1,98 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +args <- commandArgs(TRUE) +library("Matrix") + +X = readMM(paste(args[1], "X.mtx", sep="")); +Y = readMM(paste(args[1], "Y.mtx", sep="")); +intercept = as.integer(args[2]); +epsilon = as.double(args[3]); +lambda = 0.001; +maxiterations = as.integer(args[4]); + +check_min = min(Y) +check_max = max(Y) +num_min = sum(Y == check_min) +num_max = sum(Y == check_max) +if(num_min + num_max != nrow(Y)){ + print("please check Y, it should contain only 2 labels") +}else{ + if(check_min != -1 | check_max != +1) + Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min) +} + +N = nrow(X) +D = ncol(X) + +if (intercept == 1) { + ones = matrix(1,N,1) + X = cbind(X, ones); +} + +num_rows_in_w = D +if(intercept == 1){ + num_rows_in_w = num_rows_in_w + 1 +} +w = matrix(0, num_rows_in_w, 1) + +g_old = t(X) %*% Y +s = g_old + +Xw = matrix(0,nrow(X),1) +iter = 0 +continue = TRUE +while(continue && iter < maxiterations){ + t = 0 + Xd = X %*% s + wd = lambda * sum(w * s) + dd = lambda * sum(s * s) + continue1 = TRUE + while(continue1){ + tmp_Xw = Xw + t*Xd + out = 1 - Y * (tmp_Xw) + sv = which(out > 0) + g = wd + t*dd - sum(out[sv] * Y[sv] * Xd[sv]) + h = dd + sum(Xd[sv] * Xd[sv]) + t = t - g/h + continue1 = (g*g/h >= 1e-10) + } + + w = w + t*s + Xw = Xw + t*Xd + + out = 1 - Y * (X %*% w) + sv = which(out > 0) + obj = 0.5 * sum(out[sv] * out[sv]) + lambda/2 * sum(w * w) + g_new = t(X[sv,]) %*% (out[sv] * Y[sv]) - lambda * w + + print(paste("OBJ : ", obj)) + + continue = (t*sum(s * g_old) >= epsilon*obj) + + be = sum(g_new * g_new)/sum(g_old * g_old) + s = be * s + g_new + g_old = g_new + + iter = iter + 1 +} + +writeMM(as(w,"CsparseMatrix"), paste(args[5], "w", sep="")); http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_L2SVM.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/codegen/Algorithm_L2SVM.dml b/src/test/scripts/functions/codegen/Algorithm_L2SVM.dml new file mode 100644 index 0000000..9a6a631 --- /dev/null +++ b/src/test/scripts/functions/codegen/Algorithm_L2SVM.dml @@ -0,0 +1,106 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +X = read($1) +Y = read($2) +intercept = $3; +eps = $4; +maxiter = $5; + +check_min = min(Y) +check_max = max(Y) +num_min = sum(ppred(Y, check_min, "==")) +num_max = sum(ppred(Y, check_max, "==")) +if(num_min + num_max != nrow(Y)) print("please check Y, it should contain only 2 labels") +else{ + if(check_min != -1 | check_max != +1) + Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min) +} + +epsilon = eps +lambda = 0.001 +maxiterations = maxiter +num_samples = nrow(X) +dimensions = ncol(X) + +if (intercept == 1) { + ones = matrix(1, rows=num_samples, cols=1) + X = append(X, ones); +} + +num_rows_in_w = dimensions +if(intercept == 1){ + num_rows_in_w = num_rows_in_w + 1 +} +w = matrix(0, rows=num_rows_in_w, cols=1) + +g_old = t(X) %*% Y +s = g_old + +Xw = matrix(0, rows=nrow(X), cols=1) +debug_str = "# Iter, Obj" +iter = 0 +continue = 1 +while(continue == 1 & iter < maxiterations) { + # minimizing primal obj along direction s + step_sz = 0 + Xd = X %*% s + wd = lambda * sum(w * s) + dd = lambda * sum(s * s) + continue1 = 1 + while(continue1 == 1){ + tmp_Xw = Xw + step_sz*Xd + out = 1 - Y * (tmp_Xw) + sv = ppred(out, 0, ">") + out = out * sv + g = wd + step_sz*dd - sum(out * Y * Xd) + h = dd + sum(Xd * sv * Xd) + step_sz = step_sz - g/h + if (g*g/h < 0.0000000001){ + continue1 = 0 + } + } + + #update weights + w = w + step_sz*s + Xw = Xw + step_sz*Xd + + out = 1 - Y * Xw + sv = ppred(out, 0, ">") + out = sv * out + obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w) + g_new = t(X) %*% (out * Y) - lambda * w + + print("OBJ = " + obj) + tmp = sum(s * g_old) + if(step_sz*tmp < epsilon*obj){ + continue = 0 + } + + #non-linear CG step + be = sum(g_new * g_new)/sum(g_old * g_old) + s = be * s + g_new + g_old = g_new + + iter = iter + 1 +} + +write(w, $6, format="text") http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R ---------------------------------------------------------------------- diff --git a/src/test/scripts/functions/codegen/Algorithm_LinregCG.R b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R new file mode 100644 index 0000000..5dcad95 --- /dev/null +++ b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R @@ -0,0 +1,57 @@ +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_LinregCG.R b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R
new file mode 100644
index 0000000..5dcad95
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_LinregCG.R
@@ -0,0 +1,57 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+args <- commandArgs(TRUE)
+options(digits=22)
+library("Matrix")
+
+X = readMM(paste(args[1], "X.mtx", sep=""))
+y = readMM(paste(args[1], "y.mtx", sep=""))
+
+intercept = as.integer(args[2]);
+eps = as.double(args[3]);
+maxiter = as.double(args[4]);
+
+if( intercept == 1 ){
+  ones = matrix(1, nrow(X), 1);
+  X = cbind(X, ones);
+}
+
+r = -(t(X) %*% y);
+p = -r;
+norm_r2 = sum(r * r);
+w = matrix(0, ncol(X), 1);
+
+i = 0;
+while(i < maxiter) {
+  q = ((t(X) %*% (X %*% p)) + eps * p);
+  alpha = norm_r2 / ((t(p) %*% q)[1:1]);
+  w = w + alpha * p;
+  old_norm_r2 = norm_r2;
+  r = r + alpha * q;
+  norm_r2 = sum(r * r);
+  beta = norm_r2 / old_norm_r2;
+  p = -r + beta * p;
+  i = i + 1;
+}
+
+writeMM(as(w,"CsparseMatrix"), paste(args[5], "w", sep=""))

http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_LinregCG.dml
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_LinregCG.dml b/src/test/scripts/functions/codegen/Algorithm_LinregCG.dml
new file mode 100644
index 0000000..92f15d7
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_LinregCG.dml
@@ -0,0 +1,56 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+
+X = read($1);
+y = read($2);
+intercept = $3;
+eps = $4;
+maxiter = $5;
+
+if( intercept == 1 ){
+  ones = matrix(1, nrow(X), 1);
+  X = append(X, ones);
+}
+
+r = -(t(X) %*% y);
+p = -r;
+norm_r2 = sum(r * r);
+w = matrix(0, rows = ncol(X), cols = 1);
+
+i = 0;
+while(i < maxiter) {
+  q = ((t(X) %*% (X %*% p)) + eps * p);
+  alpha = norm_r2 / as.scalar(t(p) %*% q);
+  w = w + alpha * p;
+  old_norm_r2 = norm_r2;
+  r = r + alpha * q;
+  norm_r2 = sum(r * r);
+  beta = norm_r2 / old_norm_r2;
+  p = -r + beta * p;
+  i = i + 1;
+}
+
+write(w, $6);
+
+
+
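Both LinregCG scripts run plain conjugate gradient on the regularized normal equations (t(X) %*% X + eps * I) %*% w = t(X) %*% y: r starts as the negative right-hand side (the residual at w = 0), q is the product of the search direction with the system matrix, and note that in these test scripts the eps argument enters the system matrix, i.e. it acts as the ridge parameter rather than as a convergence tolerance. A toy R check against a direct solve (illustrative, assuming small dense data):

    set.seed(2)
    X <- matrix(rnorm(40), 20, 2); y <- matrix(rnorm(20), 20, 1); eps <- 0.001
    w_direct <- solve(t(X) %*% X + eps * diag(2), t(X) %*% y)
    # running the CG loop above for ncol(X) iterations reproduces w_direct up
    # to floating-point error: CG on an SPD n-by-n system converges in at most
    # n steps in exact arithmetic.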
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/bbefe96b/src/test/scripts/functions/codegen/Algorithm_MLogreg.R
----------------------------------------------------------------------
diff --git a/src/test/scripts/functions/codegen/Algorithm_MLogreg.R b/src/test/scripts/functions/codegen/Algorithm_MLogreg.R
new file mode 100644
index 0000000..121aba7
--- /dev/null
+++ b/src/test/scripts/functions/codegen/Algorithm_MLogreg.R
@@ -0,0 +1,278 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+args <- commandArgs(TRUE)
+library("Matrix")
+library("matrixStats")
+
+X = readMM(paste(args[1], "X.mtx", sep=""));
+Y_vec = readMM(paste(args[1], "Y.mtx", sep=""));
+intercept = as.integer(args[2]);
+tol = as.double(args[3]);
+maxiter = as.integer(args[4]);
+
+intercept_status = intercept;
+regularization = 0.001;
+maxinneriter = 0;
+
+print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT");
+
+eta0 = 0.0001;
+eta1 = 0.25;
+eta2 = 0.75;
+sigma1 = 0.25;
+sigma2 = 0.5;
+sigma3 = 4.0;
+psi = 0.1;
+
+N = nrow (X);
+D = ncol (X);
+
+# Introduce the intercept, shift and rescale the columns of X if needed
+if (intercept_status == 1 | intercept_status == 2)  # add the intercept column
+{
+    X = cbind (X, matrix (1, N, 1));
+    D = ncol (X);
+}
+
+scale_lambda = matrix (1, D, 1);
+if (intercept_status == 1 | intercept_status == 2)
+{
+    scale_lambda [D, 1] = 0;
+}
+
+if (intercept_status == 2)  # scale-&-shift X columns to mean 0, variance 1
+{                           # Important assumption: X [, D] = matrix (1, rows = N, cols = 1)
+    avg_X_cols = t(colSums(X)) / N;
+    var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1);
+    is_unsafe = (var_X_cols <= 0.0);
+    scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+    scale_X [D, 1] = 1;
+    shift_X = - avg_X_cols * scale_X;
+    shift_X [D, 1] = 0;
+    rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+} else {
+    scale_X = matrix (1, D, 1);
+    shift_X = matrix (0, D, 1);
+    rowSums_X_sq = rowSums (X ^ 2);
+}
+
+# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+# The transform is then associatively applied to the other side of the expression,
+# and is rewritten via "scale_X" and "shift_X" as follows:
+#
+# ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A  --- is rewritten as:
+# ssX_A  = diag (scale_X) %*% A;
+# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A;
+#
+# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A  --- is rewritten as:
+# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ];
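A quick numeric illustration of why this works (toy sizes; assumes the last column of X is the appended ones column, as the script requires): the shift/scale transform is diag(scale_X) with shift_X added into its last row, so X %*% (transform) reproduces the standardized X without ever densifying X itself:

    N <- 5; D <- 3
    X <- cbind(matrix(rnorm(N * (D - 1)), N, D - 1), 1)  # last column = ones
    scale_X <- c(2, 3, 1); shift_X <- c(-1, 4, 0)        # scale_X[D]=1, shift_X[D]=0
    SS <- diag(scale_X); SS[D, ] <- SS[D, ] + shift_X    # the shift/scale transform
    X_std <- X %*% diag(scale_X) + matrix(1, N, 1) %*% t(shift_X)
    max(abs(X %*% SS - X_std))   # ~0: the ones column absorbs the shift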
+
+# Convert "Y_vec" into an indicator matrix:
+if (min (Y_vec) <= 0) {
+    # Non-positive category labels ("0", "-1", etc.) are all mapped to
+    # max (Y_vec) + 1, which becomes the new largest label
+    max_y = max (Y_vec);
+    Y_vec = Y_vec + (- Y_vec + max_y + 1) * (Y_vec <= 0.0);
+}
+Y = table (seq (1, N, 1), as.vector(Y_vec));
+Y = as.matrix(as.data.frame.matrix(Y))  # this is required due to different table semantics
+
+K = ncol (Y) - 1;  # The number of non-baseline categories
+
+lambda = (scale_lambda %*% matrix (1, 1, K)) * regularization;
+delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq));
+
+B = matrix (0, D, K);    ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B;
+                         ### LT = append (LT, matrix (0, rows = N, cols = 1));
+                         ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1);
+P = matrix (1, N, K+1);  ### exp_LT = exp (LT);
+P = P / (K + 1);         ### P = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1));
+obj = N * log (K + 1);   ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+
+Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+if (intercept_status == 2) {
+    Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+}
+Grad = Grad + lambda * B;
+norm_Grad = sqrt (sum (Grad ^ 2));
+norm_Grad_initial = norm_Grad;
+
+if (maxinneriter == 0) {
+    maxinneriter = D * K;
+}
+iter = 1;
+
+# boolean for convergence check
+converge = (norm_Grad < tol) | (iter > maxiter);
+
+print (paste("-- Initially: Objective = ", obj, ", Gradient Norm = ", norm_Grad, ", Trust Delta = ", delta));
+
+while (! converge)
+{
+    # SOLVE TRUST REGION SUB-PROBLEM
+    S = matrix (0, D, K);
+    R = - Grad;
+    V = R;
+    delta2 = delta ^ 2;
+    inneriter = 1;
+    norm_R2 = sum (R ^ 2);
+    innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+    is_trust_boundary_reached = 0;
+
+    while (! innerconverge)
+    {
+        if (intercept_status == 2) {
+            ssX_V = diag (scale_X) %*% V;
+            ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V;
+        } else {
+            ssX_V = V;
+        }
+        Q = P [, 1:K] * (X %*% ssX_V);
+        HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, 1, K)));
+        if (intercept_status == 2) {
+            HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ];
+        }
+        HV = HV + lambda * V;
+        alpha = norm_R2 / sum (V * HV);
+        Snew = S + alpha * V;
+        norm_Snew2 = sum (Snew ^ 2);
+        if (norm_Snew2 <= delta2)
+        {
+            S = Snew;
+            R = R - alpha * HV;
+            old_norm_R2 = norm_R2;
+            norm_R2 = sum (R ^ 2);
+            V = R + (norm_R2 / old_norm_R2) * V;
+            innerconverge = (sqrt (norm_R2) <= psi * norm_Grad);
+        } else {
+            is_trust_boundary_reached = 1;
+            sv = sum (S * V);
+            v2 = sum (V ^ 2);
+            s2 = sum (S ^ 2);
+            rad = sqrt (sv ^ 2 + v2 * (delta2 - s2));
+            if (sv >= 0) {
+                alpha = (delta2 - s2) / (sv + rad);
+            } else {
+                alpha = (rad - sv) / v2;
+            }
+            S = S + alpha * V;
+            R = R - alpha * HV;
+            innerconverge = TRUE;
+        }
+        inneriter = inneriter + 1;
+        innerconverge = innerconverge | (inneriter > maxinneriter);
+    }
+
+    # END TRUST REGION SUB-PROBLEM
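When a CG step would leave the trust region, the else-branch above solves ||S + alpha*V||^2 = delta^2, i.e. the quadratic v2*alpha^2 + 2*sv*alpha + (s2 - delta2) = 0, for its positive root. The two formulas for alpha are algebraically identical; the branch on the sign of sv merely picks the form that avoids subtracting nearly equal numbers. A small R check (illustrative values):

    sv <- 0.3; v2 <- 2.0; s2 <- 0.5; delta2 <- 1.0
    rad <- sqrt(sv^2 + v2 * (delta2 - s2))
    (delta2 - s2) / (sv + rad)   # form used when sv >= 0
    (rad - sv) / v2              # form used when sv < 0; same value, ~0.372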
+
+    # compute rho, update B, obtain delta
+    gs = sum (S * Grad);
+    qk = - 0.5 * (gs - sum (S * R));
+    B_new = B + S;
+    if (intercept_status == 2) {
+        ssX_B_new = diag (scale_X) %*% B_new;
+        ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new;
+    } else {
+        ssX_B_new = B_new;
+    }
+
+    LT = as.matrix(cbind ((X %*% ssX_B_new), matrix (0, N, 1)));
+    LT = LT - rowMaxs (LT) %*% matrix (1, 1, K+1);
+    exp_LT = exp (LT);
+    P_new = exp_LT / (rowSums (exp_LT) %*% matrix (1, 1, K+1));
+    obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2));
+
+    # Consider updating LT incrementally in the inner loop instead; also note that
+    # "obj" and "obj_new" are large and nearly equal, so their small difference
+    # below is prone to round-off:
+    actred = (obj - obj_new);
+
+    rho = actred / qk;
+    is_rho_accepted = (rho > eta0);
+    snorm = sqrt (sum (S ^ 2));
+
+    if (iter == 1) {
+        delta = min (delta, snorm);
+    }
+
+    alpha2 = obj_new - obj - gs;
+    if (alpha2 <= 0) {
+        alpha = sigma3;
+    } else {
+        alpha = max (sigma1, -0.5 * gs / alpha2);
+    }
+
+    if (rho < eta0) {
+        delta = min (max (alpha, sigma1) * snorm, sigma2 * delta);
+    } else {
+        if (rho < eta1) {
+            delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta));
+        } else {
+            if (rho < eta2) {
+                delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta));
+            } else {
+                delta = max (delta, min (alpha * snorm, sigma3 * delta));
+            }
+        }
+    }
+
+    if (is_trust_boundary_reached == 1)
+    {
+        print (paste("-- Outer Iteration ", iter, ": Had ", (inneriter - 1), " CG iterations, trust bound REACHED"));
+    } else {
+        print (paste("-- Outer Iteration ", iter, ": Had ", (inneriter - 1), " CG iterations"));
+    }
+    print (paste("   -- Obj.Reduction: Actual = ", actred, ", Predicted = ", qk,
+                 " (A/P: ", (round (10000.0 * rho) / 10000.0), "), Trust Delta = ", delta));
+
+    if (is_rho_accepted)
+    {
+        B = B_new;
+        P = P_new;
+        Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]);
+        if (intercept_status == 2) {
+            Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ];
+        }
+        Grad = Grad + lambda * B;
+        norm_Grad = sqrt (sum (Grad ^ 2));
+        obj = obj_new;
+        print (paste("   -- New Objective = ", obj, ", Beta Change Norm = ", snorm, ", Gradient Norm = ", norm_Grad));
+    }
+
+    iter = iter + 1;
+    converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) |
+        ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001)));
+    if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); }
+}
+
+if (intercept_status == 2) {
+    B_out = diag (scale_X) %*% B;
+    B_out [D, ] = B_out [D, ] + t(shift_X) %*% B;
+} else {
+    B_out = B;
+}
+
+writeMM(as(B_out,"CsparseMatrix"), paste(args[5], "w", sep=""));
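For readability, the trust-region radius schedule above (with the eta/sigma thresholds defined at the top of the script) can be condensed into a single helper; this is an illustrative restatement of the nested if-blocks, not part of the commit:

    update_delta <- function(rho, delta, alpha, snorm,
                             eta0 = 1e-4, eta1 = 0.25, eta2 = 0.75,
                             sigma1 = 0.25, sigma2 = 0.5, sigma3 = 4.0) {
      if      (rho < eta0) min(max(alpha, sigma1) * snorm, sigma2 * delta)
      else if (rho < eta1) max(sigma1 * delta, min(alpha * snorm, sigma2 * delta))
      else if (rho < eta2) max(sigma1 * delta, min(alpha * snorm, sigma3 * delta))
      else                 max(delta, min(alpha * snorm, sigma3 * delta))
    }

The same accept/shrink/grow pattern appears in trust-region Newton solvers generally: poor agreement between actual and predicted objective reduction (small rho) shrinks delta, while good agreement lets it grow by up to a factor of sigma3.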
