http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/src/test/scripts/applications/glm/GLM.dml ---------------------------------------------------------------------- diff --git a/src/test/scripts/applications/glm/GLM.dml b/src/test/scripts/applications/glm/GLM.dml index bed88a1..25832c5 100644 --- a/src/test/scripts/applications/glm/GLM.dml +++ b/src/test/scripts/applications/glm/GLM.dml @@ -1,1167 +1,1167 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# THIS SCRIPT SOLVES GLM REGRESSION USING NEWTON/FISHER SCORING WITH TRUST REGIONS -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- Location to read the matrix X of feature vectors -# Y String --- Location to read response matrix Y with either 1 or 2 columns: -# if dfam = 2, Y is 1-column Bernoulli or 2-column Binomial (#pos, #neg) -# B String --- Location to store estimated regression parameters (the betas) -# fmt String "text" The betas matrix output format, such as "text" or "csv" -# O String " " Location to write the printed statistics; by default is standard output -# Log String " " Location to write per-iteration variables for log/debugging purposes -# dfam Int 1 Distribution family code: 1 = Power, 2 = Binomial -# vpow Double 0.0 Power for Variance defined as (mean)^power (ignored if dfam != 1): -# 0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian -# link Int 0 Link function code: 0 = canonical (depends on distribution), -# 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit -# lpow Double 1.0 Power for Link function defined as (mean)^power (ignored if link != 1): -# -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity -# yneg Double 0.0 Response value for Bernoulli "No" label, usually 0.0 or -1.0 -# icpt Int 0 Intercept presence, X columns shifting and rescaling: -# 0 = no intercept, no shifting, no rescaling; -# 1 = add intercept, but neither shift nor rescale X; -# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# reg Double 0.0 Regularization parameter (lambda) for L2 regularization -# tol Double 0.000001 Tolerance (epsilon) -# disp Double 0.0 (Over-)dispersion value, or 0.0 to estimate it from data -# moi Int 200 Maximum number of outer (Newton / Fisher Scoring) iterations -# mii Int 0 Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum -# --------------------------------------------------------------------------------------------- -# OUTPUT: Matrix beta, whose size depends on icpt: -# icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2 -# -# In addition, some GLM statistics are provided in CSV format, one comma-separated name-value -# pair per each line, as follows: -# -# NAME MEANING -# ------------------------------------------------------------------------------------------- -# TERMINATION_CODE A positive integer indicating success/failure as follows: -# 1 = Converged successfully; 2 = Maximum number of iterations reached; -# 3 = Input (X, Y) out of range; 4 = Distribution/link is not supported -# BETA_MIN Smallest beta value (regression coefficient), excluding the intercept -# BETA_MIN_INDEX Column index for the smallest beta value -# BETA_MAX Largest beta value (regression coefficient), excluding the intercept -# BETA_MAX_INDEX Column index for the largest beta value -# INTERCEPT Intercept value, or NaN if there is no intercept (if icpt=0) -# DISPERSION Dispersion used to scale deviance, provided as "disp" input parameter -# or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0 -# DISPERSION_EST Dispersion estimated from the dataset -# DEVIANCE_UNSCALED Deviance from the saturated model, assuming dispersion == 1.0 -# DEVIANCE_SCALED Deviance from the saturated model, scaled by the DISPERSION value -# ------------------------------------------------------------------------------------------- -# -# The Log file, when requested, contains the following per-iteration variables in CSV format, -# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values: -# -# NAME MEANING -# ------------------------------------------------------------------------------------------- -# NUM_CG_ITERS Number of inner (Conj.Gradient) iterations in this outer iteration -# IS_TRUST_REACHED 1 = trust region boundary was reached, 0 = otherwise -# POINT_STEP_NORM L2-norm of iteration step from old point (i.e. "beta") to new point -# OBJECTIVE The loss function we minimize (i.e. negative partial log-likelihood) -# OBJ_DROP_REAL Reduction in the objective during this iteration, actual value -# OBJ_DROP_PRED Reduction in the objective predicted by a quadratic approximation -# OBJ_DROP_RATIO Actual-to-predicted reduction ratio, used to update the trust region -# GRADIENT_NORM L2-norm of the loss function gradient (NOTE: sometimes omitted) -# LINEAR_TERM_MIN The minimum value of X %*% beta, used to check for overflows -# LINEAR_TERM_MAX The maximum value of X %*% beta, used to check for overflows -# IS_POINT_UPDATED 1 = new point accepted; 0 = new point rejected, old point restored -# TRUST_DELTA Updated trust region size, the "delta" -# ------------------------------------------------------------------------------------------- -# -# Example with distribution = "Binomial.logit": -# hadoop jar SystemML.jar -f GLM_HOME/GLM.dml -nvargs dfam=2 link=2 yneg=-1.0 icpt=2 reg=0.001 -# tol=0.00000001 disp=1.0 moi=100 mii=10 X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas -# fmt=csv O=OUTPUT_DIR/stats Log=OUTPUT_DIR/log -# -# SOME OF THE SUPPORTED GLM DISTRIBUTION FAMILIES -# AND LINK FUNCTIONS: -# ----------------------------------------------- -# INPUT PARAMETERS: MEANING: Cano- -# dfam vpow link lpow Distribution.link nical? -# ----------------------------------------------- -# 1 0.0 1 -1.0 Gaussian.inverse -# 1 0.0 1 0.0 Gaussian.log -# 1 0.0 1 1.0 Gaussian.id Yes -# 1 1.0 1 0.0 Poisson.log Yes -# 1 1.0 1 0.5 Poisson.sqrt -# 1 1.0 1 1.0 Poisson.id -# 1 2.0 1 -1.0 Gamma.inverse Yes -# 1 2.0 1 0.0 Gamma.log -# 1 2.0 1 1.0 Gamma.id -# 1 3.0 1 -2.0 InvGaussian.1/mu^2 Yes -# 1 3.0 1 -1.0 InvGaussian.inverse -# 1 3.0 1 0.0 InvGaussian.log -# 1 3.0 1 1.0 InvGaussian.id -# 1 * 1 * AnyVariance.AnyLink -# ----------------------------------------------- -# 2 * 1 0.0 Binomial.log -# 2 * 1 0.5 Binomial.sqrt -# 2 * 2 * Binomial.logit Yes -# 2 * 3 * Binomial.probit -# 2 * 4 * Binomial.cloglog -# 2 * 5 * Binomial.cauchit -# ----------------------------------------------- - - -# Default values for input parameters - -fileX = $X; -fileY = $Y; -fileB = $B; -fileO = ifdef ($O, " "); -fileLog = ifdef ($Log, " "); -fmtB = ifdef ($fmt, "text"); - -distribution_type = ifdef ($dfam, 1); # $dfam = 1; -variance_as_power_of_the_mean = ifdef ($vpow, 0.0); # $vpow = 0.0; -link_type = ifdef ($link, 0); # $link = 0; -link_as_power_of_the_mean = ifdef ($lpow, 1.0); # $lpow = 1.0; -bernoulli_No_label = ifdef ($yneg, 0.0); # $yneg = 0.0; -intercept_status = ifdef ($icpt, 0); # $icpt = 0; -dispersion = ifdef ($disp, 0.0); # $disp = 0.0; -regularization = ifdef ($reg, 0.0); # $reg = 0.0; -eps = ifdef ($tol, 0.000001); # $tol = 0.000001; -max_iteration_IRLS = ifdef ($moi, 200); # $moi = 200; -max_iteration_CG = ifdef ($mii, 0); # $mii = 0; - -variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean); -link_as_power_of_the_mean = as.double (link_as_power_of_the_mean); -bernoulli_No_label = as.double (bernoulli_No_label); -dispersion = as.double (dispersion); -eps = as.double (eps); - - -# Default values for output statistics: - -termination_code = 0.0; -min_beta = 0.0 / 0.0; -i_min_beta = 0.0 / 0.0; -max_beta = 0.0 / 0.0; -i_max_beta = 0.0 / 0.0; -intercept_value = 0.0 / 0.0; -dispersion = 0.0 / 0.0; -estimated_dispersion = 0.0 / 0.0; -deviance_nodisp = 0.0 / 0.0; -deviance = 0.0 / 0.0; - -print("BEGIN GLM SCRIPT"); -print("Reading X..."); -X = read (fileX); -print("Reading Y..."); -Y = read (fileY); - -num_records = nrow (X); -num_features = ncol (X); -zeros_r = matrix (0, rows = num_records, cols = 1); -ones_r = 1 + zeros_r; - -# Introduce the intercept, shift and rescale the columns of X if needed - -if (intercept_status == 1 | intercept_status == 2) # add the intercept column -{ - X = append (X, ones_r); - num_features = ncol (X); -} - -scale_lambda = matrix (1, rows = num_features, cols = 1); -if (intercept_status == 1 | intercept_status == 2) -{ - scale_lambda [num_features, 1] = 0; -} - -if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 -{ # Important assumption: X [, num_features] = ones_r - avg_X_cols = t(colSums(X)) / num_records; - var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1); - is_unsafe = ppred (var_X_cols, 0.0, "<="); - scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); - scale_X [num_features, 1] = 1; - shift_X = - avg_X_cols * scale_X; - shift_X [num_features, 1] = 0; - rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2); -} else { - scale_X = matrix (1, rows = num_features, cols = 1); - shift_X = matrix (0, rows = num_features, cols = 1); - rowSums_X_sq = rowSums (X ^ 2); -} - -# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2) -# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale. -# The transform is then associatively applied to the other side of the expression, -# and is rewritten via "scale_X" and "shift_X" as follows: -# -# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# ssX_A = diag (scale_X) %*% A; -# ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A; -# -# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ]; - -# Initialize other input-dependent parameters - -lambda = scale_lambda * regularization; -if (max_iteration_CG == 0) { - max_iteration_CG = num_features; -} - -# In Bernoulli case, convert one-column "Y" into two-column - -if (distribution_type == 2 & ncol(Y) == 1) -{ - is_Y_negative = ppred (Y, bernoulli_No_label, "=="); - Y = append (1 - is_Y_negative, is_Y_negative); - count_Y_negative = sum (is_Y_negative); - if (count_Y_negative == 0) { - stop ("GLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label"); - } - if (count_Y_negative == nrow(Y)) { - stop ("GLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label"); - } -} - -# Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const] - -if (link_type == 0) -{ - if (distribution_type == 1) { - link_type = 1; - link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean; - } else { if (distribution_type == 2) { - link_type = 2; -} } } - -# For power distributions and/or links, we use two constants, -# "variance as power of the mean" and "link_as_power_of_the_mean", -# to specify the variance and the link as arbitrary powers of the -# mean. However, the variance-powers of 1.0 (Poisson family) and -# 2.0 (Gamma family) have to be treated as special cases, because -# these values integrate into logarithms. The link-power of 0.0 -# is also special as it represents the logarithm link. - -num_response_columns = ncol (Y); - -is_supported = check_if_supported (num_response_columns, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); -if (is_supported == 1) -{ - -##### INITIALIZE THE BETAS ##### - -[beta, saturated_log_l, isNaN] = - glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG); -if (isNaN == 0) -{ - -##### START OF THE MAIN PART ##### - -sum_X_sq = sum (rowSums_X_sq); -trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq)); -### max_trust_delta = trust_delta * 10000.0; -log_l = 0.0; -deviance_nodisp = 0.0; -new_deviance_nodisp = 0.0; -isNaN_log_l = 2; -newbeta = beta; -g = matrix (0.0, rows = num_features, cols = 1); -g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); -accept_new_beta = 1; -reached_trust_boundary = 0; -neg_log_l_change_predicted = 0.0; -i_IRLS = 0; - -print ("BEGIN IRLS ITERATIONS..."); - -ssX_newbeta = diag (scale_X) %*% newbeta; -ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; -all_linear_terms = X %*% ssX_newbeta; - -[new_log_l, isNaN_new_log_l] = glm_log_likelihood_part - (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); - -if (isNaN_new_log_l == 0) { - new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); - new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); -} - -if (fileLog != " ") { - log_str = "POINT_STEP_NORM," + i_IRLS + "," + sqrt (sum (beta ^ 2)); - log_str = append (log_str, "OBJECTIVE," + i_IRLS + "," + (- new_log_l)); - log_str = append (log_str, "LINEAR_TERM_MIN," + i_IRLS + "," + min (all_linear_terms)); - log_str = append (log_str, "LINEAR_TERM_MAX," + i_IRLS + "," + max (all_linear_terms)); -} else { - log_str = " "; -} - -while (termination_code == 0) -{ - accept_new_beta = 1; - - if (i_IRLS > 0) - { - if (isNaN_log_l == 0) { - accept_new_beta = 0; - } - -# Decide whether to accept a new iteration point and update the trust region -# See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright - - rho = (- new_log_l + log_l) / neg_log_l_change_predicted; - if (rho < 0.25 | isNaN_new_log_l == 1) { - trust_delta = 0.25 * trust_delta; - } - if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) { - trust_delta = 2 * trust_delta; - -### if (trust_delta > max_trust_delta) { -### trust_delta = max_trust_delta; -### } - - } - if (rho > 0.1 & isNaN_new_log_l == 0) { - accept_new_beta = 1; - } - } - - if (fileLog != " ") { - log_str = append (log_str, "IS_POINT_UPDATED," + i_IRLS + "," + accept_new_beta); - log_str = append (log_str, "TRUST_DELTA," + i_IRLS + "," + trust_delta); - } - if (accept_new_beta == 1) - { - beta = newbeta; log_l = new_log_l; deviance_nodisp = new_deviance_nodisp; isNaN_log_l = isNaN_new_log_l; - - [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); - - # We introduced these variables to avoid roundoff errors: - # g_Y = y_residual / (y_var * link_grad); - # w = 1.0 / (y_var * link_grad * link_grad); - - gXY = - t(X) %*% g_Y; - g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ]; - g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); - - if (fileLog != " ") { - log_str = append (log_str, "GRADIENT_NORM," + i_IRLS + "," + g_norm); - } - } - - [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = - get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG); - - newbeta = beta + z; - - ssX_newbeta = diag (scale_X) %*% newbeta; - ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; - all_linear_terms = X %*% ssX_newbeta; - - [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part - (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); - - if (isNaN_new_log_l == 0) { - new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); - new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); - } - - log_l_change = new_log_l - log_l; # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps - - if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & - (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) - { - termination_code = 1; - } - rho = - log_l_change / neg_log_l_change_predicted; - z_norm = sqrt (sum (z * z)); - - [z_norm_m, z_norm_e] = round_to_print (z_norm); - [trust_delta_m, trust_delta_e] = round_to_print (trust_delta); - [rho_m, rho_e] = round_to_print (rho); - [new_log_l_m, new_log_l_e] = round_to_print (new_log_l); - [log_l_change_m, log_l_change_e] = round_to_print (log_l_change); - [g_norm_m, g_norm_e] = round_to_print (g_norm); - - i_IRLS = i_IRLS + 1; - print ("Iter #" + i_IRLS + " completed" - + ", ||z|| = " + z_norm_m + "E" + z_norm_e - + ", trust_delta = " + trust_delta_m + "E" + trust_delta_e - + ", reached = " + reached_trust_boundary - + ", ||g|| = " + g_norm_m + "E" + g_norm_e - + ", new_log_l = " + new_log_l_m + "E" + new_log_l_e - + ", log_l_change = " + log_l_change_m + "E" + log_l_change_e - + ", rho = " + rho_m + "E" + rho_e); - - if (fileLog != " ") { - log_str = append (log_str, "NUM_CG_ITERS," + i_IRLS + "," + num_CG_iters); - log_str = append (log_str, "IS_TRUST_REACHED," + i_IRLS + "," + reached_trust_boundary); - log_str = append (log_str, "POINT_STEP_NORM," + i_IRLS + "," + z_norm); - log_str = append (log_str, "OBJECTIVE," + i_IRLS + "," + (- new_log_l)); - log_str = append (log_str, "OBJ_DROP_REAL," + i_IRLS + "," + log_l_change); - log_str = append (log_str, "OBJ_DROP_PRED," + i_IRLS + "," + (- neg_log_l_change_predicted)); - log_str = append (log_str, "OBJ_DROP_RATIO," + i_IRLS + "," + rho); - log_str = append (log_str, "LINEAR_TERM_MIN," + i_IRLS + "," + min (all_linear_terms)); - log_str = append (log_str, "LINEAR_TERM_MAX," + i_IRLS + "," + max (all_linear_terms)); - } - - if (i_IRLS == max_iteration_IRLS) { - termination_code = 2; - } -} - -beta = newbeta; -log_l = new_log_l; -deviance_nodisp = new_deviance_nodisp; - -if (termination_code == 1) { - print ("Converged in " + i_IRLS + " steps."); -} else { - print ("Did not converge."); -} - -ssX_beta = diag (scale_X) %*% beta; -ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta; -if (intercept_status == 2) { - beta_out = append (ssX_beta, beta); -} else { - beta_out = ssX_beta; -} - -write (beta_out, fileB, format=fmtB); - -if (intercept_status == 1 | intercept_status == 2) { - intercept_value = castAsScalar (beta_out [num_features, 1]); - beta_noicept = beta_out [1 : (num_features - 1), 1]; -} else { - beta_noicept = beta_out [1 : num_features, 1]; -} -min_beta = min (beta_noicept); -max_beta = max (beta_noicept); -tmp_i_min_beta = rowIndexMin (t(beta_noicept)) -i_min_beta = castAsScalar (tmp_i_min_beta [1, 1]); -tmp_i_max_beta = rowIndexMax (t(beta_noicept)) -i_max_beta = castAsScalar (tmp_i_max_beta [1, 1]); - -##### OVER-DISPERSION PART ##### - -all_linear_terms = X %*% ssX_beta; -[g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); - -pearson_residual_sq = g_Y ^ 2 / w; -pearson_residual_sq = replace (target = pearson_residual_sq, pattern = 0.0/0.0, replacement = 0); -# pearson_residual_sq = (y_residual ^ 2) / y_var; - -if (num_records > num_features) { - estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features); -} -if (dispersion <= 0.0) { - dispersion = estimated_dispersion; -} -deviance = deviance_nodisp / dispersion; - -if (fileLog != " ") { - write (log_str, fileLog); -} - -##### END OF THE MAIN PART ##### - -} else { print ("Input matrices are out of range. Terminating the DML."); termination_code = 3; } -} else { print ("Distribution/Link not supported. Terminating the DML."); termination_code = 4; } - -str = "TERMINATION_CODE," + termination_code; -str = append (str, "BETA_MIN," + min_beta); -str = append (str, "BETA_MIN_INDEX," + i_min_beta); -str = append (str, "BETA_MAX," + max_beta); -str = append (str, "BETA_MAX_INDEX," + i_max_beta); -str = append (str, "INTERCEPT," + intercept_value); -str = append (str, "DISPERSION," + dispersion); -str = append (str, "DISPERSION_EST," + estimated_dispersion); -str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp); -str = append (str, "DEVIANCE_SCALED," + deviance); - -if (fileO != " ") { - write (str, fileO); -} else { - print (str); -} - - - - -check_if_supported = - function (int ncol_y, int dist_type, double var_power, int link_type, double link_power) - return (int is_supported) -{ - is_supported = 0; - if (ncol_y == 1 & dist_type == 1 & link_type == 1) - { # POWER DISTRIBUTION - is_supported = 1; - if (var_power == 0.0 & link_power == -1.0) {print ("Gaussian.inverse"); } else { - if (var_power == 0.0 & link_power == 0.0) {print ("Gaussian.log"); } else { - if (var_power == 0.0 & link_power == 0.5) {print ("Gaussian.sqrt"); } else { - if (var_power == 0.0 & link_power == 1.0) {print ("Gaussian.id"); } else { - if (var_power == 0.0 ) {print ("Gaussian.power_nonlog"); } else { - if (var_power == 1.0 & link_power == -1.0) {print ("Poisson.inverse"); } else { - if (var_power == 1.0 & link_power == 0.0) {print ("Poisson.log"); } else { - if (var_power == 1.0 & link_power == 0.5) {print ("Poisson.sqrt"); } else { - if (var_power == 1.0 & link_power == 1.0) {print ("Poisson.id"); } else { - if (var_power == 1.0 ) {print ("Poisson.power_nonlog"); } else { - if (var_power == 2.0 & link_power == -1.0) {print ("Gamma.inverse"); } else { - if (var_power == 2.0 & link_power == 0.0) {print ("Gamma.log"); } else { - if (var_power == 2.0 & link_power == 0.5) {print ("Gamma.sqrt"); } else { - if (var_power == 2.0 & link_power == 1.0) {print ("Gamma.id"); } else { - if (var_power == 2.0 ) {print ("Gamma.power_nonlog"); } else { - if (var_power == 3.0 & link_power == -2.0) {print ("InvGaussian.1/mu^2"); } else { - if (var_power == 3.0 & link_power == -1.0) {print ("InvGaussian.inverse"); } else { - if (var_power == 3.0 & link_power == 0.0) {print ("InvGaussian.log"); } else { - if (var_power == 3.0 & link_power == 0.5) {print ("InvGaussian.sqrt"); } else { - if (var_power == 3.0 & link_power == 1.0) {print ("InvGaussian.id"); } else { - if (var_power == 3.0 ) {print ("InvGaussian.power_nonlog");}else{ - if ( link_power == 0.0) {print ("PowerDist.log"); } else { - print ("PowerDist.power_nonlog"); - } }}}}} }}}}} }}}}} }}}}} }} - if (ncol_y == 1 & dist_type == 2) - { - print ("Error: Bernoulli response matrix has not been converted into two-column format."); - } - if (ncol_y == 2 & dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - is_supported = 1; - if (link_type == 1 & link_power == -1.0) {print ("Binomial.inverse"); } else { - if (link_type == 1 & link_power == 0.0) {print ("Binomial.log"); } else { - if (link_type == 1 & link_power == 0.5) {print ("Binomial.sqrt"); } else { - if (link_type == 1 & link_power == 1.0) {print ("Binomial.id"); } else { - if (link_type == 1) {print ("Binomial.power_nonlog"); } else { - if (link_type == 2) {print ("Binomial.logit"); } else { - if (link_type == 3) {print ("Binomial.probit"); } else { - if (link_type == 4) {print ("Binomial.cloglog"); } else { - if (link_type == 5) {print ("Binomial.cauchit"); } - } }}}}} }}} - if (is_supported == 0) { - print ("Response matrix with " + ncol_y + " columns, distribution family (" + dist_type + ", " + var_power - + ") and link family (" + link_type + ", " + link_power + ") are NOT supported together."); - } -} - -glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG) -return (Matrix[double] beta, double saturated_log_l, int isNaN) -{ - saturated_log_l = 0.0; - isNaN = 0; - y_corr = Y [, 1]; - if (dist_type == 2) { - n_corr = rowSums (Y); - is_n_zero = ppred (n_corr, 0.0, "=="); - y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero; - } - linear_terms = y_corr; - if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION - if (link_power == 0.0) { - if (sum (ppred (y_corr, 0.0, "<")) == 0) { - is_zero_y_corr = ppred (y_corr, 0.0, "=="); - linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { isNaN = 1; } - } else { if (link_power == 1.0) { - linear_terms = y_corr; - } else { if (link_power == -1.0) { - linear_terms = 1.0 / y_corr; - } else { if (link_power == 0.5) { - if (sum (ppred (y_corr, 0.0, "<")) == 0) { - linear_terms = sqrt (y_corr); - } else { isNaN = 1; } - } else { if (link_power > 0.0) { - if (sum (ppred (y_corr, 0.0, "<")) == 0) { - is_zero_y_corr = ppred (y_corr, 0.0, "=="); - linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; - } else { isNaN = 1; } - } else { - if (sum (ppred (y_corr, 0.0, "<=")) == 0) { - linear_terms = y_corr ^ link_power; - } else { isNaN = 1; } - }}}}} - } - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - if (link_type == 1 & link_power == 0.0) { # Binomial.log - if (sum (ppred (y_corr, 0.0, "<")) == 0) { - is_zero_y_corr = ppred (y_corr, 0.0, "=="); - linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { isNaN = 1; } - } else { if (link_type == 1 & link_power > 0.0) { # Binomial.power_nonlog pos - if (sum (ppred (y_corr, 0.0, "<")) == 0) { - is_zero_y_corr = ppred (y_corr, 0.0, "=="); - linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; - } else { isNaN = 1; } - } else { if (link_type == 1) { # Binomial.power_nonlog neg - if (sum (ppred (y_corr, 0.0, "<=")) == 0) { - linear_terms = y_corr ^ link_power; - } else { isNaN = 1; } - } else { - is_zero_y_corr = ppred (y_corr, 0.0, "<="); - is_one_y_corr = ppred (y_corr, 1.0, ">="); - y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr); - if (link_type == 2) { # Binomial.logit - linear_terms = log (y_corr / (1.0 - y_corr)) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 3) { # Binomial.probit - y_below_half = y_corr + (1.0 - 2.0 * y_corr) * ppred (y_corr, 0.5, ">"); - t = sqrt (- 2.0 * log (y_below_half)); - approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308))); - linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * ppred (y_corr, 0.5, ">")) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 4) { # Binomial.cloglog - linear_terms = log (- log (1.0 - y_corr)) - - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - } else { if (link_type == 5) { # Binomial.cauchit - linear_terms = tan ((y_corr - 0.5) * 3.1415926535897932384626433832795) - + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); - }} }}}}} - } - - if (isNaN == 0) { - [saturated_log_l, isNaN] = - glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power); - } - - if ((dist_type == 1 & link_type == 1 & link_power == 0.0) | - (dist_type == 2 & link_type >= 2)) - { - desired_eta = 0.0; - } else { if (link_type == 1 & link_power == 0.0) { - desired_eta = log (0.5); - } else { if (link_type == 1) { - desired_eta = 0.5 ^ link_power; - } else { - desired_eta = 0.5; - }}} - - beta = matrix (0.0, rows = ncol(X), cols = 1); - - if (desired_eta != 0.0) { - if (icept_status == 1 | icept_status == 2) { - beta [nrow(beta), 1] = desired_eta; - } else { - # We want: avg (X %*% ssX_transform %*% beta) = desired_eta - # Note that "ssX_transform" is trivial here, hence ignored - - beta = straightenX (X, 0.000001, max_iter_CG); - beta = beta * desired_eta; -} } } - - -glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, - int dist_type, double var_power, int link_type, double link_power) - return (Matrix[double] g_Y, Matrix[double] w) - # ORIGINALLY we returned more meaningful vectors, namely: - # Matrix[double] y_residual : y - y_mean, i.e. y observed - y predicted - # Matrix[double] link_gradient : derivative of the link function - # Matrix[double] var_function : variance without dispersion, i.e. the V(mu) function - # BUT, this caused roundoff errors, so we had to compute "directly useful" vectors - # and skip over the "meaningful intermediaries". Now we output these two variables: - # g_Y = y_residual / (var_function * link_gradient); - # w = 1.0 / (var_function * link_gradient ^ 2); -{ - num_records = nrow (linear_terms); - zeros_r = matrix (0.0, rows = num_records, cols = 1); - ones_r = 1 + zeros_r; - g_Y = zeros_r; - w = zeros_r; - - # Some constants - - one_over_sqrt_two_pi = 0.39894228040143267793994605993438; - ones_2 = matrix (1.0, rows = 1, cols = 2); - p_one_m_one = ones_2; - p_one_m_one [1, 2] = -1.0; - m_one_p_one = ones_2; - m_one_p_one [1, 1] = -1.0; - zero_one = ones_2; - zero_one [1, 1] = 0.0; - one_zero = ones_2; - one_zero [1, 2] = 0.0; - flip_pos = matrix (0, rows = 2, cols = 2); - flip_neg = flip_pos; - flip_pos [1, 2] = 1; - flip_pos [2, 1] = 1; - flip_neg [1, 2] = -1; - flip_neg [2, 1] = 1; - - if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION - y_mean = zeros_r; - if (link_power == 0.0) { - y_mean = exp (linear_terms); - y_mean_pow = y_mean ^ (1 - var_power); - w = y_mean_pow * y_mean; - g_Y = y_mean_pow * (Y - y_mean); - } else { if (link_power == 1.0) { - y_mean = linear_terms; - w = y_mean ^ (- var_power); - g_Y = w * (Y - y_mean); - } else { - y_mean = linear_terms ^ (1.0 / link_power); - c1 = (1 - var_power) / link_power - 1; - c2 = (2 - var_power) / link_power - 2; - g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power; - w = (linear_terms ^ c2) / (link_power ^ 2); - } }} - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - if (link_type == 1) { # BINOMIAL.POWER LINKS - if (link_power == 0.0) { # Binomial.log - vec1 = 1 / (exp (- linear_terms) - 1); - g_Y = Y [, 1] - Y [, 2] * vec1; - w = rowSums (Y) * vec1; - } else { # Binomial.nonlog - vec1 = zeros_r; - if (link_power == 0.5) { - vec1 = 1 / (1 - linear_terms ^ 2); - } else { if (sum (ppred (linear_terms, 0.0, "<")) == 0) { - vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power)); - } else {isNaN = 1;}} - # We want a "zero-protected" version of - # vec2 = Y [, 1] / linear_terms; - is_y_0 = ppred (Y [, 1], 0.0, "=="); - vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0; - g_Y = (vec2 - Y [, 2] * vec1 * linear_terms) / link_power; - w = rowSums (Y) * vec1 / link_power ^ 2; - } - } else { - is_LT_pos_infinite = ppred (linear_terms, 1.0/0.0, "=="); - is_LT_neg_infinite = ppred (linear_terms, -1.0/0.0, "=="); - is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; - finite_linear_terms = replace (target = linear_terms, pattern = 1.0/0.0, replacement = 0); - finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0); - if (link_type == 2) { # Binomial.logit - Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; - Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; - g_Y = rowSums (Y * (Y_prob %*% flip_neg)); ### = y_residual; - w = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob); ### = y_variance; - } else { if (link_type == 3) { # Binomial.probit - is_lt_pos = ppred (linear_terms, 0.0, ">="); - t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) - pt_gp = t_gp * ( 0.254829592 - + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, - + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 - + t_gp * (-1.453152027 - + t_gp * 1.061405429)))); - the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0); - vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp); - vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5); - w = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1; - g_Y = one_over_sqrt_two_pi * vec2 / vec1; - } else { if (link_type == 4) { # Binomial.cloglog - the_exp = exp (linear_terms) - the_exp_exp = exp (- the_exp); - is_too_small = ppred (10000000 + the_exp, 10000000, "=="); - the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2); - g_Y = (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio; - w = the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio; - } else { if (link_type == 5) { # Binomial.cauchit - Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795; - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; - y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1]; - var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2]; - link_gradient_normalized = (1 + linear_terms ^ 2) * 3.1415926535897932384626433832795; - g_Y = rowSums (Y) * y_residual / (var_function * link_gradient_normalized); - w = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2); - }}}} - } - } -} - - -glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y, - int dist_type, double var_power, int link_type, double link_power) - return (double log_l, int isNaN) -{ - isNaN = 0; - log_l = 0.0; - num_records = nrow (Y); - zeros_r = matrix (0.0, rows = num_records, cols = 1); - - if (dist_type == 1 & link_type == 1) - { # POWER DISTRIBUTION - b_cumulant = zeros_r; - natural_parameters = zeros_r; - is_natural_parameter_log_zero = zeros_r; - if (var_power == 1.0 & link_power == 0.0) { # Poisson.log - b_cumulant = exp (linear_terms); - is_natural_parameter_log_zero = ppred (linear_terms, -1.0/0.0, "=="); - natural_parameters = replace (target = linear_terms, pattern = -1.0/0.0, replacement = 0); - } else { if (var_power == 1.0 & link_power == 1.0) { # Poisson.id - if (sum (ppred (linear_terms, 0.0, "<")) == 0) { - b_cumulant = linear_terms; - is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "=="); - natural_parameters = log (linear_terms + is_natural_parameter_log_zero); - } else {isNaN = 1;} - } else { if (var_power == 1.0 & link_power == 0.5) { # Poisson.sqrt - if (sum (ppred (linear_terms, 0.0, "<")) == 0) { - b_cumulant = linear_terms ^ 2; - is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "=="); - natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero); - } else {isNaN = 1;} - } else { if (var_power == 1.0 & link_power > 0.0) { # Poisson.power_nonlog, pos - if (sum (ppred (linear_terms, 0.0, "<")) == 0) { - is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "=="); - b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero; - natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power; - } else {isNaN = 1;} - } else { if (var_power == 1.0) { # Poisson.power_nonlog, neg - if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { - b_cumulant = linear_terms ^ (1.0 / link_power); - natural_parameters = log (linear_terms) / link_power; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse - if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { - b_cumulant = - log (linear_terms); - natural_parameters = - linear_terms; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == 1.0) { # Gamma.id - if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { - b_cumulant = log (linear_terms); - natural_parameters = - 1.0 / linear_terms; - } else {isNaN = 1;} - } else { if (var_power == 2.0 & link_power == 0.0) { # Gamma.log - b_cumulant = linear_terms; - natural_parameters = - exp (- linear_terms); - } else { if (var_power == 2.0) { # Gamma.power_nonlog - if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { - b_cumulant = log (linear_terms) / link_power; - natural_parameters = - linear_terms ^ (- 1.0 / link_power); - } else {isNaN = 1;} - } else { if (link_power == 0.0) { # PowerDist.log - natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power); - b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power); - } else { # PowerDist.power_nonlog - if (-2 * link_power == 1.0 - var_power) { - natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power); - } else { if (-1 * link_power == 1.0 - var_power) { - natural_parameters = 1.0 / linear_terms / (1.0 - var_power); - } else { if ( link_power == 1.0 - var_power) { - natural_parameters = linear_terms / (1.0 - var_power); - } else { if ( 2 * link_power == 1.0 - var_power) { - natural_parameters = linear_terms ^ 2 / (1.0 - var_power); - } else { - if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { - power = (1.0 - var_power) / link_power; - natural_parameters = (linear_terms ^ power) / (1.0 - var_power); - } else {isNaN = 1;} - }}}} - if (-2 * link_power == 2.0 - var_power) { - b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power); - } else { if (-1 * link_power == 2.0 - var_power) { - b_cumulant = 1.0 / linear_terms / (2.0 - var_power); - } else { if ( link_power == 2.0 - var_power) { - b_cumulant = linear_terms / (2.0 - var_power); - } else { if ( 2 * link_power == 2.0 - var_power) { - b_cumulant = linear_terms ^ 2 / (2.0 - var_power); - } else { - if (sum (ppred (linear_terms, 0.0, "<=")) == 0) { - power = (2.0 - var_power) / link_power; - b_cumulant = (linear_terms ^ power) / (2.0 - var_power); - } else {isNaN = 1;} - }}}} - }}}}} }}}}} - if (sum (is_natural_parameter_log_zero * abs (Y)) > 0.0) { - log_l = -1.0 / 0.0; - isNaN = 1; - } - if (isNaN == 0) - { - log_l = sum (Y * natural_parameters - b_cumulant); - if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { - log_l = -1.0 / 0.0; - isNaN = 1; - } } } - - if (dist_type == 2 & link_type >= 1 & link_type <= 5) - { # BINOMIAL/BERNOULLI DISTRIBUTION - - [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power); - - if (isNaN == 0) { - does_prob_contradict = ppred (Y_prob, 0.0, "<="); - if (sum (does_prob_contradict * abs (Y)) == 0.0) { - log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict)); - if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) { - isNaN = 1; - } - } else { - log_l = -1.0 / 0.0; - isNaN = 1; - } } } - - if (isNaN == 1) { - log_l = - 1.0 / 0.0; - } -} - - - -binomial_probability_two_column = - function (Matrix[double] linear_terms, int link_type, double link_power) - return (Matrix[double] Y_prob, int isNaN) -{ - isNaN = 0; - num_records = nrow (linear_terms); - - # Define some auxiliary matrices - - ones_2 = matrix (1.0, rows = 1, cols = 2); - p_one_m_one = ones_2; - p_one_m_one [1, 2] = -1.0; - m_one_p_one = ones_2; - m_one_p_one [1, 1] = -1.0; - zero_one = ones_2; - zero_one [1, 1] = 0.0; - one_zero = ones_2; - one_zero [1, 2] = 0.0; - - zeros_r = matrix (0.0, rows = num_records, cols = 1); - ones_r = 1.0 + zeros_r; - - # Begin the function body - - Y_prob = zeros_r %*% ones_2; - if (link_type == 1) { # Binomial.power - if (link_power == 0.0) { # Binomial.log - Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one; - } else { if (link_power == 0.5) { # Binomial.sqrt - Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one; - } else { # Binomial.power_nonlog - if (sum (ppred (linear_terms, 0.0, "<")) == 0) { - Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one; - } else {isNaN = 1;} - }} - } else { # Binomial.non_power - is_LT_pos_infinite = ppred (linear_terms, 1.0/0.0, "=="); - is_LT_neg_infinite = ppred (linear_terms, -1.0/0.0, "=="); - is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one; - finite_linear_terms = replace (target = linear_terms, pattern = 1.0/0.0, replacement = 0); - finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0); - if (link_type == 2) { # Binomial.logit - Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one; - Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2); - } else { if (link_type == 3) { # Binomial.probit - lt_pos_neg = ppred (finite_linear_terms, 0.0, ">=") %*% p_one_m_one + ones_r %*% zero_one; - t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888); # 0.231641888 = 0.3275911 / sqrt (2.0) - pt_gp = t_gp * ( 0.254829592 - + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun, - + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299 - + t_gp * (-1.453152027 - + t_gp * 1.061405429)))); - the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0); - Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg); - } else { if (link_type == 4) { # Binomial.cloglog - the_exp = exp (finite_linear_terms); - the_exp_exp = exp (- the_exp); - is_too_small = ppred (10000000 + the_exp, 10000000, "=="); - Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2); - Y_prob [, 2] = the_exp_exp; - } else { if (link_type == 5) { # Binomial.cauchit - Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795; - } else { - isNaN = 1; - }}}} - Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite; -} } - - -# THE CG-STEIHAUG PROCEDURE SCRIPT - -# Apply Conjugate Gradient - Steihaug algorithm in order to approximately minimize -# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z -# under constraint: ||z|| <= trust_delta. -# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright -# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform -# is given separately because sparse "X" may become dense after applying the transform. -# -get_CG_Steihaug_point = - function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, Matrix[double] w, - Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, double trust_delta, int max_iter_CG) - return (Matrix[double] z, double neg_log_l_change, int i_CG, int reached_trust_boundary) -{ - trust_delta_sq = trust_delta ^ 2; - size_CG = nrow (g); - z = matrix (0.0, rows = size_CG, cols = 1); - neg_log_l_change = 0.0; - reached_trust_boundary = 0; - g_reg = g + lambda * beta; - r_CG = g_reg; - p_CG = -r_CG; - rr_CG = sum(r_CG * r_CG); - eps_CG = rr_CG * min (0.25, sqrt (rr_CG)); - converged_CG = 0; - if (rr_CG < eps_CG) { - converged_CG = 1; - } - - max_iteration_CG = max_iter_CG; - if (max_iteration_CG <= 0) { - max_iteration_CG = size_CG; - } - i_CG = 0; - while (converged_CG == 0) - { - i_CG = i_CG + 1; - ssX_p_CG = diag (scale_X) %*% p_CG; - ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG; - temp_CG = t(X) %*% (w * (X %*% ssX_p_CG)); - q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ]; - pq_CG = sum (p_CG * q_CG); - if (pq_CG <= 0) { - pp_CG = sum (p_CG * p_CG); - if (pp_CG > 0) { - [z, neg_log_l_change] = - get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); - reached_trust_boundary = 1; - } else { - neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); - } - converged_CG = 1; - } - if (converged_CG == 0) { - alpha_CG = rr_CG / pq_CG; - new_z = z + alpha_CG * p_CG; - if (sum(new_z * new_z) >= trust_delta_sq) { - pp_CG = sum (p_CG * p_CG); - [z, neg_log_l_change] = - get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq); - reached_trust_boundary = 1; - converged_CG = 1; - } - if (converged_CG == 0) { - z = new_z; - old_rr_CG = rr_CG; - r_CG = r_CG + alpha_CG * q_CG; - rr_CG = sum(r_CG * r_CG); - if (i_CG == max_iteration_CG | rr_CG < eps_CG) { - neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg)); - reached_trust_boundary = 0; - converged_CG = 1; - } - if (converged_CG == 0) { - p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG; -} } } } } - - -# An auxiliary function used twice inside the CG-STEIHAUG loop: -get_trust_boundary_point = - function (Matrix[double] g, Matrix[double] z, Matrix[double] p, - Matrix[double] q, Matrix[double] r, double pp, double pq, - double trust_delta_sq) - return (Matrix[double] new_z, double f_change) -{ - zz = sum (z * z); pz = sum (p * z); - sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq)); - tau_1 = (- pz + sq_root_d) / pp; - tau_2 = (- pz - sq_root_d) / pp; - zq = sum (z * q); gp = sum (g * p); - f_extra = 0.5 * sum (z * (r + g)); - f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1; - f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2; - if (f_change_1 < f_change_2) { - new_z = z + (tau_1 * p); - f_change = f_change_1; - } - else { - new_z = z + (tau_2 * p); - f_change = f_change_2; - } -} - - -# Computes vector w such that ||X %*% w - 1|| -> MIN given avg(X %*% w) = 1 -# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale -# it to compute w = c * z_LS such that sum(X %*% w) = nrow(X). -straightenX = - function (Matrix[double] X, double eps, int max_iter_CG) - return (Matrix[double] w) -{ - w_X = t(colSums(X)); - lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X); - eps_LS = eps * nrow(X); - - # BEGIN LEAST SQUARES - - r_LS = - w_X; - z_LS = matrix (0.0, rows = ncol(X), cols = 1); - p_LS = - r_LS; - norm_r2_LS = sum (r_LS ^ 2); - i_LS = 0; - while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS) - { - q_LS = t(X) %*% X %*% p_LS; - q_LS = q_LS + lambda_LS * p_LS; - alpha_LS = norm_r2_LS / sum (p_LS * q_LS); - z_LS = z_LS + alpha_LS * p_LS; - old_norm_r2_LS = norm_r2_LS; - r_LS = r_LS + alpha_LS * q_LS; - norm_r2_LS = sum (r_LS ^ 2); - p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS; - i_LS = i_LS + 1; - } - - # END LEAST SQUARES - - w = (nrow(X) / sum (w_X * z_LS)) * z_LS; -} - - -round_to_print = function (double x_to_truncate) -return (double mantissa, int eee) -{ - mantissa = 1.0; - eee = 0; - positive_infinity = 1.0 / 0.0; - x = abs (x_to_truncate); - if (x != x / 2.0) { - log_ten = log (10.0); - d_eee = round (log (x) / log_ten - 0.5); - mantissa = round (x * exp (log_ten * (4.0 - d_eee))) / 10000; - if (mantissa == 10.0) { - mantissa = 1.0; - d_eee = d_eee + 1; - } - if (x_to_truncate < 0.0) { - mantissa = - mantissa; - } - eee = 0; - pow_two = 1; - res_eee = abs (d_eee); - while (res_eee != 0.0) { - new_res_eee = round (res_eee / 2.0 - 0.3); - if (new_res_eee * 2.0 < res_eee) { - eee = eee + pow_two; - } - res_eee = new_res_eee; - pow_two = 2 * pow_two; - } - if (d_eee < 0.0) { - eee = - eee; - } - } else { mantissa = x_to_truncate; } -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT SOLVES GLM REGRESSION USING NEWTON/FISHER SCORING WITH TRUST REGIONS +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read the matrix X of feature vectors +# Y String --- Location to read response matrix Y with either 1 or 2 columns: +# if dfam = 2, Y is 1-column Bernoulli or 2-column Binomial (#pos, #neg) +# B String --- Location to store estimated regression parameters (the betas) +# fmt String "text" The betas matrix output format, such as "text" or "csv" +# O String " " Location to write the printed statistics; by default is standard output +# Log String " " Location to write per-iteration variables for log/debugging purposes +# dfam Int 1 Distribution family code: 1 = Power, 2 = Binomial +# vpow Double 0.0 Power for Variance defined as (mean)^power (ignored if dfam != 1): +# 0.0 = Gaussian, 1.0 = Poisson, 2.0 = Gamma, 3.0 = Inverse Gaussian +# link Int 0 Link function code: 0 = canonical (depends on distribution), +# 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit +# lpow Double 1.0 Power for Link function defined as (mean)^power (ignored if link != 1): +# -2.0 = 1/mu^2, -1.0 = reciprocal, 0.0 = log, 0.5 = sqrt, 1.0 = identity +# yneg Double 0.0 Response value for Bernoulli "No" label, usually 0.0 or -1.0 +# icpt Int 0 Intercept presence, X columns shifting and rescaling: +# 0 = no intercept, no shifting, no rescaling; +# 1 = add intercept, but neither shift nor rescale X; +# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 +# reg Double 0.0 Regularization parameter (lambda) for L2 regularization +# tol Double 0.000001 Tolerance (epsilon) +# disp Double 0.0 (Over-)dispersion value, or 0.0 to estimate it from data +# moi Int 200 Maximum number of outer (Newton / Fisher Scoring) iterations +# mii Int 0 Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum +# --------------------------------------------------------------------------------------------- +# OUTPUT: Matrix beta, whose size depends on icpt: +# icpt=0: ncol(X) x 1; icpt=1: (ncol(X) + 1) x 1; icpt=2: (ncol(X) + 1) x 2 +# +# In addition, some GLM statistics are provided in CSV format, one comma-separated name-value +# pair per each line, as follows: +# +# NAME MEANING +# ------------------------------------------------------------------------------------------- +# TERMINATION_CODE A positive integer indicating success/failure as follows: +# 1 = Converged successfully; 2 = Maximum number of iterations reached; +# 3 = Input (X, Y) out of range; 4 = Distribution/link is not supported +# BETA_MIN Smallest beta value (regression coefficient), excluding the intercept +# BETA_MIN_INDEX Column index for the smallest beta value +# BETA_MAX Largest beta value (regression coefficient), excluding the intercept +# BETA_MAX_INDEX Column index for the largest beta value +# INTERCEPT Intercept value, or NaN if there is no intercept (if icpt=0) +# DISPERSION Dispersion used to scale deviance, provided as "disp" input parameter +# or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0 +# DISPERSION_EST Dispersion estimated from the dataset +# DEVIANCE_UNSCALED Deviance from the saturated model, assuming dispersion == 1.0 +# DEVIANCE_SCALED Deviance from the saturated model, scaled by the DISPERSION value +# ------------------------------------------------------------------------------------------- +# +# The Log file, when requested, contains the following per-iteration variables in CSV format, +# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values: +# +# NAME MEANING +# ------------------------------------------------------------------------------------------- +# NUM_CG_ITERS Number of inner (Conj.Gradient) iterations in this outer iteration +# IS_TRUST_REACHED 1 = trust region boundary was reached, 0 = otherwise +# POINT_STEP_NORM L2-norm of iteration step from old point (i.e. "beta") to new point +# OBJECTIVE The loss function we minimize (i.e. negative partial log-likelihood) +# OBJ_DROP_REAL Reduction in the objective during this iteration, actual value +# OBJ_DROP_PRED Reduction in the objective predicted by a quadratic approximation +# OBJ_DROP_RATIO Actual-to-predicted reduction ratio, used to update the trust region +# GRADIENT_NORM L2-norm of the loss function gradient (NOTE: sometimes omitted) +# LINEAR_TERM_MIN The minimum value of X %*% beta, used to check for overflows +# LINEAR_TERM_MAX The maximum value of X %*% beta, used to check for overflows +# IS_POINT_UPDATED 1 = new point accepted; 0 = new point rejected, old point restored +# TRUST_DELTA Updated trust region size, the "delta" +# ------------------------------------------------------------------------------------------- +# +# Example with distribution = "Binomial.logit": +# hadoop jar SystemML.jar -f GLM_HOME/GLM.dml -nvargs dfam=2 link=2 yneg=-1.0 icpt=2 reg=0.001 +# tol=0.00000001 disp=1.0 moi=100 mii=10 X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas +# fmt=csv O=OUTPUT_DIR/stats Log=OUTPUT_DIR/log +# +# SOME OF THE SUPPORTED GLM DISTRIBUTION FAMILIES +# AND LINK FUNCTIONS: +# ----------------------------------------------- +# INPUT PARAMETERS: MEANING: Cano- +# dfam vpow link lpow Distribution.link nical? +# ----------------------------------------------- +# 1 0.0 1 -1.0 Gaussian.inverse +# 1 0.0 1 0.0 Gaussian.log +# 1 0.0 1 1.0 Gaussian.id Yes +# 1 1.0 1 0.0 Poisson.log Yes +# 1 1.0 1 0.5 Poisson.sqrt +# 1 1.0 1 1.0 Poisson.id +# 1 2.0 1 -1.0 Gamma.inverse Yes +# 1 2.0 1 0.0 Gamma.log +# 1 2.0 1 1.0 Gamma.id +# 1 3.0 1 -2.0 InvGaussian.1/mu^2 Yes +# 1 3.0 1 -1.0 InvGaussian.inverse +# 1 3.0 1 0.0 InvGaussian.log +# 1 3.0 1 1.0 InvGaussian.id +# 1 * 1 * AnyVariance.AnyLink +# ----------------------------------------------- +# 2 * 1 0.0 Binomial.log +# 2 * 1 0.5 Binomial.sqrt +# 2 * 2 * Binomial.logit Yes +# 2 * 3 * Binomial.probit +# 2 * 4 * Binomial.cloglog +# 2 * 5 * Binomial.cauchit +# ----------------------------------------------- + + +# Default values for input parameters + +fileX = $X; +fileY = $Y; +fileB = $B; +fileO = ifdef ($O, " "); +fileLog = ifdef ($Log, " "); +fmtB = ifdef ($fmt, "text"); + +distribution_type = ifdef ($dfam, 1); # $dfam = 1; +variance_as_power_of_the_mean = ifdef ($vpow, 0.0); # $vpow = 0.0; +link_type = ifdef ($link, 0); # $link = 0; +link_as_power_of_the_mean = ifdef ($lpow, 1.0); # $lpow = 1.0; +bernoulli_No_label = ifdef ($yneg, 0.0); # $yneg = 0.0; +intercept_status = ifdef ($icpt, 0); # $icpt = 0; +dispersion = ifdef ($disp, 0.0); # $disp = 0.0; +regularization = ifdef ($reg, 0.0); # $reg = 0.0; +eps = ifdef ($tol, 0.000001); # $tol = 0.000001; +max_iteration_IRLS = ifdef ($moi, 200); # $moi = 200; +max_iteration_CG = ifdef ($mii, 0); # $mii = 0; + +variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean); +link_as_power_of_the_mean = as.double (link_as_power_of_the_mean); +bernoulli_No_label = as.double (bernoulli_No_label); +dispersion = as.double (dispersion); +eps = as.double (eps); + + +# Default values for output statistics: + +termination_code = 0.0; +min_beta = 0.0 / 0.0; +i_min_beta = 0.0 / 0.0; +max_beta = 0.0 / 0.0; +i_max_beta = 0.0 / 0.0; +intercept_value = 0.0 / 0.0; +dispersion = 0.0 / 0.0; +estimated_dispersion = 0.0 / 0.0; +deviance_nodisp = 0.0 / 0.0; +deviance = 0.0 / 0.0; + +print("BEGIN GLM SCRIPT"); +print("Reading X..."); +X = read (fileX); +print("Reading Y..."); +Y = read (fileY); + +num_records = nrow (X); +num_features = ncol (X); +zeros_r = matrix (0, rows = num_records, cols = 1); +ones_r = 1 + zeros_r; + +# Introduce the intercept, shift and rescale the columns of X if needed + +if (intercept_status == 1 | intercept_status == 2) # add the intercept column +{ + X = append (X, ones_r); + num_features = ncol (X); +} + +scale_lambda = matrix (1, rows = num_features, cols = 1); +if (intercept_status == 1 | intercept_status == 2) +{ + scale_lambda [num_features, 1] = 0; +} + +if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 +{ # Important assumption: X [, num_features] = ones_r + avg_X_cols = t(colSums(X)) / num_records; + var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1); + is_unsafe = ppred (var_X_cols, 0.0, "<="); + scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X [num_features, 1] = 1; + shift_X = - avg_X_cols * scale_X; + shift_X [num_features, 1] = 0; + rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2); +} else { + scale_X = matrix (1, rows = num_features, cols = 1); + shift_X = matrix (0, rows = num_features, cols = 1); + rowSums_X_sq = rowSums (X ^ 2); +} + +# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2) +# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale. +# The transform is then associatively applied to the other side of the expression, +# and is rewritten via "scale_X" and "shift_X" as follows: +# +# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# ssX_A = diag (scale_X) %*% A; +# ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A; +# +# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ]; + +# Initialize other input-dependent parameters + +lambda = scale_lambda * regularization; +if (max_iteration_CG == 0) { + max_iteration_CG = num_features; +} + +# In Bernoulli case, convert one-column "Y" into two-column + +if (distribution_type == 2 & ncol(Y) == 1) +{ + is_Y_negative = ppred (Y, bernoulli_No_label, "=="); + Y = append (1 - is_Y_negative, is_Y_negative); + count_Y_negative = sum (is_Y_negative); + if (count_Y_negative == 0) { + stop ("GLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label"); + } + if (count_Y_negative == nrow(Y)) { + stop ("GLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label"); + } +} + +# Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const] + +if (link_type == 0) +{ + if (distribution_type == 1) { + link_type = 1; + link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean; + } else { if (distribution_type == 2) { + link_type = 2; +} } } + +# For power distributions and/or links, we use two constants, +# "variance as power of the mean" and "link_as_power_of_the_mean", +# to specify the variance and the link as arbitrary powers of the +# mean. However, the variance-powers of 1.0 (Poisson family) and +# 2.0 (Gamma family) have to be treated as special cases, because +# these values integrate into logarithms. The link-power of 0.0 +# is also special as it represents the logarithm link. + +num_response_columns = ncol (Y); + +is_supported = check_if_supported (num_response_columns, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); +if (is_supported == 1) +{ + +##### INITIALIZE THE BETAS ##### + +[beta, saturated_log_l, isNaN] = + glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG); +if (isNaN == 0) +{ + +##### START OF THE MAIN PART ##### + +sum_X_sq = sum (rowSums_X_sq); +trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq)); +### max_trust_delta = trust_delta * 10000.0; +log_l = 0.0; +deviance_nodisp = 0.0; +new_deviance_nodisp = 0.0; +isNaN_log_l = 2; +newbeta = beta; +g = matrix (0.0, rows = num_features, cols = 1); +g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); +accept_new_beta = 1; +reached_trust_boundary = 0; +neg_log_l_change_predicted = 0.0; +i_IRLS = 0; + +print ("BEGIN IRLS ITERATIONS..."); + +ssX_newbeta = diag (scale_X) %*% newbeta; +ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; +all_linear_terms = X %*% ssX_newbeta; + +[new_log_l, isNaN_new_log_l] = glm_log_likelihood_part + (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + +if (isNaN_new_log_l == 0) { + new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); + new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); +} + +if (fileLog != " ") { + log_str = "POINT_STEP_NORM," + i_IRLS + "," + sqrt (sum (beta ^ 2)); + log_str = append (log_str, "OBJECTIVE," + i_IRLS + "," + (- new_log_l)); + log_str = append (log_str, "LINEAR_TERM_MIN," + i_IRLS + "," + min (all_linear_terms)); + log_str = append (log_str, "LINEAR_TERM_MAX," + i_IRLS + "," + max (all_linear_terms)); +} else { + log_str = " "; +} + +while (termination_code == 0) +{ + accept_new_beta = 1; + + if (i_IRLS > 0) + { + if (isNaN_log_l == 0) { + accept_new_beta = 0; + } + +# Decide whether to accept a new iteration point and update the trust region +# See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright + + rho = (- new_log_l + log_l) / neg_log_l_change_predicted; + if (rho < 0.25 | isNaN_new_log_l == 1) { + trust_delta = 0.25 * trust_delta; + } + if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) { + trust_delta = 2 * trust_delta; + +### if (trust_delta > max_trust_delta) { +### trust_delta = max_trust_delta; +### } + + } + if (rho > 0.1 & isNaN_new_log_l == 0) { + accept_new_beta = 1; + } + } + + if (fileLog != " ") { + log_str = append (log_str, "IS_POINT_UPDATED," + i_IRLS + "," + accept_new_beta); + log_str = append (log_str, "TRUST_DELTA," + i_IRLS + "," + trust_delta); + } + if (accept_new_beta == 1) + { + beta = newbeta; log_l = new_log_l; deviance_nodisp = new_deviance_nodisp; isNaN_log_l = isNaN_new_log_l; + + [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + + # We introduced these variables to avoid roundoff errors: + # g_Y = y_residual / (y_var * link_grad); + # w = 1.0 / (y_var * link_grad * link_grad); + + gXY = - t(X) %*% g_Y; + g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ]; + g_norm = sqrt (sum ((g + lambda * beta) ^ 2)); + + if (fileLog != " ") { + log_str = append (log_str, "GRADIENT_NORM," + i_IRLS + "," + g_norm); + } + } + + [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = + get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG); + + newbeta = beta + z; + + ssX_newbeta = diag (scale_X) %*% newbeta; + ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta; + all_linear_terms = X %*% ssX_newbeta; + + [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part + (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + + if (isNaN_new_log_l == 0) { + new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l); + new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2); + } + + log_l_change = new_log_l - log_l; # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps + + if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & + (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) + { + termination_code = 1; + } + rho = - log_l_change / neg_log_l_change_predicted; + z_norm = sqrt (sum (z * z)); + + [z_norm_m, z_norm_e] = round_to_print (z_norm); + [trust_delta_m, trust_delta_e] = round_to_print (trust_delta); + [rho_m, rho_e] = round_to_print (rho); + [new_log_l_m, new_log_l_e] = round_to_print (new_log_l); + [log_l_change_m, log_l_change_e] = round_to_print (log_l_change); + [g_norm_m, g_norm_e] = round_to_print (g_norm); + + i_IRLS = i_IRLS + 1; + print ("Iter #" + i_IRLS + " completed" + + ", ||z|| = " + z_norm_m + "E" + z_norm_e + + ", trust_delta = " + trust_delta_m + "E" + trust_delta_e + + ", reached = " + reached_trust_boundary + + ", ||g|| = " + g_norm_m + "E" + g_norm_e + + ", new_log_l = " + new_log_l_m + "E" + new_log_l_e + + ", log_l_change = " + log_l_change_m + "E" + log_l_change_e + + ", rho = " + rho_m + "E" + rho_e); + + if (fileLog != " ") { + log_str = append (log_str, "NUM_CG_ITERS," + i_IRLS + "," + num_CG_iters); + log_str = append (log_str, "IS_TRUST_REACHED," + i_IRLS + "," + reached_trust_boundary); + log_str = append (log_str, "POINT_STEP_NORM," + i_IRLS + "," + z_norm); + log_str = append (log_str, "OBJECTIVE," + i_IRLS + "," + (- new_log_l)); + log_str = append (log_str, "OBJ_DROP_REAL," + i_IRLS + "," + log_l_change); + log_str = append (log_str, "OBJ_DROP_PRED," + i_IRLS + "," + (- neg_log_l_change_predicted)); + log_str = append (log_str, "OBJ_DROP_RATIO," + i_IRLS + "," + rho); + log_str = append (log_str, "LINEAR_TERM_MIN," + i_IRLS + "," + min (all_linear_terms)); + log_str = append (log_str, "LINEAR_TERM_MAX," + i_IRLS + "," + max (all_linear_terms)); + } + + if (i_IRLS == max_iteration_IRLS) { + termination_code = 2; + } +} + +beta = newbeta; +log_l = new_log_l; +deviance_nodisp = new_deviance_nodisp; + +if (termination_code == 1) { + print ("Converged in " + i_IRLS + " steps."); +} else { + print ("Did not converge."); +} + +ssX_beta = diag (scale_X) %*% beta; +ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta; +if (intercept_status == 2) { + beta_out = append (ssX_beta, beta); +} else { + beta_out = ssX_beta; +} + +write (beta_out, fileB, format=fmtB); + +if (intercept_status == 1 | intercept_status == 2) { + intercept_value = castAsScalar (beta_out [num_features, 1]); + beta_noicept = beta_out [1 : (num_features - 1), 1]; +} else { + beta_noicept = beta_out [1 : num_features, 1]; +} +min_beta = min (beta_noicept); +max_beta = max (beta_noicept); +tmp_i_min_beta = rowIndexMin (t(beta_noicept)) +i_min_beta = castAsScalar (tmp_i_min_beta [1, 1]); +tmp_i_max_beta = rowIndexMax (t(beta_noicept)) +i_max_beta = castAsScalar (tmp_i_max_beta [1, 1]); + +##### OVER-DISPERSION PART ##### + +all_linear_terms = X %*% ssX_beta; +[g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean); + +pearson_residual_sq = g_Y ^ 2 / w; +pearson_residual_sq = replace (target = pearson_residual_sq, pattern = 0.0/0.0, replacement = 0); +# pearson_residual_sq = (y_residual ^ 2) / y_var; + +if (num_records > num_features) { + estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features); +} +if (dispersion <= 0.0) { + dispersion = estimated_dispersion; +} +deviance = deviance_nodisp / dispersion; + +if (fileLog != " ") { + write (log_str, fileLog); +} + +##### END OF THE MAIN PART ##### + +} else { print ("Input matrices are out of range. Terminating the DML."); termination_code = 3; } +} else { print ("Distribution/Link not supported. Terminating the DML."); termination_code = 4; } + +str = "TERMINATION_CODE," + termination_code; +str = append (str, "BETA_MIN," + min_beta); +str = append (str, "BETA_MIN_INDEX," + i_min_beta); +str = append (str, "BETA_MAX," + max_beta); +str = append (str, "BETA_MAX_INDEX," + i_max_beta); +str = append (str, "INTERCEPT," + intercept_value); +str = append (str, "DISPERSION," + dispersion); +str = append (str, "DISPERSION_EST," + estimated_dispersion); +str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp); +str = append (str, "DEVIANCE_SCALED," + deviance); + +if (fileO != " ") { + write (str, fileO); +} else { + print (str); +} + + + + +check_if_supported = + function (int ncol_y, int dist_type, double var_power, int link_type, double link_power) + return (int is_supported) +{ + is_supported = 0; + if (ncol_y == 1 & dist_type == 1 & link_type == 1) + { # POWER DISTRIBUTION + is_supported = 1; + if (var_power == 0.0 & link_power == -1.0) {print ("Gaussian.inverse"); } else { + if (var_power == 0.0 & link_power == 0.0) {print ("Gaussian.log"); } else { + if (var_power == 0.0 & link_power == 0.5) {print ("Gaussian.sqrt"); } else { + if (var_power == 0.0 & link_power == 1.0) {print ("Gaussian.id"); } else { + if (var_power == 0.0 ) {print ("Gaussian.power_nonlog"); } else { + if (var_power == 1.0 & link_power == -1.0) {print ("Poisson.inverse"); } else { + if (var_power == 1.0 & link_power == 0.0) {print ("Poisson.log"); } else { + if (var_power == 1.0 & link_power == 0.5) {print ("Poisson.sqrt"); } else { + if (var_power == 1.0 & link_power == 1.0) {print ("Poisson.id"); } else { + if (var_power == 1.0 ) {print ("Poisson.power_nonlog"); } else { + if (var_power == 2.0 & link_power == -1.0) {print ("Gamma.inverse"); } else { + if (var_power == 2.0 & link_power == 0.0) {print ("Gamma.log"); } else { + if (var_power == 2.0 & link_power == 0.5) {print ("Gamma.sqrt"); } else { + if (var_power == 2.0 & link_power == 1.0) {print ("Gamma.id"); } else { + if (var_power == 2.0 ) {print ("Gamma.power_nonlog"); } else { + if (var_power == 3.0 & link_power == -2.0) {print ("InvGaussian.1/mu^2"); } else { + if (var_power == 3.0 & link_power == -1.0) {print ("InvGaussian.inverse"); } else { + if (var_power == 3.0 & link_power == 0.0) {print ("InvGaussian.log"); } else { + if (var_power == 3.0 & link_power == 0.5) {print ("InvGaussian.sqrt"); } else { + if (var_power == 3.0 & link_power == 1.0) {print ("InvGaussian.id"); } else { + if (var_power == 3.0 ) {print ("InvGaussian.power_nonlog");}else{ + if ( link_power == 0.0) {print ("PowerDist.log"); } else { + print ("PowerDist.power_nonlog"); + } }}}}} }}}}} }}}}} }}}}} }} + if (ncol_y == 1 & dist_type == 2) + { + print ("Error: Bernoulli response matrix has not been converted into two-column format."); + } + if (ncol_y == 2 & dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + is_supported = 1; + if (link_type == 1 & link_power == -1.0) {print ("Binomial.inverse"); } else { + if (link_type == 1 & link_power == 0.0) {print ("Binomial.log"); } else { + if (link_type == 1 & link_power == 0.5) {print ("Binomial.sqrt"); } else { + if (link_type == 1 & link_power == 1.0) {print ("Binomial.id"); } else { + if (link_type == 1) {print ("Binomial.power_nonlog"); } else { + if (link_type == 2) {print ("Binomial.logit"); } else { + if (link_type == 3) {print ("Binomial.probit"); } else { + if (link_type == 4) {print ("Binomial.cloglog"); } else { + if (link_type == 5) {print ("Binomial.cauchit"); } + } }}}}} }}} + if (is_supported == 0) { + print ("Response matrix with " + ncol_y + " columns, distribution family (" + dist_type + ", " + var_power + + ") and link family (" + link_type + ", " + link_power + ") are NOT supported together."); + } +} + +glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG) +return (Matrix[double] beta, double saturated_log_l, int isNaN) +{ + saturated_log_l = 0.0; + isNaN = 0; + y_corr = Y [, 1]; + if (dist_type == 2) { + n_corr = rowSums (Y); + is_n_zero = ppred (n_corr, 0.0, "=="); + y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero; + } + linear_terms = y_corr; + if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION + if (link_power == 0.0) { + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + is_zero_y_corr = ppred (y_corr, 0.0, "=="); + linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { isNaN = 1; } + } else { if (link_power == 1.0) { + linear_terms = y_corr; + } else { if (link_power == -1.0) { + linear_terms = 1.0 / y_corr; + } else { if (link_power == 0.5) { + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + linear_terms = sqrt (y_corr); + } else { isNaN = 1; } + } else { if (link_power > 0.0) { + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + is_zero_y_corr = ppred (y_corr, 0.0, "=="); + linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; + } else { isNaN = 1; } + } else { + if (sum (ppred (y_corr, 0.0, "<=")) == 0) { + linear_terms = y_corr ^ link_power; + } else { isNaN = 1; } + }}}}} + } + if (dist_type == 2 & link_type >= 1 & link_type <= 5) + { # BINOMIAL/BERNOULLI DISTRIBUTION + if (link_type == 1 & link_power == 0.0) { # Binomial.log + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + is_zero_y_corr = ppred (y_corr, 0.0, "=="); + linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { isNaN = 1; } + } else { if (link_type == 1 & link_power > 0.0) { # Binomial.power_nonlog pos + if (sum (ppred (y_corr, 0.0, "<")) == 0) { + is_zero_y_corr = ppred (y_corr, 0.0, "=="); + linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr; + } else { isNaN = 1; } + } else { if (link_type == 1) { # Binomial.power_nonlog neg + if (sum (ppred (y_corr, 0.0, "<=")) == 0) { + linear_terms = y_corr ^ link_power; + } else { isNaN = 1; } + } else { + is_zero_y_corr = ppred (y_corr, 0.0, "<="); + is_one_y_corr = ppred (y_corr, 1.0, ">="); + y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr); + if (link_type == 2) { # Binomial.logit + linear_terms = log (y_corr / (1.0 - y_corr)) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 3) { # Binomial.probit + y_below_half = y_corr + (1.0 - 2.0 * y_corr) * ppred (y_corr, 0.5, ">"); + t = sqrt (- 2.0 * log (y_below_half)); + approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308))); + linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * ppred (y_corr, 0.5, ">")) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 4) { # Binomial.cloglog + linear_terms = log (- log (1.0 - y_corr)) + - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + } else { if (link_type == 5) { # Binomial.cauchit + linear_terms = tan ((y_corr - 0.5) * 3.1415926535897932384626433832795) + + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr); + }} }}}}} + } + + if (isNaN == 0) { + [saturated_log_l, isNaN] = + glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power); + } + + if ((dist_type == 1 & link_type == 1 & link_power == 0.0) | + (dist_type == 2 & link_type >= 2)) + { + desired_eta = 0.0; + } else { if (link_type == 1 & link_power == 0.0) { + desired_eta = log (0.5); + } else { if (link_type == 1) { + desired_eta = 0.5 ^ link_power; + } else { + desired_eta = 0.5; + }}} + + beta = matrix (0.0, rows = ncol(X), cols = 1); + + if (desired_eta != 0.0) { + if (icept_status == 1 | icept_status == 2) { + beta [nrow(beta), 1] = desired_eta; + } else { + # We want: avg (X %*% ssX_transform %*% beta) = desired_eta + # Note that "ssX_transform" is trivial here, hence ignored + + beta = straightenX (X, 0.000001, max_iter_CG); + beta = beta * desired_eta; +} } } + + +glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y, + int dist_type, double var_power, int link_type, double link_power) + return (Matrix[double] g_Y, Matrix[double] w) + # ORIGINALLY we returned more meaningful vectors, namely: + # Matrix[double] y_residual : y - y_mean, i.e. y observed - y predicted + # Matrix[double] link_gradient : derivative of the link function + # Matrix[double] var_function : variance without dispersion, i.e. the V(mu) function + # BUT, this caused roundoff errors, so we had to compute "directly useful" vectors + # and skip over the "meaningful intermediaries". Now we output these two variables: + # g_Y = y_residual / (var_function * link_gradient); + # w
<TRUNCATED>
