http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/StepGLM.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/StepGLM.dml b/scripts/algorithms/StepGLM.dml
index 443ae95..10737ff 100644
--- a/scripts/algorithms/StepGLM.dml
+++ b/scripts/algorithms/StepGLM.dml
@@ -1,1196 +1,1196 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT CHOOSES A GLM REGRESSION MODEL IN A STEPWISE ALGORITHM USING AIC
-# EACH GLM REGRESSION IS SOLVED USING NEWTON/FISHER SCORING WITH TRUST REGIONS
-#
-# INPUT PARAMETERS:
-# --------------------------------------------------------------------------------------------
-# NAME  TYPE     DEFAULT   MEANING
-# --------------------------------------------------------------------------------------------
-# X     String    ---      Location to read the matrix X of feature vectors
-# Y     String    ---      Location to read response matrix Y with 1 column
-# B     String    ---      Location to store estimated regression parameters (the betas)
-# S     String    ---      Location to write the selected features ordered as computed by the algorithm
-# O     String    " "      Location to write the printed statistics; by default is standard output
-# link  Int       2        Link function code: 1 = Log, 2 = Logit, 3 = Probit, 4 = Cloglog
-# yneg  Double    0.0      Response value for Bernoulli "No" label, usually 0.0 or -1.0
-# icpt  Int       0        Intercept presence, X columns shifting and rescaling:
-#                          0 = no intercept, no shifting, no rescaling;
-#                          1 = add intercept, but neither shift nor rescale X;
-#                          2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
-# tol   Double    0.000001 Tolerance (epsilon)
-# disp  Double    0.0      (Over-)dispersion value, or 0.0 to estimate it from data
-# moi   Int       200      Maximum number of outer (Newton / Fisher Scoring) iterations
-# mii   Int       0        Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum
-# thr   Double    0.01     Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr,
-#                          no further features are checked and the algorithm stops
-# fmt   String   "text"    The betas matrix output format, such as "text" or "csv"
-# --------------------------------------------------------------------------------------------
-# OUTPUT: Matrix beta, whose size depends on icpt:
-#     icpt=0: ncol(X) x 1;  icpt=1: (ncol(X) + 1) x 1;  icpt=2: (ncol(X) + 1) x 2
-#
-# In addition, the last run of GLM provides some statistics in CSV format, one comma-separated name-value
-# pair per line, as follows:
-#
-# NAME                  MEANING
-# -------------------------------------------------------------------------------------------
-# TERMINATION_CODE      A positive integer indicating success/failure as follows:
-#                       1 = Converged successfully; 2 = Maximum number of iterations reached; 
-#                       3 = Input (X, Y) out of range; 4 = Distribution/link is not supported
-# BETA_MIN              Smallest beta value (regression coefficient), excluding the intercept
-# BETA_MIN_INDEX        Column index for the smallest beta value
-# BETA_MAX              Largest beta value (regression coefficient), excluding the intercept
-# BETA_MAX_INDEX        Column index for the largest beta value
-# INTERCEPT             Intercept value, or NaN if there is no intercept (if icpt=0)
-# DISPERSION            Dispersion used to scale deviance, provided as the "disp" input parameter
-#                       or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0
-# DISPERSION_EST        Dispersion estimated from the dataset
-# DEVIANCE_UNSCALED     Deviance from the saturated model, assuming dispersion == 1.0
-# DEVIANCE_SCALED       Deviance from the saturated model, scaled by the DISPERSION value
-# -------------------------------------------------------------------------------------------
-#
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f StepGLM.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
-#                       S=OUTPUT_DIR_S/selected O=OUTPUT_DIR/stats link=2 yneg=-1.0 icpt=2 tol=0.00000001 
-#                       disp=1.0 moi=100 mii=10 thr=0.01 fmt=csv  
-#
-# THE StepGLM SCRIPT CURRENTLY SUPPORTS THE BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY!
-#      - LOG
-#      - LOGIT
-#      - PROBIT
-#      - CLOGLOG
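For reference, the stepwise search below scores every candidate model with the Akaike information criterion in its standard form,

    AIC = -2 log L + 2k,

where log L is the maximized Bernoulli log-likelihood and k is the number of fitted parameters; this corresponds to the computation AIC = -2 * log_l + 2 * num_features inside the glm() function further down.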
-       
-fileX = $X;
-fileY = $Y;
-fileB = $B;
-intercept_status = ifdef ($icpt, 0);   
-thr = ifdef ($thr, 0.01); 
-bernoulli_No_label = ifdef ($yneg, 0.0);    # $yneg = 0.0;
-distribution_type = 2;
-
-bernoulli_No_label = as.double (bernoulli_No_label);
-
-# Currently only the forward selection strategy is supported: start from one feature and iteratively add
-# features until AIC improves.
-dir = "forward";
-
-print("BEGIN STEPWISE GLM SCRIPT");
-print ("Reading X and Y...");
-X_orig = read (fileX);
-Y = read (fileY);
-
-if (distribution_type == 2 & ncol(Y) == 1) {
-       is_Y_negative = ppred (Y, bernoulli_No_label, "==");
-       Y = append (1 - is_Y_negative, is_Y_negative);
-       count_Y_negative = sum (is_Y_negative);
-       if (count_Y_negative == 0) {
-               stop ("StepGLM Input Error: all Y-values encode Bernoulli 
YES-label, none encode NO-label");
-       }
-       if (count_Y_negative == nrow(Y)) {
-               stop ("StepGLM Input Error: all Y-values encode Bernoulli 
NO-label, none encode YES-label");
-       }
-}
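The two-column encoding above turns a Bernoulli response into per-row (yes, no) indicators, so rowSums(Y) = 1 and Y[,1] marks the "Yes" labels. A minimal DML sketch with hypothetical values (yneg = 0.0, raw responses 1, 0, 1; the names Yv, is_neg, Yenc are illustrative only):

    Yv = matrix ("1 0 1", rows = 3, cols = 1);   # hypothetical raw responses
    is_neg = ppred (Yv, 0.0, "==");              # (0, 1, 0): 1 where the label is "No"
    Yenc = append (1 - is_neg, is_neg);          # rows (1,0), (0,1), (1,0)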
-
-num_records = nrow (X_orig);
-num_features = ncol (X_orig);
-
-# BEGIN STEPWISE GENERALIZED LINEAR MODELS 
-
-if (dir == "forward") {  
-       
-       continue = TRUE;
-       columns_fixed = matrix (0, rows = 1, cols = num_features);
-       columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
-  
-       # X_global stores the best model found at each step 
-       X_global = matrix (0, rows = num_records, cols = 1);
- 
-       if (intercept_status == 0) {
-               # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-               [AIC_best] = glm (X_global, Y, 0, num_features, columns_fixed_ordered, " ");
-       } else {
-               # Compute AIC of an empty model with only the intercept (all Ys are constant)
-               all_ones = matrix (1, rows = num_records, cols = 1);
-               [AIC_best] = glm (all_ones, Y, 0, num_features, columns_fixed_ordered, " ");
-       }
-       print ("Best AIC without any features: " + AIC_best);
-  
-       # First pass to examine single features
-       AICs = matrix (AIC_best, rows = 1, cols = num_features);
-       parfor (i in 1:num_features) {  
-               [AIC_1] = glm (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered, " ");
-               AICs[1,i] = AIC_1;
-       }
-  
-       # Determine the best AIC 
-       column_best = 0;        
-       for (k in 1:num_features) {
-               AIC_cur = as.scalar (AICs[1,k]);
-               if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
-                       column_best = k;
-                       AIC_best = as.scalar(AICs[1,k]);
-               }
-       }
-  
-       if (column_best == 0) {
-               print ("AIC of an empty model is " + AIC_best + " and no added feature achieves more than a " + (thr * 100) + "% decrease in AIC!");
-               if (intercept_status == 0) {
-                       # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
-                       [AIC_best] = glm (X_global, Y, 0, num_features, columns_fixed_ordered, fileB);
-               } else {
-                       # Compute AIC of an empty model with only the intercept (all Ys are constant)
-                       ###all_ones = matrix (1, rows = num_records, cols = 1);
-                       [AIC_best] = glm (all_ones, Y, 0, num_features, columns_fixed_ordered, fileB);
-               }
-       }
-  
-       print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
-       columns_fixed[1,column_best] = 1;
-       columns_fixed_ordered[1,1] = column_best;
-       X_global = X_orig[,column_best];
-  
-       while (continue) {
-               # Subsequent passes over the features
-               parfor (i in 1:num_features) { 
-                       if (as.scalar(columns_fixed[1,i]) == 0) {       
-        
-                               # Construct the feature matrix
-                               X = append (X_global, X_orig[,i]);
-        
-                               [AIC_2] = glm (X, Y, intercept_status, num_features, columns_fixed_ordered, " ");
-                               AICs[1,i] = AIC_2;
-                       }               
-               }
-    
-               # Determine the best AIC
-               for (k in 1:num_features) {
-                       AIC_cur = as.scalar (AICs[1,k]);
-                       if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
-                               column_best = k;
-                               AIC_best = as.scalar(AICs[1,k]);
-                       }
-               }
-    
-               # Append the best found feature (i.e., column) to X_global
-               if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
-                       print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
-                       columns_fixed[1,column_best] = 1;
-                       columns_fixed_ordered = append (columns_fixed_ordered, as.matrix(column_best));
-                       X_global = append (X_global, X_orig[,column_best]);
-                       if (ncol(columns_fixed_ordered) == num_features) { # all features examined
-                               continue = FALSE;
-                       }
-               } else {
-                       continue = FALSE;
-               }
-       }
-  
-       # Run GLM with the selected set of features
-       print ("Running GLM with selected features...");
-       [AIC] = glm (X_global, Y, intercept_status, num_features, columns_fixed_ordered, fileB);
-  
-} else {
-       stop ("Currently only the forward selection strategy is supported!");
-}
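In other words, each greedy step accepts the best candidate feature k only if

    AIC_k < AIC_best   and   AIC_best - AIC_k > abs (thr * AIC_best),

so the improvement must beat a relative threshold; once no remaining feature clears it, column_best keeps its old (already fixed) value, the loop sets continue = FALSE, and the final GLM is fit on the selected columns.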
-
-
-################### UDFS USED IN THIS SCRIPT ##################
-
-glm = function (Matrix[Double] X, Matrix[Double] Y, Int intercept_status, Double num_features_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
-               
-       # Distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only the Bernoulli distribution family is supported!
-       distribution_type = 2;                          # $dfam = 2;
-       variance_as_power_of_the_mean = 0.0;            # $vpow = 0.0;
-       # Link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit;
-       # currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported!
-       link_type = ifdef ($link, 2);                   # $link = 2;
-       link_as_power_of_the_mean = 0.0;                # $lpow = 0.0;
-
-       dispersion = ifdef ($disp, 0.0);            # $disp = 0.0;
-       eps = ifdef ($tol, 0.000001);               # $tol  = 0.000001;
-       max_iteration_IRLS = ifdef ($moi, 200);     # $moi  = 200;
-       max_iteration_CG = ifdef ($mii, 0);         # $mii  = 0;
-
-       variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean);
-       link_as_power_of_the_mean = as.double (link_as_power_of_the_mean);
-
-       dispersion = as.double (dispersion);
-       eps = as.double (eps);
-
-       # Default values for output statistics (dispersion was already initialized from $disp above):
-       regularization = 0.0;
-       termination_code     = 0.0;
-       min_beta             = 0.0 / 0.0;
-       i_min_beta           = 0.0 / 0.0;
-       max_beta             = 0.0 / 0.0;
-       i_max_beta           = 0.0 / 0.0;
-       intercept_value      = 0.0 / 0.0;
-       estimated_dispersion = 0.0 / 0.0;
-       deviance_nodisp      = 0.0 / 0.0;
-       deviance             = 0.0 / 0.0;
-                  
-       #####   INITIALIZE THE PARAMETERS   #####
-                  
-    num_records  = nrow (X);
-    num_features = ncol (X);
-    zeros_r = matrix (0, rows = num_records, cols = 1);
-    ones_r = 1 + zeros_r;
-                  
-    # Introduce the intercept, shift and rescale the columns of X if needed
-                  
-    if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
-               X = append (X, ones_r);
-        num_features = ncol (X);
-       }
-                  
-    scale_lambda = matrix (1, rows = num_features, cols = 1);
-    if (intercept_status == 1 | intercept_status == 2) {
-               scale_lambda [num_features, 1] = 0;
-    }
-                  
-    if (intercept_status == 2) {  # scale-&-shift X columns to mean 0, variance 1
-               # Important assumption: X [, num_features] = ones_r
-        avg_X_cols = t(colSums(X)) / num_records;
-        var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1);
-        is_unsafe = ppred (var_X_cols, 0.0, "<=");
-        scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
-        scale_X [num_features, 1] = 1;
-        shift_X = - avg_X_cols * scale_X;
-        shift_X [num_features, 1] = 0;
-        rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
-       } else {
-        scale_X = matrix (1, rows = num_features, cols = 1);
-        shift_X = matrix (0, rows = num_features, cols = 1);
-        rowSums_X_sq = rowSums (X ^ 2);
-    }  
-                  
-    # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
-    # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
-    # The transform is then associatively applied to the other side of the expression,
-    # and is rewritten via "scale_X" and "shift_X" as follows:
-    #
-    # ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
-    # ssX_A  = diag (scale_X) %*% A;
-    # ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
-    #
-    # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
-    # tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ];
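Why the rewrite is equivalent: the shift/scale transform is the dense matrix T = diag(scale_X) with t(shift_X) added to its last (intercept) row. A minimal dense check, with a hypothetical test vector A (illustration only, not part of the script):

    T = diag (scale_X);
    T [num_features, ] = T [num_features, ] + t(shift_X);
    A = matrix (1, rows = num_features, cols = 1);       # any test vector
    ssX_A = diag (scale_X) %*% A;                        # sparse-friendly rewrite
    ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
    print ("max abs difference: " + max (abs (T %*% A - ssX_A)));   # expect 0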
-                  
-    # Initialize other input-dependent parameters
-                  
-    lambda = scale_lambda * regularization;
-    if (max_iteration_CG == 0) {
-               max_iteration_CG = num_features;
-    }
-                  
-    # Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const]
-                  
-    if (link_type == 0) {
-               if (distribution_type == 1) {
-                       link_type = 1;
-                       link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean;
-               } else { 
-                       if (distribution_type == 2) {
-                               link_type = 2;
-                       }   
-               }   
-       }
-                                
-    # For power distributions and/or links, we use two constants,
-    # "variance as power of the mean" and "link_as_power_of_the_mean",
-    # to specify the variance and the link as arbitrary powers of the
-    # mean.  However, the variance-powers of 1.0 (Poisson family) and
-    # 2.0 (Gamma family) have to be treated as special cases, because
-    # these values integrate into logarithms.  The link-power of 0.0
-    # is also special as it represents the logarithm link.
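In symbols: the power family uses variance function V(mu) = mu ^ vpow and link eta = mu ^ lpow, with lpow = 0.0 denoting eta = log(mu); vpow = 1.0 (Poisson) and vpow = 2.0 (Gamma) are the special cases whose cumulant integrals turn into logarithms.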
-                  
-    num_response_columns = ncol (Y);
-    is_supported = 0;              
-       if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION
-               is_supported = 1;                         
-       }
-       if (num_response_columns == 1 & distribution_type == 2) {
-               print ("Error: Bernoulli response matrix has not been converted into two-column format.");
-    }
-
-       if (is_supported == 1) {
-                    
-               #####   INITIALIZE THE BETAS   #####
-                    
-        [beta, saturated_log_l, isNaN] = 
-                       glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG);
-                                         
-               # print(" --- saturated logLik " + saturated_log_l);
-                                         
-        if (isNaN == 0) {
-                      
-                       #####  START OF THE MAIN PART  #####
-                      
-            sum_X_sq = sum (rowSums_X_sq);
-            trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq));
-            ###  max_trust_delta = trust_delta * 10000.0;
-            log_l = 0.0;
-            deviance_nodisp = 0.0;
-            new_deviance_nodisp = 0.0;
-            isNaN_log_l = 2;
-            newbeta = beta;
-            g = matrix (0.0, rows = num_features, cols = 1);
-            g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
-            accept_new_beta = 1;
-            reached_trust_boundary = 0;
-            neg_log_l_change_predicted = 0.0;
-            i_IRLS = 0;
-                      
-            # print ("BEGIN IRLS ITERATIONS...");
-                      
-            ssX_newbeta = diag (scale_X) %*% newbeta;
-            ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
-            all_linear_terms = X %*% ssX_newbeta;
-                      
-            [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
-                               (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
-                                         
-            if (isNaN_new_log_l == 0) {
-                               new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
-                new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
-            }
-                      
-            while (termination_code == 0) {
-                               accept_new_beta = 1;
-                        
-                if (i_IRLS > 0) {
-                                       if (isNaN_log_l == 0) {
-                                               accept_new_beta = 0;
-                    }
-                          
-                    # Decide whether to accept a new iteration point and update the trust region
-                    # See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
-                          
-                    rho = (- new_log_l + log_l) / neg_log_l_change_predicted;
-                                       if (rho < 0.25 | isNaN_new_log_l == 1) {
-                                               trust_delta = 0.25 * trust_delta;
-                                       }
-                                       if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) {
-                                               trust_delta = 2 * trust_delta;
-                            
-                                               ### if (trust_delta > max_trust_delta) {
-                                               ###     trust_delta = max_trust_delta;
-                                               ### }
-                                       }
-                                       if (rho > 0.1 & isNaN_new_log_l == 0) {
-                                               accept_new_beta = 1;
-                                       }
-                               }
-                        
-                if (accept_new_beta == 1) {
-                                       beta = newbeta;  log_l = new_log_l;  deviance_nodisp = new_deviance_nodisp;  isNaN_log_l = isNaN_new_log_l;
-                          
-                                       [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
-                          
-                                       # We introduced these variables to avoid roundoff errors:
-                                       #     g_Y = y_residual / (y_var * link_grad);
-                                       #     w   = 1.0 / (y_var * link_grad * link_grad);
-                          
-                                       gXY = - t(X) %*% g_Y;
-                                       g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ];
-                                       g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
-                               }
-                        
-                [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = 
-                                       get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG);
-                        
-                               newbeta = beta + z;
-                        
-                               ssX_newbeta = diag (scale_X) %*% newbeta;
-                               ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
-                               all_linear_terms = X %*% ssX_newbeta;
-                        
-                               [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
-                                       (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
-                        
-                               if (isNaN_new_log_l == 0) {
-                                       new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
-                                       new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
-                               }
-                        
-                               log_l_change = new_log_l - log_l;               # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps
-                       
-                               if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & 
-                                  (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) {
-                                       termination_code = 1;
-                               }
-                               rho = - log_l_change / neg_log_l_change_predicted;
-                               z_norm = sqrt (sum (z * z));
-                                              
-                               i_IRLS = i_IRLS + 1;                        
-                        
-                               if (i_IRLS == max_iteration_IRLS) {
-                                       termination_code = 2;
-                               }
-                       }
-                      
-            beta = newbeta;
-            log_l = new_log_l;
-            deviance_nodisp = new_deviance_nodisp;
-                     
-            #---------------------------- last part
-
-                       if (termination_code != 1) {
-                               print ("One of the runs of GLM did not converge in " + i_IRLS + " steps!");
-                       }
-                      
-            ##### COMPUTE AIC ##### 
-            
-                       if (distribution_type == 2 & link_type >= 1 & link_type <= 4) {                 
-                               AIC = -2 * log_l;
-                               if (sum (X) != 0) {
-                                       AIC = AIC + 2 * num_features;   
-                               }
-                       } else {
-                               stop ("Currently only the Bernoulli distribution family and the following link functions are supported: log, logit, probit, and cloglog!");
-                       }
-                                                  
-            if (fileB != " ") {
-                               fileO = ifdef ($O, " ");
-                               fileS = $S;
-                               fmt  = ifdef ($fmt, "text");    
-                       
-                               # Output which features give the best AIC and are used for the final GLM
-                               write (Selected, fileS, format=fmt);
-               
-                               ssX_beta = diag (scale_X) %*% beta;
-                ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta;
-                               if (intercept_status == 2) {
-                                       beta_out = append (ssX_beta, beta);
-                } else {
-                                       beta_out = ssX_beta;
-                }
-                
-                if (intercept_status == 0 & num_features == 1) {
-                                       p = sum (ppred (X, 1, "=="));
-                                       if (p == num_records) {
-                                               beta_out = beta_out[1,];
-                                       }                                       
-                } 
-
-                                                               
-                if (intercept_status == 1 | intercept_status == 2) {
-                                       intercept_value = castAsScalar (beta_out [num_features, 1]);
-                    beta_noicept = beta_out [1 : (num_features - 1), 1];
-                } else {
-                                       beta_noicept = beta_out [1 : num_features, 1];
-                }
-                min_beta = min (beta_noicept);
-                max_beta = max (beta_noicept);
-                tmp_i_min_beta = rowIndexMin (t(beta_noicept));
-                i_min_beta = castAsScalar (tmp_i_min_beta [1, 1]);
-                tmp_i_max_beta = rowIndexMax (t(beta_noicept));
-                i_max_beta = castAsScalar (tmp_i_max_beta [1, 1]);
-                        
-                #####  OVER-DISPERSION PART  #####
-                      
-                all_linear_terms = X %*% ssX_beta;
-                               [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
-                        
-                pearson_residual_sq = g_Y ^ 2 / w;
-                pearson_residual_sq = replace (target = pearson_residual_sq, pattern = 0.0/0.0, replacement = 0);
-                # pearson_residual_sq = (y_residual ^ 2) / y_var;
-                        
-                if (num_records > num_features) {
-                                       estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features);
-                }
-                if (dispersion <= 0.0) {
-                                       dispersion = estimated_dispersion;
-                }
-                deviance = deviance_nodisp / dispersion;
-                        
-                #####  END OF THE MAIN PART  #####
-                        
-                str = "BETA_MIN," + min_beta;
-                str = append (str, "BETA_MIN_INDEX," + i_min_beta);
-                str = append (str, "BETA_MAX," + max_beta);
-                str = append (str, "BETA_MAX_INDEX," + i_max_beta);
-                str = append (str, "INTERCEPT," + intercept_value);
-                str = append (str, "DISPERSION," + dispersion);
-                str = append (str, "DISPERSION_EST," + estimated_dispersion);
-                str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp);
-                str = append (str, "DEVIANCE_SCALED," + deviance);
-                        
-                if (fileO != " ") {
-                                       write (str, fileO);
-                } else {
-                                       print (str);
-                }
-                       
-                               # Prepare the output matrix
-                               print ("Writing the output matrix...");
-                if (intercept_status == 0 & num_features == 1) { 
-                                       if (p == num_records) {
-                                               beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1); 
-                                               beta_out_tmp[num_features_orig + 1,] = beta_out;
-                                               beta_out = beta_out_tmp;
-                                               write (beta_out, fileB, format=fmt);
-                                               stop ("");
-                                       } else if (sum (X) == 0){
-                                               beta_out = matrix (0, rows = num_features_orig, cols = 1);
-                                               write (beta_out, fileB, format=fmt);
-                                               stop ("");
-                                       }
-                               }
-
-                               no_selected = ncol (Selected);
-                               max_selected = max (Selected);
-                               last = max_selected + 1;        
-               
-                               if (intercept_status != 0) {
-               
-                                       Selected_ext = append (Selected, as.matrix (last));                     
-                                       P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); 
-
-                                       if (intercept_status == 2) {
-                       
-                                               P1_ssX_beta = P1 * ssX_beta;
-                                               P2_ssX_beta = colSums (P1_ssX_beta);
-                                               P1_beta = P1 * beta;
-                                               P2_beta = colSums (P1_beta);
-                               
-                                               if (max_selected < num_features_orig) {
-                                               
-                                                       P2_ssX_beta = append (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-                                                       P2_beta = append (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-                                                       
-                                                       P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; 
-                                                       P2_ssX_beta[1, max_selected + 1] = 0;
-                                                       
-                                                       P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; 
-                                                       P2_beta[1, max_selected + 1] = 0;
-
-                                               }
-                                               beta_out = append (t(P2_ssX_beta), t(P2_beta));
-                               
-                                       } else {
-                       
-                                               P1_beta = P1 * beta;
-                                               P2_beta = colSums (P1_beta);
-                               
-                                               if (max_selected < num_features_orig) {
-                                                       P2_beta = append (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-                                                       P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; 
-                                                       P2_beta[1, max_selected + 1] = 0;
-                                               }
-                                               beta_out = t(P2_beta);
-                               
-                                       }
-                               } else {
-               
-                                       P1 = table (seq (1, no_selected), t(Selected)); 
-                                       P1_beta = P1 * beta;
-                                       P2_beta = colSums (P1_beta);    
-
-                                       if (max_selected < num_features_orig) {
-                                               P2_beta = append (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
-                                       }               
-
-                                       beta_out = t(P2_beta);  
-                               }
-       
-                               write ( beta_out, fileB, format=fmt );
-                       
-                       }
-                      
-               } else { 
-                       stop ("Input matrices X and/or Y are out of range!"); 
-        }
-       } else { 
-               stop ("Response matrix with " + num_response_columns + " 
columns, distribution family (" + distribution_type + ", " + 
variance_as_power_of_the_mean
-               + ") and link family (" + link_type + ", " + 
link_as_power_of_the_mean + ") are NOT supported together.");
-    }
-}
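A note on the stopping rule used inside glm(): the test 2.0 * abs(log_l_change) < eps * (deviance_nodisp + 0.1) is the same criterion R's glm() uses, |dev - dev_old| / (|dev| + 0.1) < eps, since the deviance here is dev = 2 * (saturated_log_l - log_l), so a change in deviance equals twice the change in log-likelihood (up to the regularization term).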
-
-glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG)
-  return (Matrix[double] beta, double saturated_log_l, int isNaN)
-{
-    saturated_log_l = 0.0;
-    isNaN = 0;
-    y_corr = Y [, 1];
-    if (dist_type == 2) {
-      n_corr = rowSums (Y);
-      is_n_zero = ppred (n_corr, 0.0, "==");
-      y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;
-    }
-    linear_terms = y_corr;
-    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
-      if          (link_power ==  0.0) {
-        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
-          is_zero_y_corr = ppred (y_corr, 0.0, "==");
-          linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-        } else { isNaN = 1; }
-      } else { if (link_power ==  1.0) {
-        linear_terms = y_corr;
-      } else { if (link_power == -1.0) {
-        linear_terms = 1.0 / y_corr;
-      } else { if (link_power ==  0.5) {
-        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
-          linear_terms = sqrt (y_corr);
-        } else { isNaN = 1; }
-      } else { if (link_power >   0.0) {
-        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
-          is_zero_y_corr = ppred (y_corr, 0.0, "==");
-          linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
-        } else { isNaN = 1; }
-      } else {
-        if (sum (ppred (y_corr, 0.0, "<=")) == 0) {
-          linear_terms = y_corr ^ link_power;
-        } else { isNaN = 1; }
-      }}}}}
-    }
-    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-    { # BINOMIAL/BERNOULLI DISTRIBUTION
-      if          (link_type == 1 & link_power == 0.0)  { # Binomial.log
-        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
-          is_zero_y_corr = ppred (y_corr, 0.0, "==");
-          linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-        } else { isNaN = 1; }
-      } else { if (link_type == 1 & link_power >  0.0)  { # Binomial.power_nonlog pos
-        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
-          is_zero_y_corr = ppred (y_corr, 0.0, "==");
-          linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
-        } else { isNaN = 1; }
-      } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
-        if (sum (ppred (y_corr, 0.0, "<=")) == 0) {
-          linear_terms = y_corr ^ link_power;
-        } else { isNaN = 1; }
-      } else { 
-        is_zero_y_corr = ppred (y_corr, 0.0, "<=");
-        is_one_y_corr  = ppred (y_corr, 1.0, ">=");
-        y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
-        if (link_type == 2)                           { # Binomial.logit
-          linear_terms = log (y_corr / (1.0 - y_corr)) 
-          + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-        } else { if (link_type == 3)                  { # Binomial.probit
-          y_below_half = y_corr + (1.0 - 2.0 * y_corr) * ppred (y_corr, 0.5, ">");
-          t = sqrt (- 2.0 * log (y_below_half));
-          approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
-          linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * ppred (y_corr, 0.5, ">"))
-          + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-        } else { if (link_type == 4)                  { # Binomial.cloglog
-          linear_terms = log (- log (1.0 - y_corr))
-          - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
-          + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-        } else { if (link_type == 5)                  { # Binomial.cauchit
-          linear_terms = tan ((y_corr - 0.5) * 3.1415926535897932384626433832795)
-          + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
-        }}  }}}}}
-    }
-    }
-    
-    if (isNaN == 0) {
-      [saturated_log_l, isNaN] = 
-        glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
-    }
-    
-    if ((dist_type == 1 & link_type == 1 & link_power == 0.0) |
-          (dist_type == 2 & link_type >= 2))
-    {    
-      desired_eta = 0.0;
-    } else { if (link_type == 1 & link_power == 0.0) {
-      desired_eta = log (0.5);
-    } else { if (link_type == 1) {
-      desired_eta = 0.5 ^ link_power;
-    } else {
-      desired_eta = 0.5;
-    }}}
-    
-    beta = matrix (0.0, rows = ncol(X), cols = 1);
-    
-    if (desired_eta != 0.0) {
-      if (icept_status == 1 | icept_status == 2) {
-        beta [nrow(beta), 1] = desired_eta;
-      } else {
-        # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
-        # Note that "ssX_transform" is trivial here, hence ignored
-        
-        beta = straightenX (X, 0.000001, max_iter_CG);  
-        beta = beta * desired_eta;
-      }   }   }
-
-
-glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
-                     int dist_type, double var_power, int link_type, double link_power)
-  return (Matrix[double] g_Y, Matrix[double] w)
-# ORIGINALLY we returned more meaningful vectors, namely:
-# Matrix[double] y_residual    : y - y_mean, i.e. y observed - y predicted
-# Matrix[double] link_gradient : derivative of the link function
-# Matrix[double] var_function  : variance without dispersion, i.e. the V(mu) function
-# BUT, this caused roundoff errors, so we had to compute "directly useful" vectors
-# and skip over the "meaningful intermediaries".  Now we output these two variables:
-#     g_Y = y_residual / (var_function * link_gradient);
-#     w   = 1.0 / (var_function * link_gradient ^ 2);
-{
-    num_records = nrow (linear_terms);
-    zeros_r = matrix (0.0, rows = num_records, cols = 1);
-    ones_r = 1 + zeros_r;
-    g_Y  = zeros_r;
-    w  = zeros_r;
-    
-    # Some constants
-    
-    one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
-    ones_2 = matrix (1.0, rows = 1, cols = 2);
-    p_one_m_one = ones_2;
-    p_one_m_one [1, 2] = -1.0;
-    m_one_p_one = ones_2;
-    m_one_p_one [1, 1] = -1.0;
-    zero_one = ones_2;
-    zero_one [1, 1] = 0.0;
-    one_zero = ones_2;
-    one_zero [1, 2] = 0.0;
-    flip_pos = matrix (0, rows = 2, cols = 2);
-    flip_neg = flip_pos;
-    flip_pos [1, 2] = 1;
-    flip_pos [2, 1] = 1;
-    flip_neg [1, 2] = -1;
-    flip_neg [2, 1] = 1;
-    
-    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
-      y_mean = zeros_r;
-      if          (link_power ==  0.0) {
-        y_mean = exp (linear_terms);
-        y_mean_pow = y_mean ^ (1 - var_power);
-        w   = y_mean_pow * y_mean;
-        g_Y = y_mean_pow * (Y - y_mean);
-      } else { if (link_power ==  1.0) {
-        y_mean = linear_terms;
-        w   = y_mean ^ (- var_power);
-        g_Y = w * (Y - y_mean);
-      } else {
-        y_mean = linear_terms ^ (1.0 / link_power);
-        c1  = (1 - var_power) / link_power - 1;
-        c2  = (2 - var_power) / link_power - 2;
-        g_Y = (linear_terms ^ c1) * (Y - y_mean) / link_power;
-        w   = (linear_terms ^ c2) / (link_power ^ 2);
-      }   }}
-    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-    { # BINOMIAL/BERNOULLI DISTRIBUTION
-      if (link_type == 1) { # BINOMIAL.POWER LINKS
-        if (link_power == 0.0)  { # Binomial.log
-          vec1 = 1 / (exp (- linear_terms) - 1);
-          g_Y = Y [, 1] - Y [, 2] * vec1;
-          w   = rowSums (Y) * vec1;
-        } else {                  # Binomial.nonlog
-          vec1 = zeros_r;
-          if (link_power == 0.5)  {
-            vec1 = 1 / (1 - linear_terms ^ 2);
-          } else { if (sum (ppred (linear_terms, 0.0, "<")) == 0) {
-            vec1 = linear_terms ^ (- 2 + 1 / link_power) / (1 - linear_terms ^ (1 / link_power));
-          } else {isNaN = 1;}}
-          # We want a "zero-protected" version of
-          #     vec2 = Y [, 1] / linear_terms;
-          is_y_0 = ppred (Y [, 1], 0.0, "==");
-          vec2 = (Y [, 1] + is_y_0) / (linear_terms * (1 - is_y_0) + is_y_0) - is_y_0;
-          g_Y =  (vec2 - Y [, 2] * vec1 * linear_terms) / link_power;
-          w   =  rowSums (Y) * vec1 / link_power ^ 2;
-        }
-      } else {
-        is_LT_pos_infinite = ppred (linear_terms,  1.0/0.0, "==");
-        is_LT_neg_infinite = ppred (linear_terms, -1.0/0.0, "==");
-        is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
-        finite_linear_terms = replace (target =        linear_terms, pattern =  1.0/0.0, replacement = 0);
-        finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0);
-        if (link_type == 2)                           { # Binomial.logit
-          Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
-          Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
-          Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-          g_Y = rowSums (Y * (Y_prob %*% flip_neg));           ### = y_residual;
-          w   = rowSums (Y * (Y_prob %*% flip_pos) * Y_prob);  ### = y_variance;
-        } else { if (link_type == 3)                  { # Binomial.probit
-          is_lt_pos = ppred (linear_terms, 0.0, ">=");
-          t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
-          pt_gp = t_gp * ( 0.254829592 
-                           + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
-                                     + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
-                                                + t_gp * (-1.453152027 
-                                                          + t_gp *   1.061405429))));
-          the_gauss_exp = exp (- (linear_terms ^ 2) / 2.0);
-          vec1 = 0.25 * pt_gp * (2 - the_gauss_exp * pt_gp);
-          vec2 = Y [, 1] - rowSums (Y) * is_lt_pos + the_gauss_exp * pt_gp * rowSums (Y) * (is_lt_pos - 0.5);
-          w   = the_gauss_exp * (one_over_sqrt_two_pi ^ 2) * rowSums (Y) / vec1;
-          g_Y = one_over_sqrt_two_pi * vec2 / vec1;
-        } else { if (link_type == 4)                  { # Binomial.cloglog
-          the_exp = exp (linear_terms);
-          the_exp_exp = exp (- the_exp);
-          is_too_small = ppred (10000000 + the_exp, 10000000, "==");
-          the_exp_ratio = (1 - is_too_small) * (1 - the_exp_exp) / (the_exp + is_too_small) + is_too_small * (1 - the_exp / 2);
-          g_Y =  (rowSums (Y) * the_exp_exp - Y [, 2]) / the_exp_ratio;
-          w   =  the_exp_exp * the_exp * rowSums (Y) / the_exp_ratio;
-        } else { if (link_type == 5)                  { # Binomial.cauchit
-          Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795;
-          Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-          y_residual = Y [, 1] * Y_prob [, 2] - Y [, 2] * Y_prob [, 1];
-          var_function = rowSums (Y) * Y_prob [, 1] * Y_prob [, 2];
-          link_gradient_normalized = (1 + linear_terms ^ 2) * 3.1415926535897932384626433832795;
-          g_Y =  rowSums (Y) * y_residual / (var_function * link_gradient_normalized);
-          w   = (rowSums (Y) ^ 2) / (var_function * link_gradient_normalized ^ 2);
-        }}}}   
-      }
-    }
-  }
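In the notation of the comment above glm_dist(): writing mu for the predicted mean, V(mu) for the variance function and g'(mu) for the link derivative, the two outputs are

    g_Y = (y - mu) / (V(mu) * g'(mu))        and        w = 1 / (V(mu) * g'(mu)^2),

so that -t(X) %*% g_Y is the gradient and t(X) %*% diag(w) %*% X the Fisher information used by the trust-region solver below.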
-
-
-glm_log_likelihood_part = function (Matrix[double] linear_terms, Matrix[double] Y,
-                                    int dist_type, double var_power, int link_type, double link_power)
-  return (double log_l, int isNaN)
-{
-    isNaN = 0;
-    log_l = 0.0;
-    num_records = nrow (Y);
-    zeros_r = matrix (0.0, rows = num_records, cols = 1);
-    
-    if (dist_type == 1 & link_type == 1)
-    { # POWER DISTRIBUTION
-      b_cumulant = zeros_r;
-      natural_parameters = zeros_r;
-      is_natural_parameter_log_zero = zeros_r;
-      if          (var_power == 1.0 & link_power == 0.0)  { # Poisson.log
-        b_cumulant = exp (linear_terms);
-        is_natural_parameter_log_zero = ppred (linear_terms, -1.0/0.0, "==");
-        natural_parameters = replace (target = linear_terms, pattern = -1.0/0.0, replacement = 0);
-      } else { if (var_power == 1.0 & link_power == 1.0)  { # Poisson.id
-        if (sum (ppred (linear_terms, 0.0, "<")) == 0)  {
-          b_cumulant = linear_terms;
-          is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "==");
-          natural_parameters = log (linear_terms + is_natural_parameter_log_zero);
-        } else {isNaN = 1;}
-      } else { if (var_power == 1.0 & link_power == 0.5)  { # Poisson.sqrt
-        if (sum (ppred (linear_terms, 0.0, "<")) == 0)  {
-          b_cumulant = linear_terms ^ 2;
-          is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "==");
-          natural_parameters = 2.0 * log (linear_terms + is_natural_parameter_log_zero);
-        } else {isNaN = 1;}
-      } else { if (var_power == 1.0 & link_power  > 0.0)  { # Poisson.power_nonlog, pos
-        if (sum (ppred (linear_terms, 0.0, "<")) == 0)  {
-          is_natural_parameter_log_zero = ppred (linear_terms, 0.0, "==");
-          b_cumulant = (linear_terms + is_natural_parameter_log_zero) ^ (1.0 / link_power) - is_natural_parameter_log_zero;
-          natural_parameters = log (linear_terms + is_natural_parameter_log_zero) / link_power;
-        } else {isNaN = 1;}
-      } else { if (var_power == 1.0)                      { # Poisson.power_nonlog, neg
-        if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
-          b_cumulant = linear_terms ^ (1.0 / link_power);
-          natural_parameters = log (linear_terms) / link_power;
-        } else {isNaN = 1;}
-      } else { if (var_power == 2.0 & link_power == -1.0) { # Gamma.inverse
-        if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
-          b_cumulant = - log (linear_terms);
-          natural_parameters = - linear_terms;
-        } else {isNaN = 1;}
-      } else { if (var_power == 2.0 & link_power ==  1.0) { # Gamma.id
-        if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
-          b_cumulant = log (linear_terms);
-          natural_parameters = - 1.0 / linear_terms;
-        } else {isNaN = 1;}
-      } else { if (var_power == 2.0 & link_power ==  0.0) { # Gamma.log
-        b_cumulant = linear_terms;
-        natural_parameters = - exp (- linear_terms);
-      } else { if (var_power == 2.0)                      { # Gamma.power_nonlog
-        if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
-          b_cumulant = log (linear_terms) / link_power;
-          natural_parameters = - linear_terms ^ (- 1.0 / link_power);
-        } else {isNaN = 1;}
-      } else { if                    (link_power ==  0.0) { # PowerDist.log
-        natural_parameters = exp (linear_terms * (1.0 - var_power)) / (1.0 - var_power);
-        b_cumulant = exp (linear_terms * (2.0 - var_power)) / (2.0 - var_power);
-      } else {                                              # PowerDist.power_nonlog
-        if          (-2 * link_power == 1.0 - var_power) {
-          natural_parameters = 1.0 / (linear_terms ^ 2) / (1.0 - var_power);
-        } else { if (-1 * link_power == 1.0 - var_power) {
-          natural_parameters = 1.0 / linear_terms / (1.0 - var_power);
-        } else { if (     link_power == 1.0 - var_power) {
-          natural_parameters = linear_terms / (1.0 - var_power);
-        } else { if ( 2 * link_power == 1.0 - var_power) {
-          natural_parameters = linear_terms ^ 2 / (1.0 - var_power);
-        } else {
-          if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
-            power = (1.0 - var_power) / link_power;
-            natural_parameters = (linear_terms ^ power) / (1.0 - var_power);
-          } else {isNaN = 1;}
-        }}}}
-        if          (-2 * link_power == 2.0 - var_power) {
-          b_cumulant = 1.0 / (linear_terms ^ 2) / (2.0 - var_power);
-        } else { if (-1 * link_power == 2.0 - var_power) {
-          b_cumulant = 1.0 / linear_terms / (2.0 - var_power);
-        } else { if (     link_power == 2.0 - var_power) {
-          b_cumulant = linear_terms / (2.0 - var_power);
-        } else { if ( 2 * link_power == 2.0 - var_power) {
-          b_cumulant = linear_terms ^ 2 / (2.0 - var_power);
-        } else {
-          if (sum (ppred (linear_terms, 0.0, "<=")) == 0) {
-            power = (2.0 - var_power) / link_power;
-            b_cumulant = (linear_terms ^ power) / (2.0 - var_power);
-          } else {isNaN = 1;}
-        }}}}
-      }}}}} }}}}}
-      if (sum (is_natural_parameter_log_zero * abs (Y)) > 0.0) {
-        log_l = -1.0 / 0.0;
-        isNaN = 1;
-      }
-      if (isNaN == 0)
-      {
-        log_l = sum (Y * natural_parameters - b_cumulant);
-        if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
-          log_l = -1.0 / 0.0;
-          isNaN = 1;
-        }   }   }
-    
-    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
-    { # BINOMIAL/BERNOULLI DISTRIBUTION
-      
-      [Y_prob, isNaN] = binomial_probability_two_column (linear_terms, link_type, link_power);
-      
-      if (isNaN == 0) {            
-        does_prob_contradict = ppred (Y_prob, 0.0, "<=");
-        if (sum (does_prob_contradict * abs (Y)) == 0.0) {
-          log_l = sum (Y * log (Y_prob * (1 - does_prob_contradict) + does_prob_contradict));
-          if (log_l != log_l | (log_l == log_l + 1.0 & log_l == log_l * 2.0)) {
-            isNaN = 1;
-          }
-        } else {
-          log_l = -1.0 / 0.0;
-          isNaN = 1;
-        }   }   }
-    
-    if (isNaN == 1) {
-      log_l = - 1.0 / 0.0; 
-    }
-  }
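The central line above, log_l = sum (Y * natural_parameters - b_cumulant), is the exponential-family form of the log-likelihood, log L = sum_i (y_i * theta_i - b(theta_i)), with natural parameter theta and cumulant function b; the guard log_l != log_l detects NaN, and log_l == log_l + 1.0 & log_l == log_l * 2.0 detects an infinite value.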
-
-
-
-binomial_probability_two_column =
-  function (Matrix[double] linear_terms, int link_type, double link_power)
-    return   (Matrix[double] Y_prob, int isNaN)
-{
-      isNaN = 0;
-      num_records = nrow (linear_terms);
-      
-      # Define some auxiliary matrices
-      
-      ones_2 = matrix (1.0, rows = 1, cols = 2);
-      p_one_m_one = ones_2;
-      p_one_m_one [1, 2] = -1.0;
-      m_one_p_one = ones_2;
-      m_one_p_one [1, 1] = -1.0;
-      zero_one = ones_2;
-      zero_one [1, 1] = 0.0;
-      one_zero = ones_2;
-      one_zero [1, 2] = 0.0;
-      
-      zeros_r = matrix (0.0, rows = num_records, cols = 1);
-      ones_r = 1.0 + zeros_r;
-      
-      # Begin the function body
-      
-      Y_prob = zeros_r %*% ones_2;
-      if (link_type == 1) { # Binomial.power
-        if          (link_power == 0.0) { # Binomial.log
-          Y_prob = exp (linear_terms) %*% p_one_m_one + ones_r %*% zero_one;
-        } else { if (link_power == 0.5) { # Binomial.sqrt
-          Y_prob = (linear_terms ^ 2) %*% p_one_m_one + ones_r %*% zero_one;
-        } else {                          # Binomial.power_nonlog
-          if (sum (ppred (linear_terms, 0.0, "<")) == 0) {
-            Y_prob = (linear_terms ^ (1.0 / link_power)) %*% p_one_m_one + ones_r %*% zero_one;
-          } else {isNaN = 1;}
-        }}
-      } else {              # Binomial.non_power
-        is_LT_pos_infinite = ppred (linear_terms,  1.0/0.0, "==");
-        is_LT_neg_infinite = ppred (linear_terms, -1.0/0.0, "==");
-        is_LT_infinite = is_LT_pos_infinite %*% one_zero + is_LT_neg_infinite %*% zero_one;
-        finite_linear_terms = replace (target =        linear_terms, pattern =  1.0/0.0, replacement = 0);
-        finite_linear_terms = replace (target = finite_linear_terms, pattern = -1.0/0.0, replacement = 0);
-        if (link_type == 2)             { # Binomial.logit
-          Y_prob = exp (finite_linear_terms) %*% one_zero + ones_r %*% zero_one;
-          Y_prob = Y_prob / (rowSums (Y_prob) %*% ones_2);
-        } else { if (link_type == 3)    { # Binomial.probit
-          lt_pos_neg = ppred (finite_linear_terms, 0.0, ">=") %*% p_one_m_one + ones_r %*% zero_one;
-          t_gp = 1.0 / (1.0 + abs (finite_linear_terms) * 0.231641888);  # 0.231641888 = 0.3275911 / sqrt (2.0)
-          pt_gp = t_gp * ( 0.254829592 
-                           + t_gp * (-0.284496736 # "Handbook of Mathematical Functions", ed. by M. Abramowitz and I.A. Stegun,
-                                     + t_gp * ( 1.421413741 # U.S. Nat-l Bureau of Standards, 10th print (Dec 1972), Sec. 7.1.26, p. 299
-                                                + t_gp * (-1.453152027 
-                                                          + t_gp *   1.061405429))));
-          the_gauss_exp = exp (- (finite_linear_terms ^ 2) / 2.0);
-          Y_prob = lt_pos_neg + ((the_gauss_exp * pt_gp) %*% ones_2) * (0.5 - lt_pos_neg);
-        } else { if (link_type == 4)    { # Binomial.cloglog
-          the_exp = exp (finite_linear_terms);
-          the_exp_exp = exp (- the_exp);
-          is_too_small = ppred (10000000 + the_exp, 10000000, "==");
-          Y_prob [, 1] = (1 - is_too_small) * (1 - the_exp_exp) + is_too_small * the_exp * (1 - the_exp / 2);
-          Y_prob [, 2] = the_exp_exp;
-        } else { if (link_type == 5)    { # Binomial.cauchit
-          Y_prob = 0.5 + (atan (finite_linear_terms) %*% p_one_m_one) / 3.1415926535897932384626433832795;
-        } else {
-          isNaN = 1;
-        }}}}
-        Y_prob = Y_prob * ((1.0 - rowSums (is_LT_infinite)) %*% ones_2) + is_LT_infinite;
-      }   }            
-
-
-# THE CG-STEIHAUG PROCEDURE SCRIPT
-
-# Apply the Conjugate Gradient - Steihaug algorithm in order to approximately minimize
-# 0.5 z^T (X^T diag(w) X + diag (lambda)) z + (g + lambda * beta)^T z
-# under the constraint:  ||z|| <= trust_delta.
-# See Alg. 7.2 on p. 171 of "Numerical Optimization" 2nd ed. by Nocedal and Wright.
-# IN THE ABOVE, "X" IS UNDERSTOOD TO BE "X %*% (SHIFT/SCALE TRANSFORM)"; this transform
-# is given separately because sparse "X" may become dense after applying the transform.
-#
-get_CG_Steihaug_point =
-  function (Matrix[double] X, Matrix[double] scale_X, Matrix[double] shift_X, 
Matrix[double] w,
-            Matrix[double] g, Matrix[double] beta, Matrix[double] lambda, 
double trust_delta, int max_iter_CG)
-    return (Matrix[double] z, double neg_log_l_change, int i_CG, int 
reached_trust_boundary)
-{
-      trust_delta_sq = trust_delta ^ 2;
-      size_CG = nrow (g);
-      z = matrix (0.0, rows = size_CG, cols = 1);
-      neg_log_l_change = 0.0;
-      reached_trust_boundary = 0;
-      g_reg = g + lambda * beta;
-      r_CG = g_reg;
-      p_CG = -r_CG;
-      rr_CG = sum(r_CG * r_CG);
-      eps_CG = rr_CG * min (0.25, sqrt (rr_CG));
-      converged_CG = 0;
-      if (rr_CG < eps_CG) {
-        converged_CG = 1;
-      }
-      
-      max_iteration_CG = max_iter_CG;
-      if (max_iteration_CG <= 0) {
-        max_iteration_CG = size_CG;
-      }
-      i_CG = 0;
-      while (converged_CG == 0)
-      {
-        i_CG = i_CG + 1;
-        ssX_p_CG = diag (scale_X) %*% p_CG;
-        ssX_p_CG [size_CG, ] = ssX_p_CG [size_CG, ] + t(shift_X) %*% p_CG;
-        temp_CG = t(X) %*% (w * (X %*% ssX_p_CG));
-        q_CG = (lambda * p_CG) + diag (scale_X) %*% temp_CG + shift_X %*% temp_CG [size_CG, ];
-        pq_CG = sum (p_CG * q_CG);
-        if (pq_CG <= 0) {
-          pp_CG = sum (p_CG * p_CG);  
-          if (pp_CG > 0) {
-            [z, neg_log_l_change] = 
-              get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
-            reached_trust_boundary = 1;
-          } else {
-            neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
-          }
-          converged_CG = 1;
-        }
-        if (converged_CG == 0) {
-          alpha_CG = rr_CG / pq_CG;
-          new_z = z + alpha_CG * p_CG;
-          if (sum(new_z * new_z) >= trust_delta_sq) {
-            pp_CG = sum (p_CG * p_CG);  
-            [z, neg_log_l_change] = 
-              get_trust_boundary_point (g_reg, z, p_CG, q_CG, r_CG, pp_CG, pq_CG, trust_delta_sq);
-            reached_trust_boundary = 1;
-            converged_CG = 1;
-          }
-          if (converged_CG == 0) {
-            z = new_z;
-            old_rr_CG = rr_CG;
-            r_CG = r_CG + alpha_CG * q_CG;
-            rr_CG = sum(r_CG * r_CG);
-            if (i_CG == max_iteration_CG | rr_CG < eps_CG) {
-              neg_log_l_change = 0.5 * sum (z * (r_CG + g_reg));
-              reached_trust_boundary = 0;
-              converged_CG = 1;
-            }
-            if (converged_CG == 0) {
-              p_CG = -r_CG + (rr_CG / old_rr_CG) * p_CG;
-            }   }   }   }   }
-
-
-# An auxiliary function used twice inside the CG-STEIHAUG loop:
-get_trust_boundary_point = 
-  function (Matrix[double] g, Matrix[double] z, Matrix[double] p, 
-            Matrix[double] q, Matrix[double] r, double pp, double pq, 
-            double trust_delta_sq)
-    return (Matrix[double] new_z, double f_change)
-{
-      zz = sum (z * z);  pz = sum (p * z);
-      sq_root_d = sqrt (pz * pz - pp * (zz - trust_delta_sq));
-      tau_1 = (- pz + sq_root_d) / pp;
-      tau_2 = (- pz - sq_root_d) / pp;
-      zq = sum (z * q);  gp = sum (g * p);
-      f_extra = 0.5 * sum (z * (r + g));
-      f_change_1 = f_extra + (0.5 * tau_1 * pq + zq + gp) * tau_1;
-      f_change_2 = f_extra + (0.5 * tau_2 * pq + zq + gp) * tau_2;
-      ind1 = as.integer(f_change_1 < f_change_2);
-      ind2 = as.integer(f_change_1 >= f_change_2);
-      new_z = z + ((ind1 * tau_1 + ind2 * tau_2) * p);
-      f_change = ind1 * f_change_1 + ind2 * f_change_2;
-}
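-
-# The two tau roots above come from intersecting the ray z + tau * p with the
-# trust-region sphere; a quick derivation in the function's own symbols:
-#   ||z + tau*p||^2 = trust_delta_sq
-#   pp * tau^2 + 2 * pz * tau + (zz - trust_delta_sq) = 0
-#   tau_{1,2} = (-pz +- sqrt (pz^2 - pp * (zz - trust_delta_sq))) / pp
-# and the root with the smaller predicted objective change f_change is kept.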
-
-
-# Computes vector w such that  ||X %*% w - 1|| -> MIN  given  avg(X %*% w) = 1
-# We find z_LS such that ||X %*% z_LS - 1|| -> MIN unconditionally, then scale
-# it to compute  w = c * z_LS  such that  sum(X %*% w) = nrow(X).
-straightenX =
-  function (Matrix[double] X, double eps, int max_iter_CG)
-    return   (Matrix[double] w)
-{
-      w_X = t(colSums(X));
-      lambda_LS = 0.000001 * sum(X ^ 2) / ncol(X);
-      eps_LS = eps * nrow(X);
-      
-      # BEGIN LEAST SQUARES
-      
-      r_LS = - w_X;
-      z_LS = matrix (0.0, rows = ncol(X), cols = 1);
-      p_LS = - r_LS;
-      norm_r2_LS = sum (r_LS ^ 2);
-      i_LS = 0;
-      while (i_LS < max_iter_CG & i_LS < ncol(X) & norm_r2_LS >= eps_LS)
-      {
-        q_LS = t(X) %*% X %*% p_LS;
-        q_LS = q_LS + lambda_LS * p_LS;
-        alpha_LS = norm_r2_LS / sum (p_LS * q_LS);
-        z_LS = z_LS + alpha_LS * p_LS;
-        old_norm_r2_LS = norm_r2_LS;
-        r_LS = r_LS + alpha_LS * q_LS;
-        norm_r2_LS = sum (r_LS ^ 2);
-        p_LS = -r_LS + (norm_r2_LS / old_norm_r2_LS) * p_LS;
-        i_LS = i_LS + 1;
-      }
-      
-      # END LEAST SQUARES
-      
-      w = (nrow(X) / sum (w_X * z_LS)) * z_LS;
-    }
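-
-# Why the final rescaling works, in the function's own notation: for any scalar c,
-# sum (X %*% (c * z_LS)) = c * sum (w_X * z_LS) because w_X = t(colSums(X)), so
-# c = nrow(X) / sum (w_X * z_LS) makes the entries of X %*% w average to exactly 1.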
-
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#  
+# THIS SCRIPT CHOOSES A GLM REGRESSION MODEL IN A STEPWISE ALGORITHM USING AIC
+# EACH GLM REGRESSION IS SOLVED USING NEWTON/FISHER SCORING WITH TRUST REGIONS
+#
+# INPUT PARAMETERS:
+# --------------------------------------------------------------------------------------------
+# NAME  TYPE     DEFAULT   MEANING
+# --------------------------------------------------------------------------------------------
+# X     String    ---      Location to read the matrix X of feature vectors
+# Y     String    ---      Location to read response matrix Y with 1 column
+# B     String    ---      Location to store estimated regression parameters (the betas)
+# S     String    ---      Location to write the selected features ordered as computed by the algorithm
+# O     String    " "      Location to write the printed statistics; by default is standard output
+# link  Int       2        Link function code: 1 = log, 2 = Logit, 3 = Probit, 4 = Cloglog
+# yneg  Double    0.0      Response value for Bernoulli "No" label, usually 0.0 or -1.0
+# icpt  Int       0        Intercept presence, X columns shifting and rescaling:
+#                          0 = no intercept, no shifting, no rescaling;
+#                          1 = add intercept, but neither shift nor rescale X;
+#                          2 = add intercept, shift & rescale X columns to mean = 0, variance = 1
+# tol   Double    0.000001 Tolerance (epsilon)
+# disp  Double    0.0      (Over-)dispersion value, or 0.0 to estimate it from data
+# moi   Int       200      Maximum number of outer (Newton / Fisher Scoring) iterations
+# mii   Int       0        Maximum number of inner (Conjugate Gradient) iterations, 0 = no maximum
+# thr   Double    0.01     Threshold to stop the algorithm: if the decrease in the value of AIC falls below thr,
+#                          no further features are checked and the algorithm stops
+# fmt   String   "text"    The betas matrix output format, such as "text" or "csv"
+# --------------------------------------------------------------------------------------------
+# OUTPUT: Matrix beta, whose size depends on icpt:
+#     icpt=0: ncol(X) x 1;  icpt=1: (ncol(X) + 1) x 1;  icpt=2: (ncol(X) + 1) x 2
+#
+# In addition, in the last run of GLM some statistics are provided in CSV format, one comma-separated
+# name-value pair per line, as follows:
+#
+# NAME                  MEANING
+# -------------------------------------------------------------------------------------------
+# TERMINATION_CODE      A positive integer indicating success/failure as follows:
+#                       1 = Converged successfully; 2 = Maximum number of iterations reached;
+#                       3 = Input (X, Y) out of range; 4 = Distribution/link is not supported
+# BETA_MIN              Smallest beta value (regression coefficient), excluding the intercept
+# BETA_MIN_INDEX        Column index for the smallest beta value
+# BETA_MAX              Largest beta value (regression coefficient), excluding the intercept
+# BETA_MAX_INDEX        Column index for the largest beta value
+# INTERCEPT             Intercept value, or NaN if there is no intercept (if icpt=0)
+# DISPERSION            Dispersion used to scale deviance, provided as the "disp" input parameter
+#                       or estimated (same as DISPERSION_EST) if the "disp" parameter is <= 0
+# DISPERSION_EST        Dispersion estimated from the dataset
+# DEVIANCE_UNSCALED     Deviance from the saturated model, assuming dispersion == 1.0
+# DEVIANCE_SCALED       Deviance from the saturated model, scaled by the DISPERSION value
+# -------------------------------------------------------------------------------------------
+#
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f StepGLM.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y B=OUTPUT_DIR/betas
+#                       S=OUTPUT_DIR_S/selected O=OUTPUT_DIR/stats link=2 yneg=-1.0 icpt=2 tol=0.00000001
+#                       disp=1.0 moi=100 mii=10 thr=0.01 fmt=csv
+#
+# THE StepGLM SCRIPT CURRENTLY SUPPORTS THE BERNOULLI DISTRIBUTION FAMILY AND THE FOLLOWING LINK FUNCTIONS ONLY!
+#      - LOG
+#      - LOGIT
+#      - PROBIT
+#      - CLOGLOG
+       
+fileX = $X;
+fileY = $Y;
+fileB = $B;
+intercept_status = ifdef ($icpt, 0);   
+thr = ifdef ($thr, 0.01); 
+bernoulli_No_label = ifdef ($yneg, 0.0);    # $yneg = 0.0;
+distribution_type = 2;
+
+bernoulli_No_label = as.double (bernoulli_No_label);
+
+# currently only the forward selection strategy is supported: start from one feature and
+# iteratively add features as long as the AIC improves
+dir = "forward";
+
+print("BEGIN STEPWISE GLM SCRIPT");
+print ("Reading X and Y...");
+X_orig = read (fileX);
+Y = read (fileY);
+
+if (distribution_type == 2 & ncol(Y) == 1) {
+       is_Y_negative = ppred (Y, bernoulli_No_label, "==");
+       Y = append (1 - is_Y_negative, is_Y_negative);
+       count_Y_negative = sum (is_Y_negative);
+       if (count_Y_negative == 0) {
+               stop ("StepGLM Input Error: all Y-values encode Bernoulli YES-label, none encode NO-label");
+       }
+       if (count_Y_negative == nrow(Y)) {
+               stop ("StepGLM Input Error: all Y-values encode Bernoulli NO-label, none encode YES-label");
+       }
+}
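+
+# A small worked example of the recoding above (assuming yneg = -1.0): a response
+# column Y = (1, -1, 1) yields is_Y_negative = (0, 1, 0), so the two-column form
+# passed to glm() has rows (1,0), (0,1), (1,0) -- column 1 encodes "Yes", column 2 "No".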
+
+num_records = nrow (X_orig);
+num_features = ncol (X_orig);
+
+# BEGIN STEPWISE GENERALIZED LINEAR MODELS 
+
+if (dir == "forward") {  
+       
+       continue = TRUE;
+       columns_fixed = matrix (0, rows = 1, cols = num_features);
+       columns_fixed_ordered = matrix (0, rows = 1, cols = 1);
+  
+       # X_global stores the best model found at each step 
+       X_global = matrix (0, rows = num_records, cols = 1);
+ 
+       if (intercept_status == 0) {
+               # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
+               [AIC_best] = glm (X_global, Y, 0, num_features, columns_fixed_ordered, " ");
+       } else {
+               # compute AIC of an empty model with only intercept (all Ys are constant)
+               all_ones = matrix (1, rows = num_records, cols = 1);
+               [AIC_best] = glm (all_ones, Y, 0, num_features, columns_fixed_ordered, " ");
+       }
+       print ("Best AIC without any features: " + AIC_best);
+  
+       # First pass to examine single features
+       AICs = matrix (AIC_best, rows = 1, cols = num_features);
+       parfor (i in 1:num_features) {  
+               [AIC_1] = glm (X_orig[,i], Y, intercept_status, num_features, columns_fixed_ordered, " ");
+               AICs[1,i] = AIC_1;
+       }
+  
+       # Determine the best AIC 
+       column_best = 0;        
+       for (k in 1:num_features) {
+               AIC_cur = as.scalar (AICs[1,k]);
+               if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) ) {
+                       column_best = k;
+                       AIC_best = as.scalar(AICs[1,k]);
+               }
+       }
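+       # Example of the acceptance test above: with thr = 0.01 and AIC_best = 500, a
+       # candidate feature is kept only if its AIC is below 500 - |0.01 * 500| = 495,
+       # i.e. it must improve AIC by more than 1% of the current best value.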
+  
+       if (column_best == 0) {
+               print ("AIC of an empty model is " + AIC_best + " and no added feature achieves more than a " + (thr * 100) + "% decrease in AIC!");
+               if (intercept_status == 0) {
+                       # Compute AIC of an empty model with no features and no intercept (all Ys are zero)
+                       [AIC_best] = glm (X_global, Y, 0, num_features, columns_fixed_ordered, fileB);
+               } else {
+                       # compute AIC of an empty model with only intercept (all Ys are constant)
+                       ###all_ones = matrix (1, rows = num_records, cols = 1);
+                       [AIC_best] = glm (all_ones, Y, 0, num_features, columns_fixed_ordered, fileB);
+               }
+       }
+  
+       print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+       columns_fixed[1,column_best] = 1;
+       columns_fixed_ordered[1,1] = column_best;
+       X_global = X_orig[,column_best];                
+  
+       while (continue) {
+               # Subsequent passes over the features
+               parfor (i in 1:num_features) { 
+                       if (as.scalar(columns_fixed[1,i]) == 0) {       
+        
+                               # Construct the feature matrix
+                               X = append (X_global, X_orig[,i]);
+        
+                               [AIC_2] = glm (X, Y, intercept_status, num_features, columns_fixed_ordered, " ");
+                               AICs[1,i] = AIC_2;
+                       }               
+               }
+    
+               # Determine the best AIC
+               for (k in 1:num_features) {
+                       AIC_cur = as.scalar (AICs[1,k]);
+                       if ( (AIC_cur < AIC_best) & ((AIC_best - AIC_cur) > abs (thr * AIC_best)) & (as.scalar(columns_fixed[1,k]) == 0) ) {
+                               column_best = k;
+                               AIC_best = as.scalar(AICs[1,k]);
+                       }
+               }
+    
+               # Append the best feature found (i.e., column) to X_global
+               if (as.scalar(columns_fixed[1,column_best]) == 0) { # new best feature found
+                       print ("Best AIC " + AIC_best + " achieved with feature: " + column_best);
+                       columns_fixed[1,column_best] = 1;
+                       columns_fixed_ordered = append (columns_fixed_ordered, as.matrix(column_best));
+                       if (ncol(columns_fixed_ordered) == num_features) { # all features examined
+                               X_global = append (X_global, X_orig[,column_best]);
+                               continue = FALSE;
+                       } else {
+                               X_global = append (X_global, X_orig[,column_best]);
+                       }
+               } else {
+                       continue = FALSE;
+               }
+       }
+  
+       # run GLM with selected set of features
+       print ("Running GLM with selected features...");
+       [AIC] = glm (X_global, Y, intercept_status, num_features, columns_fixed_ordered, fileB);
+  
+} else {
+       stop ("Currently only forward selection strategy is supported!");
+}
+
+
+################### UDFS USED IN THIS SCRIPT ##################
+
+glm = function (Matrix[Double] X, Matrix[Double] Y, Int intercept_status, Double num_features_orig, Matrix[Double] Selected, String fileB) return (Double AIC) {
+               
+       # distribution family code: 1 = Power, 2 = Bernoulli/Binomial; currently only the Bernoulli distribution family is supported!
+       distribution_type = 2;                          # $dfam = 2;
+       variance_as_power_of_the_mean = 0.0;            # $vpow = 0.0;
+       # link function code: 0 = canonical (depends on distribution), 1 = Power, 2 = Logit, 3 = Probit, 4 = Cloglog, 5 = Cauchit;
+       # currently only log (link = 1), logit (link = 2), probit (link = 3), and cloglog (link = 4) are supported!
+       link_type = ifdef ($link, 2);                   # $link = 2;
+       link_as_power_of_the_mean = 0.0;                # $lpow = 0.0;
+
+       dispersion = ifdef ($disp, 0.0);            # $disp = 0.0;
+       eps = ifdef ($tol, 0.000001);               # $tol  = 0.000001;
+       max_iteration_IRLS = ifdef ($moi, 200);     # $moi  = 200;
+       max_iteration_CG = ifdef ($mii, 0);         # $mii  = 0;
+
+       variance_as_power_of_the_mean = as.double (variance_as_power_of_the_mean);
+       link_as_power_of_the_mean = as.double (link_as_power_of_the_mean);
+
+       dispersion = as.double (dispersion);
+       eps = as.double (eps);              
+
+       # Default values for output statistics:
+       regularization = 0.0;
+       termination_code     = 0.0;
+       min_beta             = 0.0 / 0.0;
+       i_min_beta           = 0.0 / 0.0;
+       max_beta             = 0.0 / 0.0;
+       i_max_beta           = 0.0 / 0.0;
+       intercept_value      = 0.0 / 0.0;
+       dispersion           = 0.0 / 0.0;
+       estimated_dispersion = 0.0 / 0.0;
+       deviance_nodisp      = 0.0 / 0.0;
+       deviance             = 0.0 / 0.0;                                 
+                  
+       #####   INITIALIZE THE PARAMETERS   #####
+                  
+    num_records  = nrow (X);
+    num_features = ncol (X);
+    zeros_r = matrix (0, rows = num_records, cols = 1);
+    ones_r = 1 + zeros_r;
+                  
+    # Introduce the intercept, shift and rescale the columns of X if needed
+                  
+    if (intercept_status == 1 | intercept_status == 2) { # add the intercept column
+               X = append (X, ones_r);
+        num_features = ncol (X);
+       }
+                  
+    scale_lambda = matrix (1, rows = num_features, cols = 1);
+    if (intercept_status == 1 | intercept_status == 2) {
+               scale_lambda [num_features, 1] = 0;
+    }
+                  
+    if (intercept_status == 2) {  # scale-&-shift X columns to mean 0, variance 1
+               # Important assumption: X [, num_features] = ones_r
+        avg_X_cols = t(colSums(X)) / num_records;
+        var_X_cols = (t(colSums (X ^ 2)) - num_records * (avg_X_cols ^ 2)) / (num_records - 1);
+        is_unsafe = ppred (var_X_cols, 0.0, "<=");
+        scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe);
+        scale_X [num_features, 1] = 1;
+        shift_X = - avg_X_cols * scale_X;
+        shift_X [num_features, 1] = 0;
+        rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2);
+       } else {
+        scale_X = matrix (1, rows = num_features, cols = 1);
+        shift_X = matrix (0, rows = num_features, cols = 1);
+        rowSums_X_sq = rowSums (X ^ 2);
+    }  
+                  
+    # Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2)
+    # with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale.
+    # The transform is then associatively applied to the other side of the expression,
+    # and is rewritten via "scale_X" and "shift_X" as follows:
+    #
+    # ssX_A  = (SHIFT/SCALE TRANSFORM) %*% A    --- is rewritten as:
+    # ssX_A  = diag (scale_X) %*% A;
+    # ssX_A [num_features, ] = ssX_A [num_features, ] + t(shift_X) %*% A;
+    #
+    # tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A   --- is rewritten as:
+    # tssX_A = diag (scale_X) %*% A + shift_X %*% A [num_features, ];
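+    #
+    # A small sanity sketch of the rewrite (hypothetical case: two features plus the
+    # intercept column): with scale_X = (s1, s2, 1) and shift_X = (h1, h2, 0), the
+    # dense transform would be
+    #         [ s1  0   0 ]
+    #   T  =  [ 0   s2  0 ]
+    #         [ h1  h2  1 ]
+    # so T %*% A is indeed diag (scale_X) %*% A with t(shift_X) %*% A added to the
+    # last row, exactly as in the "ssX_A" rewrite above.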
+                  
+    # Initialize other input-dependent parameters
+                  
+    lambda = scale_lambda * regularization;
+    if (max_iteration_CG == 0) {
+               max_iteration_CG = num_features;
+    }
+                  
+    # Set up the canonical link, if requested [Then we have: Var(mu) * (d link / d mu) = const]
+                  
+    if (link_type == 0) {
+               if (distribution_type == 1) {
+                       link_type = 1;
+            link_as_power_of_the_mean = 1.0 - variance_as_power_of_the_mean;
+               } else { 
+                       if (distribution_type == 2) {
+                               link_type = 2;
+                       }   
+               }   
+       }
+                                
+    # For power distributions and/or links, we use two constants,
+    # "variance as power of the mean" and "link_as_power_of_the_mean",
+    # to specify the variance and the link as arbitrary powers of the
+    # mean.  However, the variance-powers of 1.0 (Poisson family) and
+    # 2.0 (Gamma family) have to be treated as special cases, because
+    # these values integrate into logarithms.  The link-power of 0.0
+    # is also special as it represents the logarithm link.
+                  
+    num_response_columns = ncol (Y);
+    is_supported = 0;              
+       if (num_response_columns == 2 & distribution_type == 2 & link_type >= 1 & link_type <= 4) { # BERNOULLI DISTRIBUTION
+               is_supported = 1;                         
+       }
+       if (num_response_columns == 1 & distribution_type == 2) {
+               print ("Error: Bernoulli response matrix has not been converted into two-column format.");
+    }
+
+       if (is_supported == 1) {
+                    
+               #####   INITIALIZE THE BETAS   #####
+                    
+        [beta, saturated_log_l, isNaN] = 
+                       glm_initialize (X, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean, intercept_status, max_iteration_CG);
+                                         
+               # print(" --- saturated logLik " + saturated_log_l);
+                                         
+        if (isNaN == 0) {
+                      
+                       #####  START OF THE MAIN PART  #####
+                      
+            sum_X_sq = sum (rowSums_X_sq);
+            trust_delta = 0.5 * sqrt (num_features) / max (sqrt (rowSums_X_sq));
+            ###  max_trust_delta = trust_delta * 10000.0;
+            log_l = 0.0;
+            deviance_nodisp = 0.0;
+            new_deviance_nodisp = 0.0;
+            isNaN_log_l = 2;
+            newbeta = beta;
+            g = matrix (0.0, rows = num_features, cols = 1);
+            g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
+            accept_new_beta = 1;
+            reached_trust_boundary = 0;
+            neg_log_l_change_predicted = 0.0;
+            i_IRLS = 0;
+                      
+            # print ("BEGIN IRLS ITERATIONS...");
+                      
+            ssX_newbeta = diag (scale_X) %*% newbeta;
+            ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
+            all_linear_terms = X %*% ssX_newbeta;
+                      
+            [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
+                               (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+                                         
+            if (isNaN_new_log_l == 0) {
+                               new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
+                new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
+            }
+                      
+            while (termination_code == 0) {
+                               accept_new_beta = 1;
+                        
+                if (i_IRLS > 0) {
+                                       if (isNaN_log_l == 0) {
+                                               accept_new_beta = 0;
+                    }
+                          
+                    # Decide whether to accept a new iteration point and update the trust region
+                    # See Alg. 4.1 on p. 69 of "Numerical Optimization" 2nd ed. by Nocedal and Wright
+                          
+                    rho = (- new_log_l + log_l) / neg_log_l_change_predicted;
+                                       if (rho < 0.25 | isNaN_new_log_l == 1) {
+                                               trust_delta = 0.25 * trust_delta;
+                                       }
+                                       if (rho > 0.75 & isNaN_new_log_l == 0 & reached_trust_boundary == 1) {
+                                               trust_delta = 2 * trust_delta;
+                            
+                                               ### if (trust_delta > max_trust_delta) {
+                                               ###     trust_delta = max_trust_delta;
+                                               ### }
+                                       }
+                                       if (rho > 0.1 & isNaN_new_log_l == 0) {
+                                               accept_new_beta = 1;
+                                       }
+                               }
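+                    # Standard trust-region bookkeeping: rho compares the actual to
+                    # the predicted decrease; rho < 1/4 shrinks the region, rho > 3/4
+                    # on a boundary step doubles it, and the new point is accepted
+                    # only when rho > 0.1 (and the new log-likelihood is not NaN).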
+                        
+                if (accept_new_beta == 1) {
+                                       beta = newbeta;  log_l = new_log_l;  deviance_nodisp = new_deviance_nodisp;  isNaN_log_l = isNaN_new_log_l;
+                          
+                                       [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+                          
+                                       # We introduced these variables to avoid roundoff errors:
+                                       #     g_Y = y_residual / (y_var * link_grad);
+                                       #     w   = 1.0 / (y_var * link_grad * link_grad);
+                          
+                                       gXY = - t(X) %*% g_Y;
+                                       g = diag (scale_X) %*% gXY + shift_X %*% gXY [num_features, ];
+                                       g_norm = sqrt (sum ((g + lambda * beta) ^ 2));
+                               }
+                        
+                [z, neg_log_l_change_predicted, num_CG_iters, reached_trust_boundary] = 
+                                       get_CG_Steihaug_point (X, scale_X, shift_X, w, g, beta, lambda, trust_delta, max_iteration_CG);
+                        
+                               newbeta = beta + z;
+                        
+                               ssX_newbeta = diag (scale_X) %*% newbeta;
+                               ssX_newbeta [num_features, ] = ssX_newbeta [num_features, ] + t(shift_X) %*% newbeta;
+                               all_linear_terms = X %*% ssX_newbeta;
+                        
+                               [new_log_l, isNaN_new_log_l] = glm_log_likelihood_part
+                                       (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+                        
+                               if (isNaN_new_log_l == 0) {
+                                       new_deviance_nodisp = 2.0 * (saturated_log_l - new_log_l);
+                                       new_log_l = new_log_l - 0.5 * sum (lambda * newbeta ^ 2);
+                               }
+                        
+                               log_l_change = new_log_l - log_l;               # R's criterion for termination: |dev - devold|/(|dev| + 0.1) < eps
+                       
+                               if (reached_trust_boundary == 0 & isNaN_new_log_l == 0 & 
+                                  (2.0 * abs (log_l_change) < eps * (deviance_nodisp + 0.1) | abs (log_l_change) < (abs (log_l) + abs (new_log_l)) * 0.00000000000001) ) {
+                                       termination_code = 1;
+                               }
+                               rho = - log_l_change / neg_log_l_change_predicted;
+                               z_norm = sqrt (sum (z * z));
+                                              
+                               i_IRLS = i_IRLS + 1;                        
+                        
+                               if (i_IRLS == max_iteration_IRLS) {
+                                       termination_code = 2;
+                               }
+                       }
+                      
+            beta = newbeta;
+            log_l = new_log_l;
+            deviance_nodisp = new_deviance_nodisp;
+                     
+            #---------------------------- last part
+
+                       if (termination_code != 1) {
+                               print ("One of the runs of GLM did not converge in " + i_IRLS + " steps!");
+                       }
+                      
+            ##### COMPUTE AIC ##### 
+            
+                       if (distribution_type == 2 & link_type >= 1 & link_type <= 4) {                 
+                               AIC = -2 * log_l;
+                               if (sum (X) != 0) {
+                                       AIC = AIC + 2 * num_features;   
+                               }
+                       } else {
+                               stop ("Currently only the Bernoulli distribution family and the following link functions are supported: log, logit, probit, and cloglog!");
+                       }
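+                       # This is the usual AIC = 2*k - 2*ln(L-hat) with k = num_features;
+                       # the "sum (X) != 0" guard skips the 2*k penalty for the all-zero
+                       # feature matrix used when scoring the empty model.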
+                                                  
+            if (fileB != " ") {
+                               fileO = ifdef ($O, " ");
+                               fileS = $S;
+                               fmt  = ifdef ($fmt, "text");    
+                       
+                               # Output which features give the best AIC and are used in the final GLM
+                               write (Selected, fileS, format=fmt);
+               
+                               ssX_beta = diag (scale_X) %*% beta;
+                ssX_beta [num_features, ] = ssX_beta [num_features, ] + t(shift_X) %*% beta;
+                               if (intercept_status == 2) {
+                                       beta_out = append (ssX_beta, beta);
+                } else {
+                                       beta_out = ssX_beta;
+                }
+                
+                if (intercept_status == 0 & num_features == 1) {
+                                       p = sum (ppred (X, 1, "=="));
+                                       if (p == num_records) {
+                                               beta_out = beta_out[1,];
+                                       }                                       
+                } 
+
+                                                               
+                if (intercept_status == 1 | intercept_status == 2) {
+                                       intercept_value = castAsScalar (beta_out [num_features, 1]);
+                    beta_noicept = beta_out [1 : (num_features - 1), 1];
+                } else {
+                                       beta_noicept = beta_out [1 : num_features, 1];
+                }
+                min_beta = min (beta_noicept);
+                max_beta = max (beta_noicept);
+                tmp_i_min_beta = rowIndexMin (t(beta_noicept));
+                i_min_beta = castAsScalar (tmp_i_min_beta [1, 1]);
+                tmp_i_max_beta = rowIndexMax (t(beta_noicept));
+                i_max_beta = castAsScalar (tmp_i_max_beta [1, 1]);
+                        
+                #####  OVER-DISPERSION PART  #####
+                      
+                all_linear_terms = X %*% ssX_beta;
+                               [g_Y, w] = glm_dist (all_linear_terms, Y, distribution_type, variance_as_power_of_the_mean, link_type, link_as_power_of_the_mean);
+                        
+                pearson_residual_sq = g_Y ^ 2 / w;
+                pearson_residual_sq = replace (target = pearson_residual_sq, pattern = 0.0/0.0, replacement = 0);
+                # pearson_residual_sq = (y_residual ^ 2) / y_var;
+                        
+                if (num_records > num_features) {
+                                       estimated_dispersion = sum (pearson_residual_sq) / (num_records - num_features);
+                }
+                if (dispersion <= 0.0) {
+                                       dispersion = estimated_dispersion;
+                }
+                deviance = deviance_nodisp / dispersion;
+                        
+                #####  END OF THE MAIN PART  #####
+                        
+                str = "BETA_MIN," + min_beta;
+                str = append (str, "BETA_MIN_INDEX," + i_min_beta);
+                str = append (str, "BETA_MAX," + max_beta);
+                str = append (str, "BETA_MAX_INDEX," + i_max_beta);
+                str = append (str, "INTERCEPT," + intercept_value);
+                str = append (str, "DISPERSION," + dispersion);
+                str = append (str, "DISPERSION_EST," + estimated_dispersion);
+                str = append (str, "DEVIANCE_UNSCALED," + deviance_nodisp);
+                str = append (str, "DEVIANCE_SCALED," + deviance);
+                        
+                if (fileO != " ") {
+                                       write (str, fileO);
+                } else {
+                                       print (str);
+                }
+                       
+                               # Prepare the output matrix
+                               print ("Writing the output matrix...");
+                if (intercept_status == 0 & num_features == 1) { 
+                                       if (p == num_records) {
+                                               beta_out_tmp = matrix (0, rows = num_features_orig + 1, cols = 1);
+                                               beta_out_tmp[num_features_orig + 1,] = beta_out;
+                                               beta_out = beta_out_tmp;
+                                               write (beta_out, fileB, format=fmt);
+                                               stop ("");
+                                       } else if (sum (X) == 0) {
+                                               beta_out = matrix (0, rows = num_features_orig, cols = 1);
+                                               write (beta_out, fileB, format=fmt);
+                                               stop ("");
+                                       }
+                               }
+
+                               no_selected = ncol (Selected);
+                               max_selected = max (Selected);
+                               last = max_selected + 1;        
+               
+                               if (intercept_status != 0) {
+               
+                                       Selected_ext = append (Selected, as.matrix (last));                     
+                                       P1 = table (seq (1, ncol (Selected_ext)), t(Selected_ext)); 
+
+                                       if (intercept_status == 2) {
+                       
+                                               P1_ssX_beta = P1 * ssX_beta;
+                                               P2_ssX_beta = colSums (P1_ssX_beta);
+                                               P1_beta = P1 * beta;
+                                               P2_beta = colSums (P1_beta);
+                               
+                                               if (max_selected < num_features_orig) {
+                                               
+                                                       P2_ssX_beta = append (P2_ssX_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                                                       P2_beta = append (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                                                       
+                                                       P2_ssX_beta[1, num_features_orig+1] = P2_ssX_beta[1, max_selected + 1]; 
+                                                       P2_ssX_beta[1, max_selected + 1] = 0;
+                                                       
+                                                       P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; 
+                                                       P2_beta[1, max_selected + 1] = 0;
+
+                                               }
+                                               beta_out = append (t(P2_ssX_beta), t(P2_beta));
+                               
+                                       } else {
+                       
+                                               P1_beta = P1 * beta;
+                                               P2_beta = colSums (P1_beta);
+                               
+                                               if (max_selected < num_features_orig) {
+                                                       P2_beta = append (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                                                       P2_beta[1, num_features_orig+1] = P2_beta[1, max_selected + 1]; 
+                                                       P2_beta[1, max_selected + 1] = 0;
+                                               }
+                                               beta_out = t(P2_beta);
+                               
+                                       }
+                               } else {
+               
+                                       P1 = table (seq (1, no_selected), t(Selected)); 
+                                       P1_beta = P1 * beta;
+                                       P2_beta = colSums (P1_beta);    
+
+                                       if (max_selected < num_features_orig) {
+                                               P2_beta = append (P2_beta, matrix (0, rows=1, cols=(num_features_orig - max_selected)));
+                                       }               
+
+                                       beta_out = t(P2_beta);  
+                               }
+       
+                               write ( beta_out, fileB, format=fmt );
+                       
+                       }
+                      
+               } else { 
+                       stop ("Input matrices X and/or Y are out of range!"); 
+        }
+       } else { 
+               stop ("Response matrix with " + num_response_columns + " columns, distribution family (" + distribution_type + ", " + variance_as_power_of_the_mean
+               + ") and link family (" + link_type + ", " + link_as_power_of_the_mean + ") are NOT supported together.");
+    }
+}
+
+glm_initialize = function (Matrix[double] X, Matrix[double] Y, int dist_type, double var_power, int link_type, double link_power, int icept_status, int max_iter_CG)
+  return (Matrix[double] beta, double saturated_log_l, int isNaN)
+{
+    saturated_log_l = 0.0;
+    isNaN = 0;
+    y_corr = Y [, 1];
+    if (dist_type == 2) {
+      n_corr = rowSums (Y);
+      is_n_zero = ppred (n_corr, 0.0, "==");
+      y_corr = Y [, 1] / (n_corr + is_n_zero) + (0.5 - Y [, 1]) * is_n_zero;
+    }
+    linear_terms = y_corr;
+    if (dist_type == 1 & link_type == 1) { # POWER DISTRIBUTION
+      if          (link_power ==  0.0) {
+        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+          is_zero_y_corr = ppred (y_corr, 0.0, "==");
+          linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        } else { isNaN = 1; }
+      } else { if (link_power ==  1.0) {
+        linear_terms = y_corr;
+      } else { if (link_power == -1.0) {
+        linear_terms = 1.0 / y_corr;
+      } else { if (link_power ==  0.5) {
+        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+          linear_terms = sqrt (y_corr);
+        } else { isNaN = 1; }
+      } else { if (link_power >   0.0) {
+        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+          is_zero_y_corr = ppred (y_corr, 0.0, "==");
+          linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+        } else { isNaN = 1; }
+      } else {
+        if (sum (ppred (y_corr, 0.0, "<=")) == 0) {
+          linear_terms = y_corr ^ link_power;
+        } else { isNaN = 1; }
+      }}}}}
+    }
+    if (dist_type == 2 & link_type >= 1 & link_type <= 5)
+    { # BINOMIAL/BERNOULLI DISTRIBUTION
+      if          (link_type == 1 & link_power == 0.0)  { # Binomial.log
+        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+          is_zero_y_corr = ppred (y_corr, 0.0, "==");
+          linear_terms = log (y_corr + is_zero_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        } else { isNaN = 1; }
+      } else { if (link_type == 1 & link_power >  0.0)  { # Binomial.power_nonlog pos
+        if (sum (ppred (y_corr, 0.0, "<")) == 0) {
+          is_zero_y_corr = ppred (y_corr, 0.0, "==");
+          linear_terms = (y_corr + is_zero_y_corr) ^ link_power - is_zero_y_corr;
+        } else { isNaN = 1; }
+      } else { if (link_type == 1)                      { # Binomial.power_nonlog neg
+        if (sum (ppred (y_corr, 0.0, "<=")) == 0) {
+          linear_terms = y_corr ^ link_power;
+        } else { isNaN = 1; }
+      } else { 
+        is_zero_y_corr = ppred (y_corr, 0.0, "<=");
+        is_one_y_corr  = ppred (y_corr, 1.0, ">=");
+        y_corr = y_corr * (1.0 - is_zero_y_corr) * (1.0 - is_one_y_corr) + 0.5 * (is_zero_y_corr + is_one_y_corr);
+        if (link_type == 2)                           { # Binomial.logit
+          linear_terms = log (y_corr / (1.0 - y_corr)) 
+          + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        } else { if (link_type == 3)                  { # Binomial.probit
+          y_below_half = y_corr + (1.0 - 2.0 * y_corr) * ppred (y_corr, 0.5, ">");
+          t = sqrt (- 2.0 * log (y_below_half));
+          approx_inv_Gauss_CDF = - t + (2.515517 + t * (0.802853 + t * 0.010328)) / (1.0 + t * (1.432788 + t * (0.189269 + t * 0.001308)));
+          linear_terms = approx_inv_Gauss_CDF * (1.0 - 2.0 * ppred (y_corr, 0.5, ">"))
+          + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        } else { if (link_type == 4)                  { # Binomial.cloglog
+          linear_terms = log (- log (1.0 - y_corr))
+          - log (- log (0.5)) * (is_zero_y_corr + is_one_y_corr)
+          + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        } else { if (link_type == 5)                  { # Binomial.cauchit
+          linear_terms = tan ((y_corr - 0.5) * 3.1415926535897932384626433832795)
+          + is_one_y_corr / (1.0 - is_one_y_corr) - is_zero_y_corr / (1.0 - is_zero_y_corr);
+        }}  }}}}}
+    }
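+    # (The rational function in the probit branch above is the well-known
+    #  Abramowitz & Stegun rational approximation to the inverse Gaussian CDF,
+    #  with absolute error below roughly 4.5e-4.)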
+    
+    if (isNaN == 0) {
+      [saturated_log_l, isNaN] = 
+        glm_log_likelihood_part (linear_terms, Y, dist_type, var_power, link_type, link_power);
+    }
+    
+    if ((dist_type == 1 & link_type == 1 & link_power == 0.0) |
+          (dist_type == 2 & link_type >= 2))
+    {    
+      desired_eta = 0.0;
+    } else { if (link_type == 1 & link_power == 0.0) {
+      desired_eta = log (0.5);
+    } else { if (link_type == 1) {
+      desired_eta = 0.5 ^ link_power;
+    } else {
+      desired_eta = 0.5;
+    }}}
+    
+    beta = matrix (0.0, rows = ncol(X), cols = 1);
+    
+    if (desired_eta != 0.0) {
+      if (icept_status == 1 | icept_status == 2) {
+        beta [nrow(beta), 1] = desired_eta;
+      } else {
+        # We want: avg (X %*% ssX_transform %*% beta) = desired_eta
+        # Note that "ssX_transform" is trivial here, hence ignored
+        
+        beta = straightenX (X, 0.000001, max_iter_CG);  
+        beta = beta * desired_eta;
+      }   }   }
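+
+# Initialization heuristic above: beta is chosen so that the average linear term
+# equals desired_eta -- 0.0 for the binomial links handled by this script, and
+# 0.5 ^ link_power (or log (0.5) when link_power = 0) for the power links;
+# straightenX supplies a w with avg (X %*% w) = 1 that is then scaled by desired_eta.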
+
+
+glm_dist = function (Matrix[double] linear_terms, Matrix[double] Y,
+                     int dist_type, double var_power, int link_type, double link_power)
+  return (Matrix[double] g_Y, Matrix[double] w)
+# ORIGINALLY we returned more meaningful vectors, namely:
+# Matrix[double] y_residual    : y - y_mean, i.e. y observed - y predicted
+# Matrix[double] link_gradient : derivative of the link function
+# Matrix[double] var_function  : variance without dispersion, i.e. the V(mu) function
+# BUT, this caused roundoff errors, so we had to compute "directly useful" vectors
+# and skip over the "meaningful intermediaries".  Now we output these two variables:
+#     g_Y = y_residual / (var_function * link_gradient);
+#     w   = 1.0 / (var_function * link_gradient ^ 2);
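+# In standard IRLS notation, with mean mu, variance function V(mu) and link
+# derivative d eta / d mu, these correspond to
+#     g_Y = (y - mu) / (V(mu) * (d eta/d mu))        # scaled residual
+#     w   = 1.0 / (V(mu) * (d eta/d mu)^2)           # IRLS weight
+# so that t(X) %*% (w * X) approximates the Fisher information and t(X) %*% g_Y
+# the score vector used in the trust-region Newton step above.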
+{
+    num_records = nrow (linear_terms);
+    zeros_r = matrix (0.0, rows = num_records, cols = 1);
+    ones_r = 1 + zeros_r;
+    g_Y  = zeros_r;
+    w  = zeros_r;
+    
+    # Some constants
+    
+    one_over_sqrt_two_pi = 0.39894228040143267793994605993438;
+    ones_2 = matrix (1.0, rows = 1, cols = 2);
+    p_one_m_one = ones_2;
+    p_one_m_one [1, 2] = -1.0;
+    m_one_p_one = ones_2;
+    m_one_p_one [1, 1] = -1.0;
+    zero_one = ones_2;
+    zero_one [1, 1] = 0.0;
+    one_zero = ones_2;
+    one_zero [1, 2] =

<TRUNCATED>
