http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Cox-predict.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/Cox-predict.dml b/scripts/algorithms/Cox-predict.dml index 6f2bec0..7d444fd 100644 --- a/scripts/algorithms/Cox-predict.dml +++ b/scripts/algorithms/Cox-predict.dml @@ -1,181 +1,181 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A COX PROPORTIONAL HAZARD REGRESSION MODEL TO A NEW (TEST) DATASET -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- Location to read the input matrix containing the survival data with the following schema: -# - X[,1]: timestamps -# - X[,2]: whether an event occurred (1) or data is censored (0) -# - X[,3:]: feature vectors (excluding the baseline columns) used for model fitting -# RT String --- Location to read column matrix RT containing the (order preserving) recoded timestamps from X -# M String --- Location to read matrix M containing the fitted Cox model with the following schema: -# - M[,1]: betas -# - M[,2]: exp(betas) -# - M[,3]: standard error of betas -# - M[,4]: Z -# - M[,5]: p-value -# - M[,6]: lower 100*(1-alpha)% confidence interval of betas -# - M[,7]: upper 100*(1-alpha)% confidence interval of betas -# Y String --- Location to read matrix Y used for prediction -# COV String --- Location to read the variance-covariance matrix of the betas -# MF String --- Location to read column indices of X excluding the baseline factors if available -# P String --- Location to store matrix P containing the results of prediction -# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) -# --------------------------------------------------------------------------------------------- -# OUTPUT: -# 1- A matrix P with the following schema: -# P[,1]: linear predictors relative to a baseline which contains the mean values for each feature -# i.e., (Y[3:] - colMeans (X[3:])) %*% b -# P[,2]: standard error of linear predictors -# P[,3]: risk relative to a baseline which contains the mean values for each feature -# 
i.e., exp ((Y[3:] - colMeans (X[3:])) %*% b) -# P[,4]: standard error of risk -# P[,5]: estimates of cumulative hazard -# P[,6]: standard error of the estimates of cumulative hazard -# ------------------------------------------------------------------------------------------- -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f cox-predict.dml -nvargs X=INPUT_DIR/X RT=INPUT_DIR/RT M=INPUT_DIR/M Y=INPUT_DIR/Y -# COV=INTPUT_DIR/COV MF=INPUT_DIR/MF P=OUTPUT_DIR/P fmt=csv - -fileX = $X; -fileRT = $RT; -fileMF = $MF; -fileY = $Y; -fileM = $M; -fileCOV = $COV; -fileP = $P; - -# Default values of some parameters -fmtO = ifdef ($fmt, "text"); # $fmt="text" - -X_orig = read (fileX); -RT_X = read (fileRT); -Y_orig = read (fileY); -M = read (fileM); -b = M[,1]; -COV = read (fileCOV); - -col_ind = read (fileMF); -tab = table (col_ind, seq (1, nrow (col_ind)), ncol (Y_orig), nrow (col_ind)); -Y_orig = Y_orig %*% tab; - - -# Y and X have the same dimensions and schema -if (ncol (Y_orig) != ncol (X_orig)) { - stop ("Y has a wrong number of columns!"); -} - -X = X_orig[,3:ncol (X_orig)]; -T_X = X_orig[,1]; -E_X = X_orig[,2]; -D = ncol (X); -N = nrow (X); -Y_orig = order (target = Y_orig, by = 1); -Y = Y_orig[,3:ncol (X_orig)]; -T_Y = Y_orig[,1]; - -col_means = colMeans (X); -ones = matrix (1, rows = nrow (Y), cols = 1); -Y_rel = Y - (ones %*% col_means); - -##### compute linear predictors -LP = Y_rel %*% b; -# compute standard error of linear predictors using the Delta method -se_LP = diag(sqrt (Y_rel %*% COV %*% t(Y_rel))); - -##### compute risk -R = exp (Y_rel %*% b); -# compute standard error of risk using the Delta method -se_R = diag(sqrt ((Y_rel * R) %*% COV %*% t(Y_rel * R))) / sqrt (exp (LP)); - -##### compute estimates of cumulative hazard together with their standard errors: -# 1. col contains cumulative hazard estimates -# 2. 
col contains standard errors for cumulative hazard estimates - -d_r = aggregate (target = E_X, groups = RT_X, fn = "sum"); -e_r = aggregate (target = RT_X, groups = RT_X, fn = "count"); -Idx = cumsum (e_r); -all_times = table (seq (1, nrow (Idx), 1), Idx) %*% T_X; # distinct event times - -event_times = removeEmpty (target = ppred (d_r, 0, ">") * all_times, margin = "rows"); -num_distinct_event = nrow (event_times); - -num_distinct = nrow (all_times); # no. of distinct timestamps censored or uncensored -I_rev = table (seq (1, num_distinct, 1), seq (num_distinct, 1, -1)); -e_r_rev_agg = cumsum (I_rev %*% e_r); -select = t (colSums (table (seq (1, num_distinct), e_r_rev_agg))); - -min_event_time = min (event_times); -max_event_time = max (event_times); -T_Y = T_Y + (min_event_time * ppred (T_Y, min_event_time, "<")); -T_Y = T_Y + (max_event_time * ppred (T_Y, max_event_time, ">")); - -Ind = outer (T_Y, t (event_times), ">="); -Ind = table (seq (1, nrow (T_Y)), rowIndexMax (Ind), nrow (T_Y), num_distinct_event); - -exp_Xb = exp (X %*% b); -exp_Xb_agg = aggregate (target = exp_Xb, groups = RT_X, fn = "sum"); -exp_Xb_cum = I_rev %*% cumsum (I_rev %*% exp_Xb_agg); - -H0 = cumsum (removeEmpty (target = d_r / exp_Xb_cum, margin = "rows")); -P1 = cumsum (removeEmpty (target = d_r / exp_Xb_cum ^ 2, margin = "rows")); -X_exp_Xb = X * exp (X %*% b); - -I_rev_all = table (seq (1, N, 1), seq (N, 1, -1)); -X_exp_Xb_rev_agg = cumsum (I_rev_all %*% X_exp_Xb); -X_exp_Xb_rev_agg = removeEmpty (target = X_exp_Xb_rev_agg * select, margin = "rows"); -X_exp_Xb_cum = I_rev %*% X_exp_Xb_rev_agg; -P2 = cumsum (removeEmpty (target = (X_exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows")); - -exp_Yb = exp (Y %*% b); -exp_Yb_2 = exp_Yb ^ 2; -Y_exp_Yb = Y * exp (Y %*% b); - -# estimates of cumulative hazard -H = exp_Yb * (Ind %*% H0); - -# term1 -term1 = exp_Yb_2 * (Ind %*% P1); - -# term2 -P3 = cumsum (removeEmpty (target = (exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows")); -P4 = (Ind 
%*% P2) * exp_Yb; -P5 = Y_exp_Yb * (Ind %*% P3); -term2 = P4 - P5; - -# standard error of the estimates of cumulative hazard -se_H = sqrt (term1 + rowSums((term2 %*% COV) * term2)); - -# prepare output matrix -P = matrix (0, rows = nrow (Y), cols = 6); -P[,1] = LP; -P[,2] = se_LP; -P[,3] = R; -P[,4] = se_R; -P[,5] = H; -P[,6] = se_H; -write (P, fileP, format=fmtO); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A COX PROPORTIONAL HAZARD REGRESSION MODEL TO A NEW (TEST) DATASET +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read the input matrix containing the survival data with the following schema: +# - X[,1]: timestamps +# - X[,2]: whether an event occurred (1) or data is censored (0) +# - X[,3:]: feature vectors (excluding the baseline columns) used for model fitting +# RT String --- Location to read column matrix RT containing the (order preserving) recoded timestamps from X +# M String --- Location to read matrix M containing the fitted Cox model with the following schema: +# - M[,1]: betas +# - M[,2]: exp(betas) +# - M[,3]: standard error of betas +# - M[,4]: Z +# - M[,5]: p-value +# - M[,6]: lower 100*(1-alpha)% confidence interval of betas +# - M[,7]: upper 100*(1-alpha)% confidence interval of betas +# Y String --- Location to read matrix Y used for prediction +# COV String --- Location to read the variance-covariance matrix of the betas +# MF String --- Location to read column indices of X excluding the baseline factors if available +# P String --- Location to store matrix P containing the results of prediction +# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# 1- A matrix P with the following schema: +# P[,1]: linear predictors relative to a baseline which contains the mean values for each feature +# i.e., (Y[3:] - colMeans (X[3:])) %*% b +# P[,2]: standard error of linear predictors +# P[,3]: risk relative to a baseline which contains the mean values for each feature +# 
i.e., exp ((Y[3:] - colMeans (X[3:])) %*% b) +# P[,4]: standard error of risk +# P[,5]: estimates of cumulative hazard +# P[,6]: standard error of the estimates of cumulative hazard +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f cox-predict.dml -nvargs X=INPUT_DIR/X RT=INPUT_DIR/RT M=INPUT_DIR/M Y=INPUT_DIR/Y +# COV=INTPUT_DIR/COV MF=INPUT_DIR/MF P=OUTPUT_DIR/P fmt=csv + +fileX = $X; +fileRT = $RT; +fileMF = $MF; +fileY = $Y; +fileM = $M; +fileCOV = $COV; +fileP = $P; + +# Default values of some parameters +fmtO = ifdef ($fmt, "text"); # $fmt="text" + +X_orig = read (fileX); +RT_X = read (fileRT); +Y_orig = read (fileY); +M = read (fileM); +b = M[,1]; +COV = read (fileCOV); + +col_ind = read (fileMF); +tab = table (col_ind, seq (1, nrow (col_ind)), ncol (Y_orig), nrow (col_ind)); +Y_orig = Y_orig %*% tab; + + +# Y and X have the same dimensions and schema +if (ncol (Y_orig) != ncol (X_orig)) { + stop ("Y has a wrong number of columns!"); +} + +X = X_orig[,3:ncol (X_orig)]; +T_X = X_orig[,1]; +E_X = X_orig[,2]; +D = ncol (X); +N = nrow (X); +Y_orig = order (target = Y_orig, by = 1); +Y = Y_orig[,3:ncol (X_orig)]; +T_Y = Y_orig[,1]; + +col_means = colMeans (X); +ones = matrix (1, rows = nrow (Y), cols = 1); +Y_rel = Y - (ones %*% col_means); + +##### compute linear predictors +LP = Y_rel %*% b; +# compute standard error of linear predictors using the Delta method +se_LP = diag(sqrt (Y_rel %*% COV %*% t(Y_rel))); + +##### compute risk +R = exp (Y_rel %*% b); +# compute standard error of risk using the Delta method +se_R = diag(sqrt ((Y_rel * R) %*% COV %*% t(Y_rel * R))) / sqrt (exp (LP)); + +##### compute estimates of cumulative hazard together with their standard errors: +# 1. col contains cumulative hazard estimates +# 2. 
col contains standard errors for cumulative hazard estimates + +d_r = aggregate (target = E_X, groups = RT_X, fn = "sum"); +e_r = aggregate (target = RT_X, groups = RT_X, fn = "count"); +Idx = cumsum (e_r); +all_times = table (seq (1, nrow (Idx), 1), Idx) %*% T_X; # distinct event times + +event_times = removeEmpty (target = ppred (d_r, 0, ">") * all_times, margin = "rows"); +num_distinct_event = nrow (event_times); + +num_distinct = nrow (all_times); # no. of distinct timestamps censored or uncensored +I_rev = table (seq (1, num_distinct, 1), seq (num_distinct, 1, -1)); +e_r_rev_agg = cumsum (I_rev %*% e_r); +select = t (colSums (table (seq (1, num_distinct), e_r_rev_agg))); + +min_event_time = min (event_times); +max_event_time = max (event_times); +T_Y = T_Y + (min_event_time * ppred (T_Y, min_event_time, "<")); +T_Y = T_Y + (max_event_time * ppred (T_Y, max_event_time, ">")); + +Ind = outer (T_Y, t (event_times), ">="); +Ind = table (seq (1, nrow (T_Y)), rowIndexMax (Ind), nrow (T_Y), num_distinct_event); + +exp_Xb = exp (X %*% b); +exp_Xb_agg = aggregate (target = exp_Xb, groups = RT_X, fn = "sum"); +exp_Xb_cum = I_rev %*% cumsum (I_rev %*% exp_Xb_agg); + +H0 = cumsum (removeEmpty (target = d_r / exp_Xb_cum, margin = "rows")); +P1 = cumsum (removeEmpty (target = d_r / exp_Xb_cum ^ 2, margin = "rows")); +X_exp_Xb = X * exp (X %*% b); + +I_rev_all = table (seq (1, N, 1), seq (N, 1, -1)); +X_exp_Xb_rev_agg = cumsum (I_rev_all %*% X_exp_Xb); +X_exp_Xb_rev_agg = removeEmpty (target = X_exp_Xb_rev_agg * select, margin = "rows"); +X_exp_Xb_cum = I_rev %*% X_exp_Xb_rev_agg; +P2 = cumsum (removeEmpty (target = (X_exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows")); + +exp_Yb = exp (Y %*% b); +exp_Yb_2 = exp_Yb ^ 2; +Y_exp_Yb = Y * exp (Y %*% b); + +# estimates of cumulative hazard +H = exp_Yb * (Ind %*% H0); + +# term1 +term1 = exp_Yb_2 * (Ind %*% P1); + +# term2 +P3 = cumsum (removeEmpty (target = (exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows")); +P4 = (Ind 
%*% P2) * exp_Yb; +P5 = Y_exp_Yb * (Ind %*% P3); +term2 = P4 - P5; + +# standard error of the estimates of cumulative hazard +se_H = sqrt (term1 + rowSums((term2 %*% COV) * term2)); + +# prepare output matrix +P = matrix (0, rows = nrow (Y), cols = 6); +P[,1] = LP; +P[,2] = se_LP; +P[,3] = R; +P[,4] = se_R; +P[,5] = H; +P[,6] = se_H; +write (P, fileP, format=fmtO); +
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Cox.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/Cox.dml b/scripts/algorithms/Cox.dml index 3cab74d..6da22ce 100644 --- a/scripts/algorithms/Cox.dml +++ b/scripts/algorithms/Cox.dml @@ -1,502 +1,502 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL. 
-# The Breslow method is used for handling ties and the regression parameters -# are computed using trust region newton method with conjugate gradient -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- Location to read the input matrix X containing the survival data containing the following information -# - 1: timestamps -# - 2: whether an event occurred (1) or data is censored (0) -# - 3: feature vectors -# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row) -# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model -# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing -# the start and end indices of the factors in X -# - R[,1]: start indices -# - R[,2]: end indices -# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X; -# in this case the start and end indices corresponding to the baseline level need to be the same; -# if R is not provided by default all variables are considered to be continuous -# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted -# Cox model (the betas), their standard errors, confidence intervals, and P-values -# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including -# no. of records, no. 
of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare; -# by default is standard output -# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model; -# by default is standard output -# COV String --- Location to store the variance-covariance matrix of the betas -# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X -# XO String --- Location to store sorted input matrix by the timestamps -# MF String --- Location to store column indices of X excluding the baseline factors if available -# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas -# tol Double 0.000001 Tolerance ("epsilon") -# moi Int 100 Max. number of outer (Newton) iterations -# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max -# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) -# --------------------------------------------------------------------------------------------- -# OUTPUT: -# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema: -# M[,1]: betas -# M[,2]: exp(betas) -# M[,3]: standard error of betas -# M[,4]: Z -# M[,5]: P-value -# M[,6]: lower 100*(1-alpha)% confidence interval of betas -# M[,7]: upper 100*(1-alpha)% confidence interval of betas -# -# Two log files containing a summary of some statistics of the fitted model: -# 1- File S with the following format -# - line 1: no. of observations -# - line 2: no. 
of events -# - line 3: log-likelihood -# - line 4: AIC -# - line 5: Rsquare (Cox & Snell) -# - line 6: max possible Rsquare -# 2- File T with the following format -# - line 1: Likelihood ratio test statistic, degree of freedom, P-value -# - line 2: Wald test statistic, degree of freedom, P-value -# - line 3: Score (log-rank) test statistic, degree of freedom, P-value -# -# Additionally, the following matrices are stored (needed for prediction) -# 1- A column matrix RT that contains the order-preserving recoded timestamps from X -# 2- Matrix XO which is matrix X with sorted timestamps -# 3- Variance-covariance matrix of the betas COV -# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available) -# ------------------------------------------------------------------------------------------- -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R -# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT -# XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv - -fileX = $X; -fileTE = $TE; -fileRT = $RT; -fileMF = $MF; -fileM = $M; -fileXO = $XO; -fileCOV = $COV; - -# Default values of some parameters -fileF = ifdef ($F, " "); # $F=" " -fileR = ifdef ($R, " "); # $R=" " -fileS = ifdef ($S, " "); # $S=" " -fileT = ifdef ($T, " "); # $T=" " -fmtO = ifdef ($fmt, "text"); # $fmt="text" -alpha = ifdef ($alpha, 0.05); # $alpha=0.05 -tol = ifdef ($tol, 0.000001); # $tol=0.000001; -maxiter = ifdef ($moi, 100); # $moi=100; -maxinneriter = ifdef ($mii, 0); # $mii=0; - -X_orig = read (fileX); - -TE = read (fileTE); -if (fileF != " ") { - F = read (fileF); -} - -######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET - -if (fileR != " ") { # factors available - R = read (fileR); - if (ncol (R) != 2) { - stop ("Matrix R has wrong dimensions!"); - } - print ("REMOVING 
BASLINE LEVEL OF EACH FACTOR..."); - # identify baseline columns to be removed from X_orig - col_sum = colSums (X_orig); - col_seq = t (seq(1, ncol (X_orig))); - parfor (i in 1:nrow (R), check = 0) { - start_ind = as.scalar (R[i,1]); - end_ind = as.scalar (R[i,2]); - baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1; - col_seq[,baseline_ind] = 0; - } - ones = matrix (1, rows = nrow (F), cols = 1); - F_filter = table (ones, F, 1, ncol (X_orig)); - F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols"); - TE_F = t(append (t (TE), F_filter)); -} else if (fileF != " ") { # all features scale - TE_F = t(append (t (TE), t(F))); -} else { # no features available - TE_F = TE; -} - -write (TE_F, fileMF, format = fmtO); - -X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F)); - -######## RECODING TIMESTAMPS PRESERVING THE ORDER -print ("RECODING TIMESTAMPS..."); - -N = nrow (X_orig); -X_orig = order (target = X_orig, by = 1); -Idx = matrix (1, rows = N, cols = 1); -num_timestamps = 1; -if (N == 1) { - RT = matrix (1, rows = 1, cols = 1); -} else { - Idx[2:N,1] = ppred (X_orig[1:(N - 1),1], X_orig[2:N,1], "!="); - num_timestamps = sum (Idx); - A = removeEmpty (target = diag (Idx), margin = "cols"); - if (ncol (A) > 1) { - A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)]; - B = cumsum (A); - RT = B %*% seq(1, ncol(B)); - } else { # there is only one group - RT = matrix (1, rows = N, cols = 1); - } -} -E = X_orig[,2]; - -print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT"); - -######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT -# b: the regression betas -# o: loss function value -# g: loss function gradient -# H: loss function Hessian -# sb: shift of b in one iteration -# so: shift of o in one iteration -# r: CG residual = H %*% sb + g -# d: CG direction vector -# Hd: = H %*% d -# c: scalar coefficient in CG -# delta: trust region size -# tol: tolerance value -# i: 
outer (Newton) iteration count -# j: inner (CG) iteration count - -# computing initial coefficients b (all initialized to 0) -if (ncol (X_orig) > 2) { - X = X_orig[,3:ncol(X_orig)]; - D = ncol (X); - zeros_D = matrix (0, rows = D, cols = 1); - b = zeros_D; -} -d_r = aggregate (target = E, groups = RT, fn = "sum"); -e_r = aggregate (target = RT, groups = RT, fn = "count"); - -# computing initial loss function value o -num_distinct = nrow (d_r); # no. of distinct timestamps -e_r_rev_agg = cumsum (rev(e_r)); -d_r_rev = rev(d_r); -o = sum (d_r_rev * log (e_r_rev_agg)); -o_init = o; -if (ncol (X_orig) < 3) { - loglik = -o; - S_str = "no. of records " + N + " loglik " + loglik; - if (fileS != " ") { - write (S_str, fileS, format = fmtO); - } else { - print (S_str); - } - stop ("No features are selected!"); -} - -# computing initial gradient g -# part 1 g0_1 -g0_1 = - t (colSums (X * E)); # g_1 -# part 2 g0_2 -X_rev_agg = cumsum (rev(X)); -select = table (seq (1, num_distinct), e_r_rev_agg); -X_agg = select %*% X_rev_agg; -g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg)); -# -g0 = g0_1 + g0_2; -g = g0; - -# initialization for trust region Newton method -delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2))); -initial_g2 = sum (g ^ 2); -exit_g2 = initial_g2 * tol ^ 2; -maxiter = 100; -maxinneriter = min (D, 100); -i = 0; -sum_g2 = sum (g ^ 2); -while (sum_g2 > exit_g2 & i < maxiter) { - i = i + 1; - sb = zeros_D; - r = g; - r2 = sum (r ^ 2); - exit_r2 = 0.01 * r2; - d = - r; - trust_bound_reached = FALSE; - j = 0; - - exp_Xb = exp (X %*% b); - exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); - D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator - X_exp_Xb = X * exp_Xb; - X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); - X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; - - while (r2 > exit_r2 & (! 
trust_bound_reached) & j < maxinneriter) { - j = j + 1; - # computing Hessian times d (Hd) - # part 1 Hd_1 - Xd = X %*% d; - X_Xd_exp_Xb = X * (Xd * exp_Xb); - X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb)); - X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg; - - Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev; - # part 2 Hd_2 - - Xd_exp_Xb = Xd * exp_Xb; - Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb)); - Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg; - - Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator - Hd_2 = Hd_2_num / (D_r_rev ^ 2); - - Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev)); - - c = r2 / sum (d * Hd); - [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2); - sb = sb + c * d; - r = r + c * Hd; - r2_new = sum (r ^ 2); - d = - r + (r2_new / r2) * d; - r2 = r2_new; - } - - # computing loss change in 1 iteration (so) - # part 1 so_1 - so_1 = - as.scalar (colSums (X * E) %*% (b + sb)); - # part 2 so_2 - exp_Xbsb = exp (X %*% (b + sb)); - exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum"); - so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg)))); - # - so = so_1 + so_2; - so = so - o; - - delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g))); - if (so < 0) { - b = b + sb; - o = o + so; - # compute new gradient g - exp_Xb = exp (X %*% b); - exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); - X_exp_Xb = X * exp_Xb; - X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); - X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; - - D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator - g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev)); - g = g0_1 + g_2; - sum_g2 = sum (g ^ 2); - } -} - -if (sum_g2 > exit_g2 & i >= maxiter) { - print ("Trust region Newton method did not converge!"); -} - - -print ("COMPUTING HESSIAN..."); - -H0 = matrix (0, rows = D, cols = D); -H = matrix (0, rows = D, cols = D); - -X_exp_Xb_rev_2 = rev(X_exp_Xb); -X_rev_2 
= rev(X); - -X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); -X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; - -parfor (i in 1:D, check = 0) { - Xi = X[,i]; - Xi_rev = rev(Xi); - - ## ----------Start calculating H0-------------- - # part 1 H0_1 - Xi_X = X_rev_2[,i:D] * Xi_rev; - Xi_X_rev_agg = cumsum (Xi_X); - Xi_X_rev_agg = select %*% Xi_X_rev_agg; - H0_1 = Xi_X_rev_agg / e_r_rev_agg; - - # part 2 H0_2 - Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum"); - Xi_agg_rev_agg = cumsum (rev(Xi_agg)); - H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator - H0_2 = H0_2_num / (e_r_rev_agg ^ 2); - - H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev); - #-----------End calculating H0-------------------- - - ## ----------Start calculating H-------------- - # part 1 H_1 - Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev; - Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb); - Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg; - H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev; - - # part 2 H_2 - Xi_exp_Xb = exp_Xb * Xi; - Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum"); - - Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg)); - H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator - H_2 = H_2_num / (D_r_rev ^ 2); - H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev); - #-----------End calculating H-------------------- -} -H = H + t(H) - diag( diag (H)); -H0 = H0 + t(H0) - diag( diag (H0)); - - -# compute standard error for betas -H_inv = inv (H); -se_b = sqrt (diag (H_inv)); - -# compute exp(b), Z, Pr[>|Z|] -exp_b = exp (b); -Z = b / se_b; -P = matrix (0, rows = D, cols = 1); -parfor (i in 1:D) { - P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal")); -} - -# compute confidence intervals for b -z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal"); -CI_l = b - se_b * z_alpha_2; -CI_r = b - se_b + z_alpha_2; - -######## SOME STATISTICS AND TESTS -# no. of records -S_str = "no. of records " + N; - -# no.of events -S_str = append (S_str, "no. 
of events " + sum (E)); - -# log-likelihood -loglik = -o; -S_str = append (S_str, "loglik " + loglik + " "); - -# AIC = -2 * loglik + 2 * D -AIC = -2 * loglik + 2 * D; -S_str = append (S_str, "AIC " + AIC + " "); - -# Wald test -wald_t = as.scalar (t(b) %*% H %*% b); -wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D); -T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " "; - -# Likelihood ratio test -lratio_t = 2 * o_init - 2 * o; -lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D); -T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " "); - - -H0_inv = inv (H0); -score_t = as.scalar (t (g0) %*% H0_inv %*% g0); -score_p = 1 - cdf (target = score_t, dist = "chisq", df = D); -T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " "); - -# Rsquare (Cox & Snell) -Rsquare = 1 - exp (-lratio_t / N); -Rsquare_max = 1 - exp (-2 * o_init / N); -S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " "); -S_str = append (S_str, "max possible Rsquare: " + Rsquare_max); - -M = matrix (0, rows = D, cols = 7); -M[,1] = b; -M[,2] = exp_b; -M[,3] = se_b; -M[,4] = Z; -M[,5] = P; -M[,6] = CI_l; -M[,7] = CI_r; - -write (M, fileM, format = fmtO); -if (fileS != " ") { - write (S_str, fileS, format = fmtO); -} else { - print (S_str); -} -if (fileT != " ") { - write (T_str, fileT, format = fmtO); -} else { - print (T_str); -} -# needed for prediction -write (RT, fileRT, format = fmtO); -write (H_inv, fileCOV, format = fmtO); -write (X_orig, fileXO, format = fmtO); - - -####### UDFS FOR TRUST REGION NEWTON METHOD - -ensure_trust_bound = - function (double x, double a, double b, double c) - return (double x_new, boolean is_violated) -{ - if (a * x^2 + b * x + c > 0) - { - is_violated = TRUE; - rad = sqrt (b ^ 2 - 4 * a * c); - if (b >= 0) { - x_new = - (2 * c) / (b + rad); - } else { - x_new = - (b - rad) / (2 * a); - } - } else { - is_violated = 
FALSE; - x_new = x; - } -} - -update_trust_bound = - function (double delta, - double sb_distance, - double so_exact, - double so_linear_approx, - double so_quadratic_approx) - return (double delta) -{ - sigma1 = 0.25; - sigma2 = 0.5; - sigma3 = 4.0; - - if (so_exact <= so_linear_approx) { - alpha = sigma3; - } else { - alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx)); - } - - rho = so_exact / so_quadratic_approx; - if (rho < 0.0001) { - delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta); - } else { if (rho < 0.25) { - delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta)); - } else { if (rho < 0.75) { - delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta)); - } else { - delta = max (delta, min (alpha * sb_distance, sigma3 * delta)); - }}} -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL. 
+# The Breslow method is used for handling ties and the regression parameters +# are computed using trust region newton method with conjugate gradient +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read the input matrix X containing the survival data containing the following information +# - 1: timestamps +# - 2: whether an event occurred (1) or data is censored (0) +# - 3: feature vectors +# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row) +# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model +# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing +# the start and end indices of the factors in X +# - R[,1]: start indices +# - R[,2]: end indices +# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X; +# in this case the start and end indices corresponding to the baseline level need to be the same; +# if R is not provided by default all variables are considered to be continuous +# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted +# Cox model (the betas), their standard errors, confidence intervals, and P-values +# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including +# no. of records, no. 
of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare; +# by default is standard output +# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model; +# by default is standard output +# COV String --- Location to store the variance-covariance matrix of the betas +# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X +# XO String --- Location to store sorted input matrix by the timestamps +# MF String --- Location to store column indices of X excluding the baseline factors if available +# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas +# tol Double 0.000001 Tolerance ("epsilon") +# moi Int 100 Max. number of outer (Newton) iterations +# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max +# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema: +# M[,1]: betas +# M[,2]: exp(betas) +# M[,3]: standard error of betas +# M[,4]: Z +# M[,5]: P-value +# M[,6]: lower 100*(1-alpha)% confidence interval of betas +# M[,7]: upper 100*(1-alpha)% confidence interval of betas +# +# Two log files containing a summary of some statistics of the fitted model: +# 1- File S with the following format +# - line 1: no. of observations +# - line 2: no. 
of events +# - line 3: log-likelihood +# - line 4: AIC +# - line 5: Rsquare (Cox & Snell) +# - line 6: max possible Rsquare +# 2- File T with the following format +# - line 1: Likelihood ratio test statistic, degree of freedom, P-value +# - line 2: Wald test statistic, degree of freedom, P-value +# - line 3: Score (log-rank) test statistic, degree of freedom, P-value +# +# Additionally, the following matrices are stored (needed for prediction) +# 1- A column matrix RT that contains the order-preserving recoded timestamps from X +# 2- Matrix XO which is matrix X with sorted timestamps +# 3- Variance-covariance matrix of the betas COV +# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available) +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INPUT_DIR/F R=INPUT_DIR/R +# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT +# XO=OUTPUT_DIR/XO MF=OUTPUT_DIR/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv + +fileX = $X; +fileTE = $TE; +fileRT = $RT; +fileMF = $MF; +fileM = $M; +fileXO = $XO; +fileCOV = $COV; + +# Default values of some parameters +fileF = ifdef ($F, " "); # $F=" " +fileR = ifdef ($R, " "); # $R=" " +fileS = ifdef ($S, " "); # $S=" " +fileT = ifdef ($T, " "); # $T=" " +fmtO = ifdef ($fmt, "text"); # $fmt="text" +alpha = ifdef ($alpha, 0.05); # $alpha=0.05 +tol = ifdef ($tol, 0.000001); # $tol=0.000001; +maxiter = ifdef ($moi, 100); # $moi=100; +maxinneriter = ifdef ($mii, 0); # $mii=0; + +X_orig = read (fileX); + +TE = read (fileTE); +if (fileF != " ") { + F = read (fileF); +} + +######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET + +if (fileR != " ") { # factors available + R = read (fileR); + if (ncol (R) != 2) { + stop ("Matrix R has wrong dimensions!"); + } + print ("REMOVING 
BASLINE LEVEL OF EACH FACTOR..."); + # identify baseline columns to be removed from X_orig + col_sum = colSums (X_orig); + col_seq = t (seq(1, ncol (X_orig))); + parfor (i in 1:nrow (R), check = 0) { + start_ind = as.scalar (R[i,1]); + end_ind = as.scalar (R[i,2]); + baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1; + col_seq[,baseline_ind] = 0; + } + ones = matrix (1, rows = nrow (F), cols = 1); + F_filter = table (ones, F, 1, ncol (X_orig)); + F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols"); + TE_F = t(append (t (TE), F_filter)); +} else if (fileF != " ") { # all features scale + TE_F = t(append (t (TE), t(F))); +} else { # no features available + TE_F = TE; +} + +write (TE_F, fileMF, format = fmtO); + +X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F)); + +######## RECODING TIMESTAMPS PRESERVING THE ORDER +print ("RECODING TIMESTAMPS..."); + +N = nrow (X_orig); +X_orig = order (target = X_orig, by = 1); +Idx = matrix (1, rows = N, cols = 1); +num_timestamps = 1; +if (N == 1) { + RT = matrix (1, rows = 1, cols = 1); +} else { + Idx[2:N,1] = ppred (X_orig[1:(N - 1),1], X_orig[2:N,1], "!="); + num_timestamps = sum (Idx); + A = removeEmpty (target = diag (Idx), margin = "cols"); + if (ncol (A) > 1) { + A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)]; + B = cumsum (A); + RT = B %*% seq(1, ncol(B)); + } else { # there is only one group + RT = matrix (1, rows = N, cols = 1); + } +} +E = X_orig[,2]; + +print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT"); + +######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT +# b: the regression betas +# o: loss function value +# g: loss function gradient +# H: loss function Hessian +# sb: shift of b in one iteration +# so: shift of o in one iteration +# r: CG residual = H %*% sb + g +# d: CG direction vector +# Hd: = H %*% d +# c: scalar coefficient in CG +# delta: trust region size +# tol: tolerance value +# i: 
outer (Newton) iteration count +# j: inner (CG) iteration count + +# computing initial coefficients b (all initialized to 0) +if (ncol (X_orig) > 2) { + X = X_orig[,3:ncol(X_orig)]; + D = ncol (X); + zeros_D = matrix (0, rows = D, cols = 1); + b = zeros_D; +} +d_r = aggregate (target = E, groups = RT, fn = "sum"); +e_r = aggregate (target = RT, groups = RT, fn = "count"); + +# computing initial loss function value o +num_distinct = nrow (d_r); # no. of distinct timestamps +e_r_rev_agg = cumsum (rev(e_r)); +d_r_rev = rev(d_r); +o = sum (d_r_rev * log (e_r_rev_agg)); +o_init = o; +if (ncol (X_orig) < 3) { + loglik = -o; + S_str = "no. of records " + N + " loglik " + loglik; + if (fileS != " ") { + write (S_str, fileS, format = fmtO); + } else { + print (S_str); + } + stop ("No features are selected!"); +} + +# computing initial gradient g +# part 1 g0_1 +g0_1 = - t (colSums (X * E)); # g_1 +# part 2 g0_2 +X_rev_agg = cumsum (rev(X)); +select = table (seq (1, num_distinct), e_r_rev_agg); +X_agg = select %*% X_rev_agg; +g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg)); +# +g0 = g0_1 + g0_2; +g = g0; + +# initialization for trust region Newton method +delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2))); +initial_g2 = sum (g ^ 2); +exit_g2 = initial_g2 * tol ^ 2; +maxiter = 100; +maxinneriter = min (D, 100); +i = 0; +sum_g2 = sum (g ^ 2); +while (sum_g2 > exit_g2 & i < maxiter) { + i = i + 1; + sb = zeros_D; + r = g; + r2 = sum (r ^ 2); + exit_r2 = 0.01 * r2; + d = - r; + trust_bound_reached = FALSE; + j = 0; + + exp_Xb = exp (X %*% b); + exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); + D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator + X_exp_Xb = X * exp_Xb; + X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); + X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; + + while (r2 > exit_r2 & (! 
trust_bound_reached) & j < maxinneriter) { + j = j + 1; + # computing Hessian times d (Hd) + # part 1 Hd_1 + Xd = X %*% d; + X_Xd_exp_Xb = X * (Xd * exp_Xb); + X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb)); + X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg; + + Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev; + # part 2 Hd_2 + + Xd_exp_Xb = Xd * exp_Xb; + Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb)); + Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg; + + Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator + Hd_2 = Hd_2_num / (D_r_rev ^ 2); + + Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev)); + + c = r2 / sum (d * Hd); + [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2); + sb = sb + c * d; + r = r + c * Hd; + r2_new = sum (r ^ 2); + d = - r + (r2_new / r2) * d; + r2 = r2_new; + } + + # computing loss change in 1 iteration (so) + # part 1 so_1 + so_1 = - as.scalar (colSums (X * E) %*% (b + sb)); + # part 2 so_2 + exp_Xbsb = exp (X %*% (b + sb)); + exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum"); + so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg)))); + # + so = so_1 + so_2; + so = so - o; + + delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g))); + if (so < 0) { + b = b + sb; + o = o + so; + # compute new gradient g + exp_Xb = exp (X %*% b); + exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); + X_exp_Xb = X * exp_Xb; + X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); + X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; + + D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator + g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev)); + g = g0_1 + g_2; + sum_g2 = sum (g ^ 2); + } +} + +if (sum_g2 > exit_g2 & i >= maxiter) { + print ("Trust region Newton method did not converge!"); +} + + +print ("COMPUTING HESSIAN..."); + +H0 = matrix (0, rows = D, cols = D); +H = matrix (0, rows = D, cols = D); + +X_exp_Xb_rev_2 = rev(X_exp_Xb); +X_rev_2 
= rev(X); + +X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); +X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; + +parfor (i in 1:D, check = 0) { + Xi = X[,i]; + Xi_rev = rev(Xi); + + ## ----------Start calculating H0-------------- + # part 1 H0_1 + Xi_X = X_rev_2[,i:D] * Xi_rev; + Xi_X_rev_agg = cumsum (Xi_X); + Xi_X_rev_agg = select %*% Xi_X_rev_agg; + H0_1 = Xi_X_rev_agg / e_r_rev_agg; + + # part 2 H0_2 + Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum"); + Xi_agg_rev_agg = cumsum (rev(Xi_agg)); + H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator + H0_2 = H0_2_num / (e_r_rev_agg ^ 2); + + H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev); + #-----------End calculating H0-------------------- + + ## ----------Start calculating H-------------- + # part 1 H_1 + Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev; + Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb); + Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg; + H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev; + + # part 2 H_2 + Xi_exp_Xb = exp_Xb * Xi; + Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum"); + + Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg)); + H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator + H_2 = H_2_num / (D_r_rev ^ 2); + H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev); + #-----------End calculating H-------------------- +} +H = H + t(H) - diag( diag (H)); +H0 = H0 + t(H0) - diag( diag (H0)); + + +# compute standard error for betas +H_inv = inv (H); +se_b = sqrt (diag (H_inv)); + +# compute exp(b), Z, Pr[>|Z|] +exp_b = exp (b); +Z = b / se_b; +P = matrix (0, rows = D, cols = 1); +parfor (i in 1:D) { + P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal")); +} + +# compute confidence intervals for b +z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal"); +CI_l = b - se_b * z_alpha_2; +CI_r = b + se_b * z_alpha_2; + +######## SOME STATISTICS AND TESTS +# no. of records +S_str = "no. of records " + N; + +# no. of events +S_str = append (S_str, "no. 
of events " + sum (E)); + +# log-likelihood +loglik = -o; +S_str = append (S_str, "loglik " + loglik + " "); + +# AIC = -2 * loglik + 2 * D +AIC = -2 * loglik + 2 * D; +S_str = append (S_str, "AIC " + AIC + " "); + +# Wald test +wald_t = as.scalar (t(b) %*% H %*% b); +wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D); +T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " "; + +# Likelihood ratio test +lratio_t = 2 * o_init - 2 * o; +lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D); +T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " "); + + +H0_inv = inv (H0); +score_t = as.scalar (t (g0) %*% H0_inv %*% g0); +score_p = 1 - cdf (target = score_t, dist = "chisq", df = D); +T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " "); + +# Rsquare (Cox & Snell) +Rsquare = 1 - exp (-lratio_t / N); +Rsquare_max = 1 - exp (-2 * o_init / N); +S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " "); +S_str = append (S_str, "max possible Rsquare: " + Rsquare_max); + +M = matrix (0, rows = D, cols = 7); +M[,1] = b; +M[,2] = exp_b; +M[,3] = se_b; +M[,4] = Z; +M[,5] = P; +M[,6] = CI_l; +M[,7] = CI_r; + +write (M, fileM, format = fmtO); +if (fileS != " ") { + write (S_str, fileS, format = fmtO); +} else { + print (S_str); +} +if (fileT != " ") { + write (T_str, fileT, format = fmtO); +} else { + print (T_str); +} +# needed for prediction +write (RT, fileRT, format = fmtO); +write (H_inv, fileCOV, format = fmtO); +write (X_orig, fileXO, format = fmtO); + + +####### UDFS FOR TRUST REGION NEWTON METHOD + +ensure_trust_bound = + function (double x, double a, double b, double c) + return (double x_new, boolean is_violated) +{ + if (a * x^2 + b * x + c > 0) + { + is_violated = TRUE; + rad = sqrt (b ^ 2 - 4 * a * c); + if (b >= 0) { + x_new = - (2 * c) / (b + rad); + } else { + x_new = - (b - rad) / (2 * a); + } + } else { + is_violated = 
FALSE; + x_new = x; + } +} + +update_trust_bound = + function (double delta, + double sb_distance, + double so_exact, + double so_linear_approx, + double so_quadratic_approx) + return (double delta) +{ + sigma1 = 0.25; + sigma2 = 0.5; + sigma3 = 4.0; + + if (so_exact <= so_linear_approx) { + alpha = sigma3; + } else { + alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx)); + } + + rho = so_exact / so_quadratic_approx; + if (rho < 0.0001) { + delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta); + } else { if (rho < 0.25) { + delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta)); + } else { if (rho < 0.75) { + delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta)); + } else { + delta = max (delta, min (alpha * sb_distance, sigma3 * delta)); + }}} +}
