http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Cox-predict.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/Cox-predict.dml b/scripts/algorithms/Cox-predict.dml index 6f2bec0..7d444fd 100644 --- a/scripts/algorithms/Cox-predict.dml +++ b/scripts/algorithms/Cox-predict.dml @@ -1,181 +1,181 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A COX PROPORTIONAL HAZARD REGRESSION MODEL TO A NEW (TEST) DATASET -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- Location to read the input matrix containing the survival data with the following schema: -# - X[,1]: timestamps -# - X[,2]: whether an event occurred (1) or data is censored (0) -# - X[,3:]: feature vectors (excluding the baseline columns) used for model fitting -# RT String --- Location to read column matrix RT containing the (order preserving) recoded timestamps from X -# M String --- Location to read matrix M containing the fitted Cox model with the following schema: -# - M[,1]: betas -# - M[,2]: exp(betas) -# - M[,3]: standard error of betas -# - M[,4]: Z -# - M[,5]: p-value -# - M[,6]: lower 100*(1-alpha)% confidence interval of betas -# - M[,7]: upper 100*(1-alpha)% confidence interval of betas -# Y String --- Location to read matrix Y used for prediction -# COV String --- Location to read the variance-covariance matrix of the betas -# MF String --- Location to read column indices of X excluding the baseline factors if available -# P String --- Location to store matrix P containing the results of prediction -# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) -# --------------------------------------------------------------------------------------------- -# OUTPUT: -# 1- A matrix P with the following schema: -# P[,1]: linear predictors relative to a baseline which contains the mean values for each feature -# i.e., (Y[3:] - colMeans (X[3:])) %*% b -# P[,2]: standard error of linear predictors -# P[,3]: risk relative to a baseline which contains the mean values for each feature -# 
i.e., exp ((Y[3:] - colMeans (X[3:])) %*% b) -# P[,4]: standard error of risk -# P[,5]: estimates of cumulative hazard -# P[,6]: standard error of the estimates of cumulative hazard -# ------------------------------------------------------------------------------------------- -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f cox-predict.dml -nvargs X=INPUT_DIR/X RT=INPUT_DIR/RT M=INPUT_DIR/M Y=INPUT_DIR/Y -# COV=INTPUT_DIR/COV MF=INPUT_DIR/MF P=OUTPUT_DIR/P fmt=csv - -fileX = $X; -fileRT = $RT; -fileMF = $MF; -fileY = $Y; -fileM = $M; -fileCOV = $COV; -fileP = $P; - -# Default values of some parameters -fmtO = ifdef ($fmt, "text"); # $fmt="text" - -X_orig = read (fileX); -RT_X = read (fileRT); -Y_orig = read (fileY); -M = read (fileM); -b = M[,1]; -COV = read (fileCOV); - -col_ind = read (fileMF); -tab = table (col_ind, seq (1, nrow (col_ind)), ncol (Y_orig), nrow (col_ind)); -Y_orig = Y_orig %*% tab; - - -# Y and X have the same dimensions and schema -if (ncol (Y_orig) != ncol (X_orig)) { - stop ("Y has a wrong number of columns!"); -} - -X = X_orig[,3:ncol (X_orig)]; -T_X = X_orig[,1]; -E_X = X_orig[,2]; -D = ncol (X); -N = nrow (X); -Y_orig = order (target = Y_orig, by = 1); -Y = Y_orig[,3:ncol (X_orig)]; -T_Y = Y_orig[,1]; - -col_means = colMeans (X); -ones = matrix (1, rows = nrow (Y), cols = 1); -Y_rel = Y - (ones %*% col_means); - -##### compute linear predictors -LP = Y_rel %*% b; -# compute standard error of linear predictors using the Delta method -se_LP = diag(sqrt (Y_rel %*% COV %*% t(Y_rel))); - -##### compute risk -R = exp (Y_rel %*% b); -# compute standard error of risk using the Delta method -se_R = diag(sqrt ((Y_rel * R) %*% COV %*% t(Y_rel * R))) / sqrt (exp (LP)); - -##### compute estimates of cumulative hazard together with their standard errors: -# 1. col contains cumulative hazard estimates -# 2. 
col contains standard errors for cumulative hazard estimates - -d_r = aggregate (target = E_X, groups = RT_X, fn = "sum"); -e_r = aggregate (target = RT_X, groups = RT_X, fn = "count"); -Idx = cumsum (e_r); -all_times = table (seq (1, nrow (Idx), 1), Idx) %*% T_X; # distinct event times - -event_times = removeEmpty (target = ppred (d_r, 0, ">") * all_times, margin = "rows"); -num_distinct_event = nrow (event_times); - -num_distinct = nrow (all_times); # no. of distinct timestamps censored or uncensored -I_rev = table (seq (1, num_distinct, 1), seq (num_distinct, 1, -1)); -e_r_rev_agg = cumsum (I_rev %*% e_r); -select = t (colSums (table (seq (1, num_distinct), e_r_rev_agg))); - -min_event_time = min (event_times); -max_event_time = max (event_times); -T_Y = T_Y + (min_event_time * ppred (T_Y, min_event_time, "<")); -T_Y = T_Y + (max_event_time * ppred (T_Y, max_event_time, ">")); - -Ind = outer (T_Y, t (event_times), ">="); -Ind = table (seq (1, nrow (T_Y)), rowIndexMax (Ind), nrow (T_Y), num_distinct_event); - -exp_Xb = exp (X %*% b); -exp_Xb_agg = aggregate (target = exp_Xb, groups = RT_X, fn = "sum"); -exp_Xb_cum = I_rev %*% cumsum (I_rev %*% exp_Xb_agg); - -H0 = cumsum (removeEmpty (target = d_r / exp_Xb_cum, margin = "rows")); -P1 = cumsum (removeEmpty (target = d_r / exp_Xb_cum ^ 2, margin = "rows")); -X_exp_Xb = X * exp (X %*% b); - -I_rev_all = table (seq (1, N, 1), seq (N, 1, -1)); -X_exp_Xb_rev_agg = cumsum (I_rev_all %*% X_exp_Xb); -X_exp_Xb_rev_agg = removeEmpty (target = X_exp_Xb_rev_agg * select, margin = "rows"); -X_exp_Xb_cum = I_rev %*% X_exp_Xb_rev_agg; -P2 = cumsum (removeEmpty (target = (X_exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows")); - -exp_Yb = exp (Y %*% b); -exp_Yb_2 = exp_Yb ^ 2; -Y_exp_Yb = Y * exp (Y %*% b); - -# estimates of cumulative hazard -H = exp_Yb * (Ind %*% H0); - -# term1 -term1 = exp_Yb_2 * (Ind %*% P1); - -# term2 -P3 = cumsum (removeEmpty (target = (exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows")); -P4 = (Ind 
%*% P2) * exp_Yb; -P5 = Y_exp_Yb * (Ind %*% P3); -term2 = P4 - P5; - -# standard error of the estimates of cumulative hazard -se_H = sqrt (term1 + rowSums((term2 %*% COV) * term2)); - -# prepare output matrix -P = matrix (0, rows = nrow (Y), cols = 6); -P[,1] = LP; -P[,2] = se_LP; -P[,3] = R; -P[,4] = se_R; -P[,5] = H; -P[,6] = se_H; -write (P, fileP, format=fmtO); - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# +# THIS SCRIPT APPLIES THE ESTIMATED PARAMETERS OF A COX PROPORTIONAL HAZARD REGRESSION MODEL TO A NEW (TEST) DATASET +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read the input matrix containing the survival data with the following schema: +# - X[,1]: timestamps +# - X[,2]: whether an event occurred (1) or data is censored (0) +# - X[,3:]: feature vectors (excluding the baseline columns) used for model fitting +# RT String --- Location to read column matrix RT containing the (order preserving) recoded timestamps from X +# M String --- Location to read matrix M containing the fitted Cox model with the following schema: +# - M[,1]: betas +# - M[,2]: exp(betas) +# - M[,3]: standard error of betas +# - M[,4]: Z +# - M[,5]: p-value +# - M[,6]: lower 100*(1-alpha)% confidence interval of betas +# - M[,7]: upper 100*(1-alpha)% confidence interval of betas +# Y String --- Location to read matrix Y used for prediction +# COV String --- Location to read the variance-covariance matrix of the betas +# MF String --- Location to read column indices of X excluding the baseline factors if available +# P String --- Location to store matrix P containing the results of prediction +# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# 1- A matrix P with the following schema: +# P[,1]: linear predictors relative to a baseline which contains the mean values for each feature +# i.e., (Y[3:] - colMeans (X[3:])) %*% b +# P[,2]: standard error of linear predictors +# P[,3]: risk relative to a baseline which contains the mean values for each feature +# 
i.e., exp ((Y[3:] - colMeans (X[3:])) %*% b) +# P[,4]: standard error of risk +# P[,5]: estimates of cumulative hazard +# P[,6]: standard error of the estimates of cumulative hazard +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f cox-predict.dml -nvargs X=INPUT_DIR/X RT=INPUT_DIR/RT M=INPUT_DIR/M Y=INPUT_DIR/Y +# COV=INTPUT_DIR/COV MF=INPUT_DIR/MF P=OUTPUT_DIR/P fmt=csv + +fileX = $X; +fileRT = $RT; +fileMF = $MF; +fileY = $Y; +fileM = $M; +fileCOV = $COV; +fileP = $P; + +# Default values of some parameters +fmtO = ifdef ($fmt, "text"); # $fmt="text" + +X_orig = read (fileX); +RT_X = read (fileRT); +Y_orig = read (fileY); +M = read (fileM); +b = M[,1]; +COV = read (fileCOV); + +col_ind = read (fileMF); +tab = table (col_ind, seq (1, nrow (col_ind)), ncol (Y_orig), nrow (col_ind)); +Y_orig = Y_orig %*% tab; + + +# Y and X have the same dimensions and schema +if (ncol (Y_orig) != ncol (X_orig)) { + stop ("Y has a wrong number of columns!"); +} + +X = X_orig[,3:ncol (X_orig)]; +T_X = X_orig[,1]; +E_X = X_orig[,2]; +D = ncol (X); +N = nrow (X); +Y_orig = order (target = Y_orig, by = 1); +Y = Y_orig[,3:ncol (X_orig)]; +T_Y = Y_orig[,1]; + +col_means = colMeans (X); +ones = matrix (1, rows = nrow (Y), cols = 1); +Y_rel = Y - (ones %*% col_means); + +##### compute linear predictors +LP = Y_rel %*% b; +# compute standard error of linear predictors using the Delta method +se_LP = diag(sqrt (Y_rel %*% COV %*% t(Y_rel))); + +##### compute risk +R = exp (Y_rel %*% b); +# compute standard error of risk using the Delta method +se_R = diag(sqrt ((Y_rel * R) %*% COV %*% t(Y_rel * R))) / sqrt (exp (LP)); + +##### compute estimates of cumulative hazard together with their standard errors: +# 1. col contains cumulative hazard estimates +# 2. 
col contains standard errors for cumulative hazard estimates + +d_r = aggregate (target = E_X, groups = RT_X, fn = "sum"); +e_r = aggregate (target = RT_X, groups = RT_X, fn = "count"); +Idx = cumsum (e_r); +all_times = table (seq (1, nrow (Idx), 1), Idx) %*% T_X; # distinct event times + +event_times = removeEmpty (target = ppred (d_r, 0, ">") * all_times, margin = "rows"); +num_distinct_event = nrow (event_times); + +num_distinct = nrow (all_times); # no. of distinct timestamps censored or uncensored +I_rev = table (seq (1, num_distinct, 1), seq (num_distinct, 1, -1)); +e_r_rev_agg = cumsum (I_rev %*% e_r); +select = t (colSums (table (seq (1, num_distinct), e_r_rev_agg))); + +min_event_time = min (event_times); +max_event_time = max (event_times); +T_Y = T_Y + (min_event_time * ppred (T_Y, min_event_time, "<")); +T_Y = T_Y + (max_event_time * ppred (T_Y, max_event_time, ">")); + +Ind = outer (T_Y, t (event_times), ">="); +Ind = table (seq (1, nrow (T_Y)), rowIndexMax (Ind), nrow (T_Y), num_distinct_event); + +exp_Xb = exp (X %*% b); +exp_Xb_agg = aggregate (target = exp_Xb, groups = RT_X, fn = "sum"); +exp_Xb_cum = I_rev %*% cumsum (I_rev %*% exp_Xb_agg); + +H0 = cumsum (removeEmpty (target = d_r / exp_Xb_cum, margin = "rows")); +P1 = cumsum (removeEmpty (target = d_r / exp_Xb_cum ^ 2, margin = "rows")); +X_exp_Xb = X * exp (X %*% b); + +I_rev_all = table (seq (1, N, 1), seq (N, 1, -1)); +X_exp_Xb_rev_agg = cumsum (I_rev_all %*% X_exp_Xb); +X_exp_Xb_rev_agg = removeEmpty (target = X_exp_Xb_rev_agg * select, margin = "rows"); +X_exp_Xb_cum = I_rev %*% X_exp_Xb_rev_agg; +P2 = cumsum (removeEmpty (target = (X_exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows")); + +exp_Yb = exp (Y %*% b); +exp_Yb_2 = exp_Yb ^ 2; +Y_exp_Yb = Y * exp (Y %*% b); + +# estimates of cumulative hazard +H = exp_Yb * (Ind %*% H0); + +# term1 +term1 = exp_Yb_2 * (Ind %*% P1); + +# term2 +P3 = cumsum (removeEmpty (target = (exp_Xb_cum * d_r) / exp_Xb_cum ^ 2, margin = "rows")); +P4 = (Ind 
%*% P2) * exp_Yb; +P5 = Y_exp_Yb * (Ind %*% P3); +term2 = P4 - P5; + +# standard error of the estimates of cumulative hazard +se_H = sqrt (term1 + rowSums((term2 %*% COV) * term2)); + +# prepare output matrix +P = matrix (0, rows = nrow (Y), cols = 6); +P[,1] = LP; +P[,2] = se_LP; +P[,3] = R; +P[,4] = se_R; +P[,5] = H; +P[,6] = se_H; +write (P, fileP, format=fmtO); +
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/Cox.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/Cox.dml b/scripts/algorithms/Cox.dml index 3cab74d..6da22ce 100644 --- a/scripts/algorithms/Cox.dml +++ b/scripts/algorithms/Cox.dml @@ -1,502 +1,502 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL. 
-# The Breslow method is used for handling ties and the regression parameters -# are computed using trust region newton method with conjugate gradient -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- Location to read the input matrix X containing the survival data containing the following information -# - 1: timestamps -# - 2: whether an event occurred (1) or data is censored (0) -# - 3: feature vectors -# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row) -# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model -# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing -# the start and end indices of the factors in X -# - R[,1]: start indices -# - R[,2]: end indices -# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X; -# in this case the start and end indices corresponding to the baseline level need to be the same; -# if R is not provided by default all variables are considered to be continuous -# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted -# Cox model (the betas), their standard errors, confidence intervals, and P-values -# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including -# no. of records, no. 
of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare; -# by default is standard output -# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model; -# by default is standard output -# COV String --- Location to store the variance-covariance matrix of the betas -# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X -# XO String --- Location to store sorted input matrix by the timestamps -# MF String --- Location to store column indices of X excluding the baseline factors if available -# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas -# tol Double 0.000001 Tolerance ("epsilon") -# moi Int 100 Max. number of outer (Newton) iterations -# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max -# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) -# --------------------------------------------------------------------------------------------- -# OUTPUT: -# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema: -# M[,1]: betas -# M[,2]: exp(betas) -# M[,3]: standard error of betas -# M[,4]: Z -# M[,5]: P-value -# M[,6]: lower 100*(1-alpha)% confidence interval of betas -# M[,7]: upper 100*(1-alpha)% confidence interval of betas -# -# Two log files containing a summary of some statistics of the fitted model: -# 1- File S with the following format -# - line 1: no. of observations -# - line 2: no. 
of events -# - line 3: log-likelihood -# - line 4: AIC -# - line 5: Rsquare (Cox & Snell) -# - line 6: max possible Rsquare -# 2- File T with the following format -# - line 1: Likelihood ratio test statistic, degree of freedom, P-value -# - line 2: Wald test statistic, degree of freedom, P-value -# - line 3: Score (log-rank) test statistic, degree of freedom, P-value -# -# Additionally, the following matrices are stored (needed for prediction) -# 1- A column matrix RT that contains the order-preserving recoded timestamps from X -# 2- Matrix XO which is matrix X with sorted timestamps -# 3- Variance-covariance matrix of the betas COV -# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available) -# ------------------------------------------------------------------------------------------- -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INTPUT_DIR/F R=INTPUT_DIR/R -# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT -# XO=OUTPUT_DIR/XO MF=OUTPUT/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv - -fileX = $X; -fileTE = $TE; -fileRT = $RT; -fileMF = $MF; -fileM = $M; -fileXO = $XO; -fileCOV = $COV; - -# Default values of some parameters -fileF = ifdef ($F, " "); # $F=" " -fileR = ifdef ($R, " "); # $R=" " -fileS = ifdef ($S, " "); # $S=" " -fileT = ifdef ($T, " "); # $T=" " -fmtO = ifdef ($fmt, "text"); # $fmt="text" -alpha = ifdef ($alpha, 0.05); # $alpha=0.05 -tol = ifdef ($tol, 0.000001); # $tol=0.000001; -maxiter = ifdef ($moi, 100); # $moi=100; -maxinneriter = ifdef ($mii, 0); # $mii=0; - -X_orig = read (fileX); - -TE = read (fileTE); -if (fileF != " ") { - F = read (fileF); -} - -######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET - -if (fileR != " ") { # factors available - R = read (fileR); - if (ncol (R) != 2) { - stop ("Matrix R has wrong dimensions!"); - } - print ("REMOVING 
BASLINE LEVEL OF EACH FACTOR..."); - # identify baseline columns to be removed from X_orig - col_sum = colSums (X_orig); - col_seq = t (seq(1, ncol (X_orig))); - parfor (i in 1:nrow (R), check = 0) { - start_ind = as.scalar (R[i,1]); - end_ind = as.scalar (R[i,2]); - baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1; - col_seq[,baseline_ind] = 0; - } - ones = matrix (1, rows = nrow (F), cols = 1); - F_filter = table (ones, F, 1, ncol (X_orig)); - F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols"); - TE_F = t(append (t (TE), F_filter)); -} else if (fileF != " ") { # all features scale - TE_F = t(append (t (TE), t(F))); -} else { # no features available - TE_F = TE; -} - -write (TE_F, fileMF, format = fmtO); - -X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F)); - -######## RECODING TIMESTAMPS PRESERVING THE ORDER -print ("RECODING TIMESTAMPS..."); - -N = nrow (X_orig); -X_orig = order (target = X_orig, by = 1); -Idx = matrix (1, rows = N, cols = 1); -num_timestamps = 1; -if (N == 1) { - RT = matrix (1, rows = 1, cols = 1); -} else { - Idx[2:N,1] = ppred (X_orig[1:(N - 1),1], X_orig[2:N,1], "!="); - num_timestamps = sum (Idx); - A = removeEmpty (target = diag (Idx), margin = "cols"); - if (ncol (A) > 1) { - A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)]; - B = cumsum (A); - RT = B %*% seq(1, ncol(B)); - } else { # there is only one group - RT = matrix (1, rows = N, cols = 1); - } -} -E = X_orig[,2]; - -print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT"); - -######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT -# b: the regression betas -# o: loss function value -# g: loss function gradient -# H: loss function Hessian -# sb: shift of b in one iteration -# so: shift of o in one iteration -# r: CG residual = H %*% sb + g -# d: CG direction vector -# Hd: = H %*% d -# c: scalar coefficient in CG -# delta: trust region size -# tol: tolerance value -# i: 
outer (Newton) iteration count -# j: inner (CG) iteration count - -# computing initial coefficients b (all initialized to 0) -if (ncol (X_orig) > 2) { - X = X_orig[,3:ncol(X_orig)]; - D = ncol (X); - zeros_D = matrix (0, rows = D, cols = 1); - b = zeros_D; -} -d_r = aggregate (target = E, groups = RT, fn = "sum"); -e_r = aggregate (target = RT, groups = RT, fn = "count"); - -# computing initial loss function value o -num_distinct = nrow (d_r); # no. of distinct timestamps -e_r_rev_agg = cumsum (rev(e_r)); -d_r_rev = rev(d_r); -o = sum (d_r_rev * log (e_r_rev_agg)); -o_init = o; -if (ncol (X_orig) < 3) { - loglik = -o; - S_str = "no. of records " + N + " loglik " + loglik; - if (fileS != " ") { - write (S_str, fileS, format = fmtO); - } else { - print (S_str); - } - stop ("No features are selected!"); -} - -# computing initial gradient g -# part 1 g0_1 -g0_1 = - t (colSums (X * E)); # g_1 -# part 2 g0_2 -X_rev_agg = cumsum (rev(X)); -select = table (seq (1, num_distinct), e_r_rev_agg); -X_agg = select %*% X_rev_agg; -g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg)); -# -g0 = g0_1 + g0_2; -g = g0; - -# initialization for trust region Newton method -delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2))); -initial_g2 = sum (g ^ 2); -exit_g2 = initial_g2 * tol ^ 2; -maxiter = 100; -maxinneriter = min (D, 100); -i = 0; -sum_g2 = sum (g ^ 2); -while (sum_g2 > exit_g2 & i < maxiter) { - i = i + 1; - sb = zeros_D; - r = g; - r2 = sum (r ^ 2); - exit_r2 = 0.01 * r2; - d = - r; - trust_bound_reached = FALSE; - j = 0; - - exp_Xb = exp (X %*% b); - exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); - D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator - X_exp_Xb = X * exp_Xb; - X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); - X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; - - while (r2 > exit_r2 & (! 
trust_bound_reached) & j < maxinneriter) { - j = j + 1; - # computing Hessian times d (Hd) - # part 1 Hd_1 - Xd = X %*% d; - X_Xd_exp_Xb = X * (Xd * exp_Xb); - X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb)); - X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg; - - Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev; - # part 2 Hd_2 - - Xd_exp_Xb = Xd * exp_Xb; - Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb)); - Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg; - - Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator - Hd_2 = Hd_2_num / (D_r_rev ^ 2); - - Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev)); - - c = r2 / sum (d * Hd); - [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2); - sb = sb + c * d; - r = r + c * Hd; - r2_new = sum (r ^ 2); - d = - r + (r2_new / r2) * d; - r2 = r2_new; - } - - # computing loss change in 1 iteration (so) - # part 1 so_1 - so_1 = - as.scalar (colSums (X * E) %*% (b + sb)); - # part 2 so_2 - exp_Xbsb = exp (X %*% (b + sb)); - exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum"); - so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg)))); - # - so = so_1 + so_2; - so = so - o; - - delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g))); - if (so < 0) { - b = b + sb; - o = o + so; - # compute new gradient g - exp_Xb = exp (X %*% b); - exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); - X_exp_Xb = X * exp_Xb; - X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); - X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; - - D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator - g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev)); - g = g0_1 + g_2; - sum_g2 = sum (g ^ 2); - } -} - -if (sum_g2 > exit_g2 & i >= maxiter) { - print ("Trust region Newton method did not converge!"); -} - - -print ("COMPUTING HESSIAN..."); - -H0 = matrix (0, rows = D, cols = D); -H = matrix (0, rows = D, cols = D); - -X_exp_Xb_rev_2 = rev(X_exp_Xb); -X_rev_2 
= rev(X); - -X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); -X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; - -parfor (i in 1:D, check = 0) { - Xi = X[,i]; - Xi_rev = rev(Xi); - - ## ----------Start calculating H0-------------- - # part 1 H0_1 - Xi_X = X_rev_2[,i:D] * Xi_rev; - Xi_X_rev_agg = cumsum (Xi_X); - Xi_X_rev_agg = select %*% Xi_X_rev_agg; - H0_1 = Xi_X_rev_agg / e_r_rev_agg; - - # part 2 H0_2 - Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum"); - Xi_agg_rev_agg = cumsum (rev(Xi_agg)); - H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator - H0_2 = H0_2_num / (e_r_rev_agg ^ 2); - - H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev); - #-----------End calculating H0-------------------- - - ## ----------Start calculating H-------------- - # part 1 H_1 - Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev; - Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb); - Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg; - H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev; - - # part 2 H_2 - Xi_exp_Xb = exp_Xb * Xi; - Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum"); - - Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg)); - H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator - H_2 = H_2_num / (D_r_rev ^ 2); - H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev); - #-----------End calculating H-------------------- -} -H = H + t(H) - diag( diag (H)); -H0 = H0 + t(H0) - diag( diag (H0)); - - -# compute standard error for betas -H_inv = inv (H); -se_b = sqrt (diag (H_inv)); - -# compute exp(b), Z, Pr[>|Z|] -exp_b = exp (b); -Z = b / se_b; -P = matrix (0, rows = D, cols = 1); -parfor (i in 1:D) { - P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal")); -} - -# compute confidence intervals for b -z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal"); -CI_l = b - se_b * z_alpha_2; -CI_r = b - se_b + z_alpha_2; - -######## SOME STATISTICS AND TESTS -# no. of records -S_str = "no. of records " + N; - -# no.of events -S_str = append (S_str, "no. 
of events " + sum (E)); - -# log-likelihood -loglik = -o; -S_str = append (S_str, "loglik " + loglik + " "); - -# AIC = -2 * loglik + 2 * D -AIC = -2 * loglik + 2 * D; -S_str = append (S_str, "AIC " + AIC + " "); - -# Wald test -wald_t = as.scalar (t(b) %*% H %*% b); -wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D); -T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " "; - -# Likelihood ratio test -lratio_t = 2 * o_init - 2 * o; -lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D); -T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " "); - - -H0_inv = inv (H0); -score_t = as.scalar (t (g0) %*% H0_inv %*% g0); -score_p = 1 - cdf (target = score_t, dist = "chisq", df = D); -T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " "); - -# Rsquare (Cox & Snell) -Rsquare = 1 - exp (-lratio_t / N); -Rsquare_max = 1 - exp (-2 * o_init / N); -S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " "); -S_str = append (S_str, "max possible Rsquare: " + Rsquare_max); - -M = matrix (0, rows = D, cols = 7); -M[,1] = b; -M[,2] = exp_b; -M[,3] = se_b; -M[,4] = Z; -M[,5] = P; -M[,6] = CI_l; -M[,7] = CI_r; - -write (M, fileM, format = fmtO); -if (fileS != " ") { - write (S_str, fileS, format = fmtO); -} else { - print (S_str); -} -if (fileT != " ") { - write (T_str, fileT, format = fmtO); -} else { - print (T_str); -} -# needed for prediction -write (RT, fileRT, format = fmtO); -write (H_inv, fileCOV, format = fmtO); -write (X_orig, fileXO, format = fmtO); - - -####### UDFS FOR TRUST REGION NEWTON METHOD - -ensure_trust_bound = - function (double x, double a, double b, double c) - return (double x_new, boolean is_violated) -{ - if (a * x^2 + b * x + c > 0) - { - is_violated = TRUE; - rad = sqrt (b ^ 2 - 4 * a * c); - if (b >= 0) { - x_new = - (2 * c) / (b + rad); - } else { - x_new = - (b - rad) / (2 * a); - } - } else { - is_violated = 
FALSE; - x_new = x; - } -} - -update_trust_bound = - function (double delta, - double sb_distance, - double so_exact, - double so_linear_approx, - double so_quadratic_approx) - return (double delta) -{ - sigma1 = 0.25; - sigma2 = 0.5; - sigma3 = 4.0; - - if (so_exact <= so_linear_approx) { - alpha = sigma3; - } else { - alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx)); - } - - rho = so_exact / so_quadratic_approx; - if (rho < 0.0001) { - delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta); - } else { if (rho < 0.25) { - delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta)); - } else { if (rho < 0.75) { - delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta)); - } else { - delta = max (delta, min (alpha * sb_distance, sigma3 * delta)); - }}} -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT FITS A COX PROPORTIONAL HAZARD REGRESSION MODEL. 
+# The Breslow method is used for handling ties and the regression parameters +# are computed using trust region newton method with conjugate gradient +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read the input matrix X containing the survival data containing the following information +# - 1: timestamps +# - 2: whether an event occurred (1) or data is censored (0) +# - 3: feature vectors +# TE String --- Column indices of X as a column vector which contain timestamp (first row) and event information (second row) +# F String " " Column indices of X as a column vector which are to be used for fitting the Cox model +# R String " " If factors (categorical variables) are available in the input matrix X, location to read matrix R containing +# the start and end indices of the factors in X +# - R[,1]: start indices +# - R[,2]: end indices +# Alternatively, user can specify the indices of the baseline level of each factor which needs to be removed from X; +# in this case the start and end indices corresponding to the baseline level need to be the same; +# if R is not provided by default all variables are considered to be continuous +# M String --- Location to store the results of Cox regression analysis including estimated regression parameters of the fitted +# Cox model (the betas), their standard errors, confidence intervals, and P-values +# S String " " Location to store a summary of some statistics of the fitted cox proportional hazard model including +# no. of records, no. 
of events, log-likelihood, AIC, Rsquare (Cox & Snell), and max possible Rsquare; +# by default is standard output +# T String " " Location to store the results of Likelihood ratio test, Wald test, and Score (log-rank) test of the fitted model; +# by default is standard output +# COV String --- Location to store the variance-covariance matrix of the betas +# RT String --- Location to store matrix RT containing the order-preserving recoded timestamps from X +# XO String --- Location to store sorted input matrix by the timestamps +# MF String --- Location to store column indices of X excluding the baseline factors if available +# alpha Double 0.05 Parameter to compute a 100*(1-alpha)% confidence interval for the betas +# tol Double 0.000001 Tolerance ("epsilon") +# moi Int 100 Max. number of outer (Newton) iterations +# mii Int 0 Max. number of inner (conjugate gradient) iterations, 0 = no max +# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# 1- A D x 7 matrix M, where D denotes the number of covariates, with the following schema: +# M[,1]: betas +# M[,2]: exp(betas) +# M[,3]: standard error of betas +# M[,4]: Z +# M[,5]: P-value +# M[,6]: lower 100*(1-alpha)% confidence interval of betas +# M[,7]: upper 100*(1-alpha)% confidence interval of betas +# +# Two log files containing a summary of some statistics of the fitted model: +# 1- File S with the following format +# - line 1: no. of observations +# - line 2: no. 
of events +# - line 3: log-likelihood +# - line 4: AIC +# - line 5: Rsquare (Cox & Snell) +# - line 6: max possible Rsquare +# 2- File T with the following format +# - line 1: Likelihood ratio test statistic, degree of freedom, P-value +# - line 2: Wald test statistic, degree of freedom, P-value +# - line 3: Score (log-rank) test statistic, degree of freedom, P-value +# +# Additionally, the following matrices are stored (needed for prediction) +# 1- A column matrix RT that contains the order-preserving recoded timestamps from X +# 2- Matrix XO which is matrix X with sorted timestamps +# 3- Variance-covariance matrix of the betas COV +# 4- A column matrix MF that contains the column indices of X with the baseline factors removed (if available) +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f Cox.dml -nvargs X=INPUT_DIR/X TE=INPUT_DIR/TE F=INPUT_DIR/F R=INPUT_DIR/R +# M=OUTPUT_DIR/M S=OUTPUT_DIR/S T=OUTPUT_DIR/T COV=OUTPUT_DIR/COV RT=OUTPUT_DIR/RT +# XO=OUTPUT_DIR/XO MF=OUTPUT_DIR/MF alpha=0.05 tol=0.000001 moi=100 mii=20 fmt=csv + +fileX = $X; +fileTE = $TE; +fileRT = $RT; +fileMF = $MF; +fileM = $M; +fileXO = $XO; +fileCOV = $COV; + +# Default values of some parameters +fileF = ifdef ($F, " "); # $F=" " +fileR = ifdef ($R, " "); # $R=" " +fileS = ifdef ($S, " "); # $S=" " +fileT = ifdef ($T, " "); # $T=" " +fmtO = ifdef ($fmt, "text"); # $fmt="text" +alpha = ifdef ($alpha, 0.05); # $alpha=0.05 +tol = ifdef ($tol, 0.000001); # $tol=0.000001; +maxiter = ifdef ($moi, 100); # $moi=100; +maxinneriter = ifdef ($mii, 0); # $mii=0; + +X_orig = read (fileX); + +TE = read (fileTE); +if (fileF != " ") { + F = read (fileF); +} + +######## CHECK FOR FACTORS AND REMOVE THE BASELINE OF EACH FACTOR FROM THE DATASET + +if (fileR != " ") { # factors available + R = read (fileR); + if (ncol (R) != 2) { + stop ("Matrix R has wrong dimensions!"); + } + print ("REMOVING 
BASLINE LEVEL OF EACH FACTOR..."); + # identify baseline columns to be removed from X_orig + col_sum = colSums (X_orig); + col_seq = t (seq(1, ncol (X_orig))); + parfor (i in 1:nrow (R), check = 0) { + start_ind = as.scalar (R[i,1]); + end_ind = as.scalar (R[i,2]); + baseline_ind = as.scalar (rowIndexMax (col_sum[1, start_ind:end_ind])) + start_ind - 1; + col_seq[,baseline_ind] = 0; + } + ones = matrix (1, rows = nrow (F), cols = 1); + F_filter = table (ones, F, 1, ncol (X_orig)); + F_filter = removeEmpty (target = F_filter * col_seq, margin = "cols"); + TE_F = t(append (t (TE), F_filter)); +} else if (fileF != " ") { # all features scale + TE_F = t(append (t (TE), t(F))); +} else { # no features available + TE_F = TE; +} + +write (TE_F, fileMF, format = fmtO); + +X_orig = X_orig %*% table (TE_F, seq (1, nrow (TE_F)), ncol (X_orig), nrow (TE_F)); + +######## RECODING TIMESTAMPS PRESERVING THE ORDER +print ("RECODING TIMESTAMPS..."); + +N = nrow (X_orig); +X_orig = order (target = X_orig, by = 1); +Idx = matrix (1, rows = N, cols = 1); +num_timestamps = 1; +if (N == 1) { + RT = matrix (1, rows = 1, cols = 1); +} else { + Idx[2:N,1] = ppred (X_orig[1:(N - 1),1], X_orig[2:N,1], "!="); + num_timestamps = sum (Idx); + A = removeEmpty (target = diag (Idx), margin = "cols"); + if (ncol (A) > 1) { + A[,1:(ncol (A) - 1)] = A[,1:(ncol (A) - 1)] - A[,2:ncol (A)]; + B = cumsum (A); + RT = B %*% seq(1, ncol(B)); + } else { # there is only one group + RT = matrix (1, rows = N, cols = 1); + } +} +E = X_orig[,2]; + +print ("BEGIN COX PROPORTIONAL HAZARD SCRIPT"); + +######## PARAMETERS OF THE TRUST REGION NEWTON METHOD WITH CONJUGATE GRADIENT +# b: the regression betas +# o: loss function value +# g: loss function gradient +# H: loss function Hessian +# sb: shift of b in one iteration +# so: shift of o in one iteration +# r: CG residual = H %*% sb + g +# d: CG direction vector +# Hd: = H %*% d +# c: scalar coefficient in CG +# delta: trust region size +# tol: tolerance value +# i: 
outer (Newton) iteration count +# j: inner (CG) iteration count + +# computing initial coefficients b (all initialized to 0) +if (ncol (X_orig) > 2) { + X = X_orig[,3:ncol(X_orig)]; + D = ncol (X); + zeros_D = matrix (0, rows = D, cols = 1); + b = zeros_D; +} +d_r = aggregate (target = E, groups = RT, fn = "sum"); +e_r = aggregate (target = RT, groups = RT, fn = "count"); + +# computing initial loss function value o +num_distinct = nrow (d_r); # no. of distinct timestamps +e_r_rev_agg = cumsum (rev(e_r)); +d_r_rev = rev(d_r); +o = sum (d_r_rev * log (e_r_rev_agg)); +o_init = o; +if (ncol (X_orig) < 3) { + loglik = -o; + S_str = "no. of records " + N + " loglik " + loglik; + if (fileS != " ") { + write (S_str, fileS, format = fmtO); + } else { + print (S_str); + } + stop ("No features are selected!"); +} + +# computing initial gradient g +# part 1 g0_1 +g0_1 = - t (colSums (X * E)); # g_1 +# part 2 g0_2 +X_rev_agg = cumsum (rev(X)); +select = table (seq (1, num_distinct), e_r_rev_agg); +X_agg = select %*% X_rev_agg; +g0_2 = t (colSums ((X_agg * d_r_rev)/ e_r_rev_agg)); +# +g0 = g0_1 + g0_2; +g = g0; + +# initialization for trust region Newton method +delta = 0.5 * sqrt (D) / max (sqrt (rowSums (X ^ 2))); +initial_g2 = sum (g ^ 2); +exit_g2 = initial_g2 * tol ^ 2; +maxiter = 100; +maxinneriter = min (D, 100); +i = 0; +sum_g2 = sum (g ^ 2); +while (sum_g2 > exit_g2 & i < maxiter) { + i = i + 1; + sb = zeros_D; + r = g; + r2 = sum (r ^ 2); + exit_r2 = 0.01 * r2; + d = - r; + trust_bound_reached = FALSE; + j = 0; + + exp_Xb = exp (X %*% b); + exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); + D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator + X_exp_Xb = X * exp_Xb; + X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); + X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; + + while (r2 > exit_r2 & (! 
trust_bound_reached) & j < maxinneriter) { + j = j + 1; + # computing Hessian times d (Hd) + # part 1 Hd_1 + Xd = X %*% d; + X_Xd_exp_Xb = X * (Xd * exp_Xb); + X_Xd_exp_Xb_rev_agg = cumsum (rev(X_Xd_exp_Xb)); + X_Xd_exp_Xb_rev_agg = select %*% X_Xd_exp_Xb_rev_agg; + + Hd_1 = X_Xd_exp_Xb_rev_agg / D_r_rev; + # part 2 Hd_2 + + Xd_exp_Xb = Xd * exp_Xb; + Xd_exp_Xb_rev_agg = cumsum (rev(Xd_exp_Xb)); + Xd_exp_Xb_rev_agg = select %*% Xd_exp_Xb_rev_agg; + + Hd_2_num = X_exp_Xb_rev_agg * Xd_exp_Xb_rev_agg; # numerator + Hd_2 = Hd_2_num / (D_r_rev ^ 2); + + Hd = t (colSums ((Hd_1 - Hd_2) * d_r_rev)); + + c = r2 / sum (d * Hd); + [c, trust_bound_reached] = ensure_trust_bound (c, sum(d ^ 2), 2 * sum(sb * d), sum(sb ^ 2) - delta ^ 2); + sb = sb + c * d; + r = r + c * Hd; + r2_new = sum (r ^ 2); + d = - r + (r2_new / r2) * d; + r2 = r2_new; + } + + # computing loss change in 1 iteration (so) + # part 1 so_1 + so_1 = - as.scalar (colSums (X * E) %*% (b + sb)); + # part 2 so_2 + exp_Xbsb = exp (X %*% (b + sb)); + exp_Xbsb_agg = aggregate (target = exp_Xbsb, groups = RT, fn = "sum"); + so_2 = sum (d_r_rev * log (cumsum (rev(exp_Xbsb_agg)))); + # + so = so_1 + so_2; + so = so - o; + + delta = update_trust_bound (delta, sqrt (sum (sb ^ 2)), so, sum (sb * g), 0.5 * sum (sb * (r + g))); + if (so < 0) { + b = b + sb; + o = o + so; + # compute new gradient g + exp_Xb = exp (X %*% b); + exp_Xb_agg = aggregate (target = exp_Xb, groups = RT, fn = "sum"); + X_exp_Xb = X * exp_Xb; + X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); + X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; + + D_r_rev = cumsum (rev(exp_Xb_agg)); # denominator + g_2 = t (colSums ((X_exp_Xb_rev_agg / D_r_rev) * d_r_rev)); + g = g0_1 + g_2; + sum_g2 = sum (g ^ 2); + } +} + +if (sum_g2 > exit_g2 & i >= maxiter) { + print ("Trust region Newton method did not converge!"); +} + + +print ("COMPUTING HESSIAN..."); + +H0 = matrix (0, rows = D, cols = D); +H = matrix (0, rows = D, cols = D); + +X_exp_Xb_rev_2 = rev(X_exp_Xb); +X_rev_2 
= rev(X); + +X_exp_Xb_rev_agg = cumsum (rev(X_exp_Xb)); +X_exp_Xb_rev_agg = select %*% X_exp_Xb_rev_agg; + +parfor (i in 1:D, check = 0) { + Xi = X[,i]; + Xi_rev = rev(Xi); + + ## ----------Start calculating H0-------------- + # part 1 H0_1 + Xi_X = X_rev_2[,i:D] * Xi_rev; + Xi_X_rev_agg = cumsum (Xi_X); + Xi_X_rev_agg = select %*% Xi_X_rev_agg; + H0_1 = Xi_X_rev_agg / e_r_rev_agg; + + # part 2 H0_2 + Xi_agg = aggregate (target = Xi, groups = RT, fn = "sum"); + Xi_agg_rev_agg = cumsum (rev(Xi_agg)); + H0_2_num = X_agg[,i:D] * Xi_agg_rev_agg; # numerator + H0_2 = H0_2_num / (e_r_rev_agg ^ 2); + + H0[i,i:D] = colSums ((H0_1 - H0_2) * d_r_rev); + #-----------End calculating H0-------------------- + + ## ----------Start calculating H-------------- + # part 1 H_1 + Xi_X_exp_Xb = X_exp_Xb_rev_2[,i:D] * Xi_rev; + Xi_X_exp_Xb_rev_agg = cumsum (Xi_X_exp_Xb); + Xi_X_exp_Xb_rev_agg = select %*% Xi_X_exp_Xb_rev_agg; + H_1 = Xi_X_exp_Xb_rev_agg / D_r_rev; + + # part 2 H_2 + Xi_exp_Xb = exp_Xb * Xi; + Xi_exp_Xb_agg = aggregate (target = Xi_exp_Xb, groups = RT, fn = "sum"); + + Xi_exp_Xb_agg_rev_agg = cumsum (rev(Xi_exp_Xb_agg)); + H_2_num = X_exp_Xb_rev_agg[,i:D] * Xi_exp_Xb_agg_rev_agg; # numerator + H_2 = H_2_num / (D_r_rev ^ 2); + H[i,i:D] = colSums ((H_1 - H_2) * d_r_rev); + #-----------End calculating H-------------------- +} +H = H + t(H) - diag( diag (H)); +H0 = H0 + t(H0) - diag( diag (H0)); + + +# compute standard error for betas +H_inv = inv (H); +se_b = sqrt (diag (H_inv)); + +# compute exp(b), Z, Pr[>|Z|] +exp_b = exp (b); +Z = b / se_b; +P = matrix (0, rows = D, cols = 1); +parfor (i in 1:D) { + P[i,1] = 2 - 2 * (cdf (target = abs (as.scalar (Z[i,1])), dist = "normal")); +} + +# compute confidence intervals for b +z_alpha_2 = icdf (target = 1 - alpha / 2, dist = "normal"); +CI_l = b - se_b * z_alpha_2; +CI_r = b + se_b * z_alpha_2; + +######## SOME STATISTICS AND TESTS +# no. of records +S_str = "no. of records " + N; + +# no. of events +S_str = append (S_str, "no. 
of events " + sum (E)); + +# log-likelihood +loglik = -o; +S_str = append (S_str, "loglik " + loglik + " "); + +# AIC = -2 * loglik + 2 * D +AIC = -2 * loglik + 2 * D; +S_str = append (S_str, "AIC " + AIC + " "); + +# Wald test +wald_t = as.scalar (t(b) %*% H %*% b); +wald_p = 1 - cdf (target = wald_t, dist = "chisq", df = D); +T_str = "Wald test = " + wald_t + " on " + D + " df, p = " + wald_p + " "; + +# Likelihood ratio test +lratio_t = 2 * o_init - 2 * o; +lratio_p = 1 - cdf (target = lratio_t, dist = "chisq", df = D); +T_str = append (T_str, "Likelihood ratio test = " + lratio_t + " on " + D + " df, p = " + lratio_p + " "); + + +H0_inv = inv (H0); +score_t = as.scalar (t (g0) %*% H0_inv %*% g0); +score_p = 1 - cdf (target = score_t, dist = "chisq", df = D); +T_str = append (T_str, "Score (logrank) test = " + score_t + " on " + D + " df, p = " + score_p + " "); + +# Rsquare (Cox & Snell) +Rsquare = 1 - exp (-lratio_t / N); +Rsquare_max = 1 - exp (-2 * o_init / N); +S_str = append (S_str, "Rsquare (Cox & Snell): " + Rsquare + " "); +S_str = append (S_str, "max possible Rsquare: " + Rsquare_max); + +M = matrix (0, rows = D, cols = 7); +M[,1] = b; +M[,2] = exp_b; +M[,3] = se_b; +M[,4] = Z; +M[,5] = P; +M[,6] = CI_l; +M[,7] = CI_r; + +write (M, fileM, format = fmtO); +if (fileS != " ") { + write (S_str, fileS, format = fmtO); +} else { + print (S_str); +} +if (fileT != " ") { + write (T_str, fileT, format = fmtO); +} else { + print (T_str); +} +# needed for prediction +write (RT, fileRT, format = fmtO); +write (H_inv, fileCOV, format = fmtO); +write (X_orig, fileXO, format = fmtO); + + +####### UDFS FOR TRUST REGION NEWTON METHOD + +ensure_trust_bound = + function (double x, double a, double b, double c) + return (double x_new, boolean is_violated) +{ + if (a * x^2 + b * x + c > 0) + { + is_violated = TRUE; + rad = sqrt (b ^ 2 - 4 * a * c); + if (b >= 0) { + x_new = - (2 * c) / (b + rad); + } else { + x_new = - (b - rad) / (2 * a); + } + } else { + is_violated = 
FALSE; + x_new = x; + } +} + +update_trust_bound = + function (double delta, + double sb_distance, + double so_exact, + double so_linear_approx, + double so_quadratic_approx) + return (double delta) +{ + sigma1 = 0.25; + sigma2 = 0.5; + sigma3 = 4.0; + + if (so_exact <= so_linear_approx) { + alpha = sigma3; + } else { + alpha = max (sigma1, - 0.5 * so_linear_approx / (so_exact - so_linear_approx)); + } + + rho = so_exact / so_quadratic_approx; + if (rho < 0.0001) { + delta = min (max (alpha, sigma1) * sb_distance, sigma2 * delta); + } else { if (rho < 0.25) { + delta = max (sigma1 * delta, min (alpha * sb_distance, sigma2 * delta)); + } else { if (rho < 0.75) { + delta = max (sigma1 * delta, min (alpha * sb_distance, sigma3 * delta)); + } else { + delta = max (delta, min (alpha * sb_distance, sigma3 * delta)); + }}} +}
