http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/MultiLogReg.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/MultiLogReg.dml b/scripts/algorithms/MultiLogReg.dml index 0b18b9d..716678d 100644 --- a/scripts/algorithms/MultiLogReg.dml +++ b/scripts/algorithms/MultiLogReg.dml @@ -1,365 +1,365 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# Solves Multinomial Logistic Regression using Trust Region methods. -# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650) - -# INPUT PARAMETERS: -# -------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# -------------------------------------------------------------------------------------------- -# X String --- Location to read the matrix of feature vectors -# Y String --- Location to read the matrix with category labels -# B String --- Location to store estimated regression parameters (the betas) -# Log String " " Location to write per-iteration variables for log/debugging purposes -# icpt Int 0 Intercept presence, shifting and rescaling X columns: -# 0 = no intercept, no shifting, no rescaling; -# 1 = add intercept, but neither shift nor rescale X; -# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 -# reg Double 0.0 regularization parameter (lambda = 1/C); intercept is not regularized -# tol Double 0.000001 tolerance ("epsilon") -# moi Int 100 max. number of outer (Newton) iterations -# mii Int 0 max. number of inner (conjugate gradient) iterations, 0 = no max -# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) -# -------------------------------------------------------------------------------------------- -# The largest label represents the baseline category; if label -1 or 0 is present, then it is -# the baseline label (and it is converted to the largest label). -# -# The Log file, when requested, contains the following per-iteration variables in CSV format, -# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values: -# -# NAME MEANING -# ------------------------------------------------------------------------------------------- -# LINEAR_TERM_MIN The minimum value of X %*% B, used to check for overflows -# LINEAR_TERM_MAX The maximum value of X %*% B, used to check for overflows -# NUM_CG_ITERS Number of inner (Conj.Gradient) iterations in this outer iteration -# IS_TRUST_REACHED 1 = trust region boundary was reached, 0 = otherwise -# POINT_STEP_NORM L2-norm of iteration step from old point (i.e. matrix B) to new point -# OBJECTIVE The loss function we minimize (negative regularized log-likelihood) -# OBJ_DROP_REAL Reduction in the objective during this iteration, actual value -# OBJ_DROP_PRED Reduction in the objective predicted by a quadratic approximation -# OBJ_DROP_RATIO Actual-to-predicted reduction ratio, used to update the trust region -# IS_POINT_UPDATED 1 = new point accepted; 0 = new point rejected, old point restored -# GRADIENT_NORM L2-norm of the loss function gradient (omitted if point is rejected) -# TRUST_DELTA Updated trust region size, the "delta" -# ------------------------------------------------------------------------------------------- -# -# Script invocation example: -# hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20 -# X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log - -fileX = $X; -fileY = $Y; -fileB = $B; -fileLog = ifdef ($Log, " "); -fmtB = ifdef ($fmt, "text"); - -intercept_status = ifdef ($icpt, 0); # $icpt = 0; -regularization = ifdef ($reg, 0.0); # $reg = 0.0; -tol = ifdef ($tol, 0.000001); # $tol = 0.000001; -maxiter = ifdef ($moi, 100); # $moi = 100; -maxinneriter = ifdef ($mii, 0); # $mii = 0; - -tol = as.double (tol); - -print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT"); -print ("Reading X..."); -X = read (fileX); -print ("Reading Y..."); -Y_vec = read (fileY); - -eta0 = 0.0001; -eta1 = 0.25; -eta2 = 0.75; -sigma1 = 0.25; -sigma2 = 0.5; -sigma3 = 4.0; -psi = 0.1; - -N = nrow (X); -D = ncol (X); - -# Introduce the intercept, shift and rescale the columns of X if needed -if (intercept_status == 1 | intercept_status == 2) # add the intercept column -{ - X = append (X, matrix (1, rows = N, cols = 1)); - D = ncol (X); -} - -scale_lambda = matrix (1, rows = D, cols = 1); -if (intercept_status == 1 | intercept_status == 2) -{ - scale_lambda [D, 1] = 0; -} - -if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 -{ # Important assumption: X [, D] = matrix (1, rows = N, cols = 1) - avg_X_cols = t(colSums(X)) / N; - var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1); - is_unsafe = ppred (var_X_cols, 0.0, "<="); - scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); - scale_X [D, 1] = 1; - shift_X = - avg_X_cols * scale_X; - shift_X [D, 1] = 0; - rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2); -} else { - scale_X = matrix (1, rows = D, cols = 1); - shift_X = matrix (0, rows = D, cols = 1); - rowSums_X_sq = rowSums (X ^ 2); -} - -# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2) -# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale. -# The transform is then associatively applied to the other side of the expression, -# and is rewritten via "scale_X" and "shift_X" as follows: -# -# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# ssX_A = diag (scale_X) %*% A; -# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A; -# -# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: -# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ]; - -# Convert "Y_vec" into indicator matrice: -if (min (Y_vec) <= 0) { - # Category labels "0", "-1" etc. are converted into the largest label - max_y = max (Y_vec); - Y_vec = Y_vec + (- Y_vec + max_y + 1) * ppred (Y_vec , 0.0, "<="); -} -Y = table (seq (1, N, 1), Y_vec); -K = ncol (Y) - 1; # The number of non-baseline categories - -lambda = (scale_lambda %*% matrix (1, rows = 1, cols = K)) * regularization; -delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq)); - -B = matrix (0, rows = D, cols = K); ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B; - ### LT = append (LT, matrix (0, rows = N, cols = 1)); - ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1); -P = matrix (1, rows = N, cols = K+1); ### exp_LT = exp (LT); -P = P / (K + 1); ### P = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1)); -obj = N * log (K + 1); ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2)); - -Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]); -if (intercept_status == 2) { - Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ]; -} -Grad = Grad + lambda * B; -norm_Grad = sqrt (sum (Grad ^ 2)); -norm_Grad_initial = norm_Grad; - -if (maxinneriter == 0) { - maxinneriter = D * K; -} -iter = 1; - -# boolean for convergence check -converge = (norm_Grad < tol) | (iter > maxiter); - -print ("-- Initially: Objective = " + obj + ", Gradient Norm = " + norm_Grad + ", Trust Delta = " + delta); - -if (fileLog != " ") { - log_str = "OBJECTIVE,0," + obj; - log_str = append (log_str, "GRADIENT_NORM,0," + norm_Grad); - log_str = append (log_str, "TRUST_DELTA,0," + delta); -} else { - log_str = " "; -} - -while (! converge) -{ - # SOLVE TRUST REGION SUB-PROBLEM - S = matrix (0, rows = D, cols = K); - R = - Grad; - V = R; - delta2 = delta ^ 2; - inneriter = 1; - norm_R2 = sum (R ^ 2); - innerconverge = (sqrt (norm_R2) <= psi * norm_Grad); - is_trust_boundary_reached = 0; - - while (! innerconverge) - { - if (intercept_status == 2) { - ssX_V = diag (scale_X) %*% V; - ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V; - } else { - ssX_V = V; - } - Q = P [, 1:K] * (X %*% ssX_V); - HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, rows = 1, cols = K))); - if (intercept_status == 2) { - HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ]; - } - HV = HV + lambda * V; - alpha = norm_R2 / sum (V * HV); - Snew = S + alpha * V; - norm_Snew2 = sum (Snew ^ 2); - if (norm_Snew2 <= delta2) - { - S = Snew; - R = R - alpha * HV; - old_norm_R2 = norm_R2 - norm_R2 = sum (R ^ 2); - V = R + (norm_R2 / old_norm_R2) * V; - innerconverge = (sqrt (norm_R2) <= psi * norm_Grad); - } else { - is_trust_boundary_reached = 1; - sv = sum (S * V); - v2 = sum (V ^ 2); - s2 = sum (S ^ 2); - rad = sqrt (sv ^ 2 + v2 * (delta2 - s2)); - if (sv >= 0) { - alpha = (delta2 - s2) / (sv + rad); - } else { - alpha = (rad - sv) / v2; - } - S = S + alpha * V; - R = R - alpha * HV; - innerconverge = TRUE; - } - inneriter = inneriter + 1; - innerconverge = innerconverge | (inneriter > maxinneriter); - } - - # END TRUST REGION SUB-PROBLEM - - # compute rho, update B, obtain delta - gs = sum (S * Grad); - qk = - 0.5 * (gs - sum (S * R)); - B_new = B + S; - if (intercept_status == 2) { - ssX_B_new = diag (scale_X) %*% B_new; - ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new; - } else { - ssX_B_new = B_new; - } - - LT = append ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1)); - if (fileLog != " ") { - log_str = append (log_str, "LINEAR_TERM_MIN," + iter + "," + min (LT)); - log_str = append (log_str, "LINEAR_TERM_MAX," + iter + "," + max (LT)); - } - LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1); - exp_LT = exp (LT); - P_new = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1)); - obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2)); - - # Consider updating LT in the inner loop - # Consider the big "obj" and "obj_new" rounding-off their small difference below: - - actred = (obj - obj_new); - - rho = actred / qk; - is_rho_accepted = (rho > eta0); - snorm = sqrt (sum (S ^ 2)); - - if (fileLog != " ") { - log_str = append (log_str, "NUM_CG_ITERS," + iter + "," + (inneriter - 1)); - log_str = append (log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached); - log_str = append (log_str, "POINT_STEP_NORM," + iter + "," + snorm); - log_str = append (log_str, "OBJECTIVE," + iter + "," + obj_new); - log_str = append (log_str, "OBJ_DROP_REAL," + iter + "," + actred); - log_str = append (log_str, "OBJ_DROP_PRED," + iter + "," + qk); - log_str = append (log_str, "OBJ_DROP_RATIO," + iter + "," + rho); - } - - if (iter == 1) { - delta = min (delta, snorm); - } - - alpha2 = obj_new - obj - gs; - if (alpha2 <= 0) { - alpha = sigma3; - } - else { - alpha = max (sigma1, -0.5 * gs / alpha2); - } - - if (rho < eta0) { - delta = min (max (alpha, sigma1) * snorm, sigma2 * delta); - } - else { - if (rho < eta1) { - delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta)); - } - else { - if (rho < eta2) { - delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta)); - } - else { - delta = max (delta, min (alpha * snorm, sigma3 * delta)); - } - } - } - - if (is_trust_boundary_reached == 1) - { - print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED"); - } else { - print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations"); - } - print (" -- Obj.Reduction: Actual = " + actred + ", Predicted = " + qk + - " (A/P: " + (round (10000.0 * rho) / 10000.0) + "), Trust Delta = " + delta); - - if (is_rho_accepted) - { - B = B_new; - P = P_new; - Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]); - if (intercept_status == 2) { - Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ]; - } - Grad = Grad + lambda * B; - norm_Grad = sqrt (sum (Grad ^ 2)); - obj = obj_new; - print (" -- New Objective = " + obj + ", Beta Change Norm = " + snorm + ", Gradient Norm = " + norm_Grad); - if (fileLog != " ") { - log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",1"); - log_str = append (log_str, "GRADIENT_NORM," + iter + "," + norm_Grad); - } - } else { - if (fileLog != " ") { - log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",0"); - } - } - - if (fileLog != " ") { - log_str = append (log_str, "TRUST_DELTA," + iter + "," + delta); - } - - iter = iter + 1; - converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) | - ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001))); - if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); } -} - -if (intercept_status == 2) { - B_out = diag (scale_X) %*% B; - B_out [D, ] = B_out [D, ] + t(shift_X) %*% B; -} else { - B_out = B; -} -write (B_out, fileB, format=fmtB); - -if (fileLog != " ") { - write (log_str, fileLog); -} - +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Solves Multinomial Logistic Regression using Trust Region methods. +# (See: Trust Region Newton Method for Logistic Regression, Lin, Weng and Keerthi, JMLR 9 (2008) 627-650) + +# INPUT PARAMETERS: +# -------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# -------------------------------------------------------------------------------------------- +# X String --- Location to read the matrix of feature vectors +# Y String --- Location to read the matrix with category labels +# B String --- Location to store estimated regression parameters (the betas) +# Log String " " Location to write per-iteration variables for log/debugging purposes +# icpt Int 0 Intercept presence, shifting and rescaling X columns: +# 0 = no intercept, no shifting, no rescaling; +# 1 = add intercept, but neither shift nor rescale X; +# 2 = add intercept, shift & rescale X columns to mean = 0, variance = 1 +# reg Double 0.0 regularization parameter (lambda = 1/C); intercept is not regularized +# tol Double 0.000001 tolerance ("epsilon") +# moi Int 100 max. number of outer (Newton) iterations +# mii Int 0 max. number of inner (conjugate gradient) iterations, 0 = no max +# fmt String "text" Matrix output format, usually "text" or "csv" (for matrices only) +# -------------------------------------------------------------------------------------------- +# The largest label represents the baseline category; if label -1 or 0 is present, then it is +# the baseline label (and it is converted to the largest label). +# +# The Log file, when requested, contains the following per-iteration variables in CSV format, +# each line containing triple (NAME, ITERATION, VALUE) with ITERATION = 0 for initial values: +# +# NAME MEANING +# ------------------------------------------------------------------------------------------- +# LINEAR_TERM_MIN The minimum value of X %*% B, used to check for overflows +# LINEAR_TERM_MAX The maximum value of X %*% B, used to check for overflows +# NUM_CG_ITERS Number of inner (Conj.Gradient) iterations in this outer iteration +# IS_TRUST_REACHED 1 = trust region boundary was reached, 0 = otherwise +# POINT_STEP_NORM L2-norm of iteration step from old point (i.e. matrix B) to new point +# OBJECTIVE The loss function we minimize (negative regularized log-likelihood) +# OBJ_DROP_REAL Reduction in the objective during this iteration, actual value +# OBJ_DROP_PRED Reduction in the objective predicted by a quadratic approximation +# OBJ_DROP_RATIO Actual-to-predicted reduction ratio, used to update the trust region +# IS_POINT_UPDATED 1 = new point accepted; 0 = new point rejected, old point restored +# GRADIENT_NORM L2-norm of the loss function gradient (omitted if point is rejected) +# TRUST_DELTA Updated trust region size, the "delta" +# ------------------------------------------------------------------------------------------- +# +# Script invocation example: +# hadoop jar SystemML.jar -f MultiLogReg.dml -nvargs icpt=2 reg=1.0 tol=0.000001 moi=100 mii=20 +# X=INPUT_DIR/X123 Y=INPUT_DIR/Y123 B=OUTPUT_DIR/B123 fmt=csv Log=OUTPUT_DIR/log + +fileX = $X; +fileY = $Y; +fileB = $B; +fileLog = ifdef ($Log, " "); +fmtB = ifdef ($fmt, "text"); + +intercept_status = ifdef ($icpt, 0); # $icpt = 0; +regularization = ifdef ($reg, 0.0); # $reg = 0.0; +tol = ifdef ($tol, 0.000001); # $tol = 0.000001; +maxiter = ifdef ($moi, 100); # $moi = 100; +maxinneriter = ifdef ($mii, 0); # $mii = 0; + +tol = as.double (tol); + +print ("BEGIN MULTINOMIAL LOGISTIC REGRESSION SCRIPT"); +print ("Reading X..."); +X = read (fileX); +print ("Reading Y..."); +Y_vec = read (fileY); + +eta0 = 0.0001; +eta1 = 0.25; +eta2 = 0.75; +sigma1 = 0.25; +sigma2 = 0.5; +sigma3 = 4.0; +psi = 0.1; + +N = nrow (X); +D = ncol (X); + +# Introduce the intercept, shift and rescale the columns of X if needed +if (intercept_status == 1 | intercept_status == 2) # add the intercept column +{ + X = append (X, matrix (1, rows = N, cols = 1)); + D = ncol (X); +} + +scale_lambda = matrix (1, rows = D, cols = 1); +if (intercept_status == 1 | intercept_status == 2) +{ + scale_lambda [D, 1] = 0; +} + +if (intercept_status == 2) # scale-&-shift X columns to mean 0, variance 1 +{ # Important assumption: X [, D] = matrix (1, rows = N, cols = 1) + avg_X_cols = t(colSums(X)) / N; + var_X_cols = (t(colSums (X ^ 2)) - N * (avg_X_cols ^ 2)) / (N - 1); + is_unsafe = ppred (var_X_cols, 0.0, "<="); + scale_X = 1.0 / sqrt (var_X_cols * (1 - is_unsafe) + is_unsafe); + scale_X [D, 1] = 1; + shift_X = - avg_X_cols * scale_X; + shift_X [D, 1] = 0; + rowSums_X_sq = (X ^ 2) %*% (scale_X ^ 2) + X %*% (2 * scale_X * shift_X) + sum (shift_X ^ 2); +} else { + scale_X = matrix (1, rows = D, cols = 1); + shift_X = matrix (0, rows = D, cols = 1); + rowSums_X_sq = rowSums (X ^ 2); +} + +# Henceforth we replace "X" with "X %*% (SHIFT/SCALE TRANSFORM)" and rowSums(X ^ 2) +# with "rowSums_X_sq" in order to preserve the sparsity of X under shift and scale. +# The transform is then associatively applied to the other side of the expression, +# and is rewritten via "scale_X" and "shift_X" as follows: +# +# ssX_A = (SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# ssX_A = diag (scale_X) %*% A; +# ssX_A [D, ] = ssX_A [D, ] + t(shift_X) %*% A; +# +# tssX_A = t(SHIFT/SCALE TRANSFORM) %*% A --- is rewritten as: +# tssX_A = diag (scale_X) %*% A + shift_X %*% A [D, ]; + +# Convert "Y_vec" into indicator matrice: +if (min (Y_vec) <= 0) { + # Category labels "0", "-1" etc. are converted into the largest label + max_y = max (Y_vec); + Y_vec = Y_vec + (- Y_vec + max_y + 1) * ppred (Y_vec , 0.0, "<="); +} +Y = table (seq (1, N, 1), Y_vec); +K = ncol (Y) - 1; # The number of non-baseline categories + +lambda = (scale_lambda %*% matrix (1, rows = 1, cols = K)) * regularization; +delta = 0.5 * sqrt (D) / max (sqrt (rowSums_X_sq)); + +B = matrix (0, rows = D, cols = K); ### LT = X %*% (SHIFT/SCALE TRANSFORM) %*% B; + ### LT = append (LT, matrix (0, rows = N, cols = 1)); + ### LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1); +P = matrix (1, rows = N, cols = K+1); ### exp_LT = exp (LT); +P = P / (K + 1); ### P = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1)); +obj = N * log (K + 1); ### obj = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2)); + +Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]); +if (intercept_status == 2) { + Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ]; +} +Grad = Grad + lambda * B; +norm_Grad = sqrt (sum (Grad ^ 2)); +norm_Grad_initial = norm_Grad; + +if (maxinneriter == 0) { + maxinneriter = D * K; +} +iter = 1; + +# boolean for convergence check +converge = (norm_Grad < tol) | (iter > maxiter); + +print ("-- Initially: Objective = " + obj + ", Gradient Norm = " + norm_Grad + ", Trust Delta = " + delta); + +if (fileLog != " ") { + log_str = "OBJECTIVE,0," + obj; + log_str = append (log_str, "GRADIENT_NORM,0," + norm_Grad); + log_str = append (log_str, "TRUST_DELTA,0," + delta); +} else { + log_str = " "; +} + +while (! converge) +{ + # SOLVE TRUST REGION SUB-PROBLEM + S = matrix (0, rows = D, cols = K); + R = - Grad; + V = R; + delta2 = delta ^ 2; + inneriter = 1; + norm_R2 = sum (R ^ 2); + innerconverge = (sqrt (norm_R2) <= psi * norm_Grad); + is_trust_boundary_reached = 0; + + while (! innerconverge) + { + if (intercept_status == 2) { + ssX_V = diag (scale_X) %*% V; + ssX_V [D, ] = ssX_V [D, ] + t(shift_X) %*% V; + } else { + ssX_V = V; + } + Q = P [, 1:K] * (X %*% ssX_V); + HV = t(X) %*% (Q - P [, 1:K] * (rowSums (Q) %*% matrix (1, rows = 1, cols = K))); + if (intercept_status == 2) { + HV = diag (scale_X) %*% HV + shift_X %*% HV [D, ]; + } + HV = HV + lambda * V; + alpha = norm_R2 / sum (V * HV); + Snew = S + alpha * V; + norm_Snew2 = sum (Snew ^ 2); + if (norm_Snew2 <= delta2) + { + S = Snew; + R = R - alpha * HV; + old_norm_R2 = norm_R2 + norm_R2 = sum (R ^ 2); + V = R + (norm_R2 / old_norm_R2) * V; + innerconverge = (sqrt (norm_R2) <= psi * norm_Grad); + } else { + is_trust_boundary_reached = 1; + sv = sum (S * V); + v2 = sum (V ^ 2); + s2 = sum (S ^ 2); + rad = sqrt (sv ^ 2 + v2 * (delta2 - s2)); + if (sv >= 0) { + alpha = (delta2 - s2) / (sv + rad); + } else { + alpha = (rad - sv) / v2; + } + S = S + alpha * V; + R = R - alpha * HV; + innerconverge = TRUE; + } + inneriter = inneriter + 1; + innerconverge = innerconverge | (inneriter > maxinneriter); + } + + # END TRUST REGION SUB-PROBLEM + + # compute rho, update B, obtain delta + gs = sum (S * Grad); + qk = - 0.5 * (gs - sum (S * R)); + B_new = B + S; + if (intercept_status == 2) { + ssX_B_new = diag (scale_X) %*% B_new; + ssX_B_new [D, ] = ssX_B_new [D, ] + t(shift_X) %*% B_new; + } else { + ssX_B_new = B_new; + } + + LT = append ((X %*% ssX_B_new), matrix (0, rows = N, cols = 1)); + if (fileLog != " ") { + log_str = append (log_str, "LINEAR_TERM_MIN," + iter + "," + min (LT)); + log_str = append (log_str, "LINEAR_TERM_MAX," + iter + "," + max (LT)); + } + LT = LT - rowMaxs (LT) %*% matrix (1, rows = 1, cols = K+1); + exp_LT = exp (LT); + P_new = exp_LT / (rowSums (exp_LT) %*% matrix (1, rows = 1, cols = K+1)); + obj_new = - sum (Y * LT) + sum (log (rowSums (exp_LT))) + 0.5 * sum (lambda * (B_new ^ 2)); + + # Consider updating LT in the inner loop + # Consider the big "obj" and "obj_new" rounding-off their small difference below: + + actred = (obj - obj_new); + + rho = actred / qk; + is_rho_accepted = (rho > eta0); + snorm = sqrt (sum (S ^ 2)); + + if (fileLog != " ") { + log_str = append (log_str, "NUM_CG_ITERS," + iter + "," + (inneriter - 1)); + log_str = append (log_str, "IS_TRUST_REACHED," + iter + "," + is_trust_boundary_reached); + log_str = append (log_str, "POINT_STEP_NORM," + iter + "," + snorm); + log_str = append (log_str, "OBJECTIVE," + iter + "," + obj_new); + log_str = append (log_str, "OBJ_DROP_REAL," + iter + "," + actred); + log_str = append (log_str, "OBJ_DROP_PRED," + iter + "," + qk); + log_str = append (log_str, "OBJ_DROP_RATIO," + iter + "," + rho); + } + + if (iter == 1) { + delta = min (delta, snorm); + } + + alpha2 = obj_new - obj - gs; + if (alpha2 <= 0) { + alpha = sigma3; + } + else { + alpha = max (sigma1, -0.5 * gs / alpha2); + } + + if (rho < eta0) { + delta = min (max (alpha, sigma1) * snorm, sigma2 * delta); + } + else { + if (rho < eta1) { + delta = max (sigma1 * delta, min (alpha * snorm, sigma2 * delta)); + } + else { + if (rho < eta2) { + delta = max (sigma1 * delta, min (alpha * snorm, sigma3 * delta)); + } + else { + delta = max (delta, min (alpha * snorm, sigma3 * delta)); + } + } + } + + if (is_trust_boundary_reached == 1) + { + print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations, trust bound REACHED"); + } else { + print ("-- Outer Iteration " + iter + ": Had " + (inneriter - 1) + " CG iterations"); + } + print (" -- Obj.Reduction: Actual = " + actred + ", Predicted = " + qk + + " (A/P: " + (round (10000.0 * rho) / 10000.0) + "), Trust Delta = " + delta); + + if (is_rho_accepted) + { + B = B_new; + P = P_new; + Grad = t(X) %*% (P [, 1:K] - Y [, 1:K]); + if (intercept_status == 2) { + Grad = diag (scale_X) %*% Grad + shift_X %*% Grad [D, ]; + } + Grad = Grad + lambda * B; + norm_Grad = sqrt (sum (Grad ^ 2)); + obj = obj_new; + print (" -- New Objective = " + obj + ", Beta Change Norm = " + snorm + ", Gradient Norm = " + norm_Grad); + if (fileLog != " ") { + log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",1"); + log_str = append (log_str, "GRADIENT_NORM," + iter + "," + norm_Grad); + } + } else { + if (fileLog != " ") { + log_str = append (log_str, "IS_POINT_UPDATED," + iter + ",0"); + } + } + + if (fileLog != " ") { + log_str = append (log_str, "TRUST_DELTA," + iter + "," + delta); + } + + iter = iter + 1; + converge = ((norm_Grad < (tol * norm_Grad_initial)) | (iter > maxiter) | + ((is_trust_boundary_reached == 0) & (abs (actred) < (abs (obj) + abs (obj_new)) * 0.00000000000001))); + if (converge) { print ("Termination / Convergence condition satisfied."); } else { print (" "); } +} + +if (intercept_status == 2) { + B_out = diag (scale_X) %*% B; + B_out [D, ] = B_out [D, ] + t(shift_X) %*% B; +} else { + B_out = B; +} +write (B_out, fileB, format=fmtB); + +if (fileLog != " ") { + write (log_str, fileLog); +} +
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/PCA.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/PCA.dml b/scripts/algorithms/PCA.dml index e8b4aec..65800b7 100644 --- a/scripts/algorithms/PCA.dml +++ b/scripts/algorithms/PCA.dml @@ -1,112 +1,112 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------- - -# -# This script performs Principal Component Analysis (PCA) on the given input data. -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# INPUT String --- Location to read the matrix A of feature vectors -# K Int --- Indicates dimension of the new vector space constructed from eigen vectors -# CENTER Int 0 Indicates whether or not to center data -# SCALE Int 0 Indicates whether or not to scale data -# OFMT String --- Output data format -# PROJDATA Int 0 This argument indicates if the data should be projected or not -# MODEL String --- Location to already existing model: eigenvectors and eigenvalues -# OUTPUT String / Location to write output matrices (covariance matrix, new basis vectors, -# and data projected onto new basis vectors) -# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000 -# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1 -# --------------------------------------------------------------------------------------------- - -A = read($INPUT); -K = ifdef($K, ncol(A)); -ofmt = ifdef($OFMT, "CSV"); -projectData = ifdef($PROJDATA,0); -model = ifdef($MODEL,""); -center = ifdef($CENTER,0); -scale = ifdef($SCALE,0); -output = ifdef($OUTPUT,"/"); - -evec_dominant = matrix(0,cols=1,rows=1); - -if (model != "") { - # reuse existing model to project data - evec_dominant = read(model+"/dominant.eigen.vectors"); -}else{ - if (model == "" ){ - model = output; - } - - N = nrow(A); - D = ncol(A); - - # perform z-scoring (centering and scaling) - if (center == 1) { - cm = colMeans(A); - A = A - cm; - } - if (scale == 1) { - cvars = (colSums (A^2)); - if (center == 1){ - cm = colMeans(A); - cvars = (cvars - N*(cm^2))/(N-1); - } - Azscored = (A)/sqrt(cvars); - A = Azscored; - } - - # co-variance matrix - mu = colSums(A)/N; - C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu; - - - # compute eigen vectors and values - [evalues, evectors] = eigen(C); - - decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE); - diagmat = table(seq(1,D),decreasing_Idx); - # sorts eigenvalues by decreasing order - evalues = diagmat %*% evalues; - # sorts eigenvectors column-wise in the order of decreasing eigenvalues - evectors = evectors %*% diagmat; - - - # select K dominant eigen vectors - nvec = ncol(evectors); - - eval_dominant = evalues[1:K, 1]; - evec_dominant = evectors[,1:K]; - - # the square root of eigenvalues - eval_stdev_dominant = sqrt(eval_dominant); - - write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt); - write(eval_dominant, model+"/dominant.eigen.values", format=ofmt); - write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt); -} -if (projectData == 1 | model != ""){ - # Construct new data set by treating computed dominant eigenvectors as the basis vectors - newA = A %*% evec_dominant; - write(newA, output+"/projected.data", format=ofmt); -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# This script performs Principal Component Analysis (PCA) on the given input data. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# INPUT String --- Location to read the matrix A of feature vectors +# K Int --- Indicates dimension of the new vector space constructed from eigen vectors +# CENTER Int 0 Indicates whether or not to center data +# SCALE Int 0 Indicates whether or not to scale data +# OFMT String --- Output data format +# PROJDATA Int 0 This argument indicates if the data should be projected or not +# MODEL String --- Location to already existing model: eigenvectors and eigenvalues +# OUTPUT String / Location to write output matrices (covariance matrix, new basis vectors, +# and data projected onto new basis vectors) +# hadoop jar SystemML.jar -f PCA.dml -nvargs INPUT=INPUT_DIR/pca-1000x1000 +# OUTPUT=OUTPUT_DIR/pca-1000x1000-model PROJDATA=1 CENTER=1 SCALE=1 +# --------------------------------------------------------------------------------------------- + +A = read($INPUT); +K = ifdef($K, ncol(A)); +ofmt = ifdef($OFMT, "CSV"); +projectData = ifdef($PROJDATA,0); +model = ifdef($MODEL,""); +center = ifdef($CENTER,0); +scale = ifdef($SCALE,0); +output = ifdef($OUTPUT,"/"); + +evec_dominant = matrix(0,cols=1,rows=1); + +if (model != "") { + # reuse existing model to project data + evec_dominant = read(model+"/dominant.eigen.vectors"); +}else{ + if (model == "" ){ + model = output; + } + + N = nrow(A); + D = ncol(A); + + # perform z-scoring (centering and scaling) + if (center == 1) { + cm = colMeans(A); + A = A - cm; + } + if (scale == 1) { + cvars = (colSums (A^2)); + if (center == 1){ + cm = colMeans(A); + cvars = (cvars - N*(cm^2))/(N-1); + } + Azscored = (A)/sqrt(cvars); + A = Azscored; + } + + # co-variance matrix + mu = colSums(A)/N; + C = (t(A) %*% A)/(N-1) - (N/(N-1))*t(mu) %*% mu; + + + # compute eigen vectors and values + [evalues, evectors] = eigen(C); + + decreasing_Idx = order(target=evalues,by=1,decreasing=TRUE,index.return=TRUE); + diagmat = table(seq(1,D),decreasing_Idx); + # sorts eigenvalues by decreasing order + evalues = diagmat %*% evalues; + # sorts eigenvectors column-wise in the order of decreasing eigenvalues + evectors = evectors %*% diagmat; + + + # select K dominant eigen vectors + nvec = ncol(evectors); + + eval_dominant = evalues[1:K, 1]; + evec_dominant = evectors[,1:K]; + + # the square root of eigenvalues + eval_stdev_dominant = sqrt(eval_dominant); + + write(eval_stdev_dominant, model+"/dominant.eigen.standard.deviations", format=ofmt); + write(eval_dominant, model+"/dominant.eigen.values", format=ofmt); + write(evec_dominant, model+"/dominant.eigen.vectors", format=ofmt); +} +if (projectData == 1 | model != ""){ + # Construct new data set by treating computed dominant eigenvectors as the basis vectors + newA = A %*% evec_dominant; + write(newA, output+"/projected.data", format=ofmt); +}
