http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/l2-svm.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml index 04b83b0..140ac5a 100644 --- a/scripts/algorithms/l2-svm.dml +++ b/scripts/algorithms/l2-svm.dml @@ -1,159 +1,159 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# Implements binary-class SVM with squared slack variables -# -# Example Usage: -# Assume L2SVM_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume epsilon = 0.001, lambda = 1, maxiterations = 100 -# -# hadoop jar SystemML.jar -f $L2SVM_HOME/l2-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y icpt=0 tol=0.001 reg=1 maxiter=100 model=$OUPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" -# -# Note about inputs: -# Assumes that labels (entries in Y) -# are set to either -1 or +1 -# or the result of recoding -# - -cmdLine_fmt = ifdef($fmt, "text") -cmdLine_icpt = ifdef($icpt, 0) -cmdLine_tol = ifdef($tol, 0.001) -cmdLine_reg = ifdef($reg, 1.0) -cmdLine_maxiter = ifdef($maxiter, 100) - -X = read($X) -Y = read($Y) - -if(nrow(X) < 2) - stop("Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows") - -check_min = min(Y) -check_max = max(Y) -num_min = sum(ppred(Y, check_min, "==")) -num_max = sum(ppred(Y, check_max, "==")) - -if(check_min == check_max) - stop("Stopping due to invalid inputs: Y seems to contain exactly one label") - -if(num_min + num_max != nrow(Y)) - stop("Stopping due to invalid inputs: Y seems to contain more than 2 labels") - -if(check_min != -1 | check_max != +1) - Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min) - -positive_label = check_max -negative_label = check_min - -continue = 1 - -intercept = cmdLine_icpt -if(intercept != 0 & intercept != 1) - stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1") - -epsilon = cmdLine_tol -if(epsilon < 0) - stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative") - -lambda = cmdLine_reg -if(lambda < 0) - stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative") - -maxiterations = cmdLine_maxiter 
-if(maxiterations < 1) - stop("Stopping due to invalid argument: Maximum iterations should be a positive integer") - -num_samples = nrow(X) -dimensions = ncol(X) - -if (intercept == 1) { - ones = matrix(1, rows=num_samples, cols=1) - X = append(X, ones); -} - -num_rows_in_w = dimensions -if(intercept == 1){ - num_rows_in_w = num_rows_in_w + 1 -} -w = matrix(0, rows=num_rows_in_w, cols=1) - -g_old = t(X) %*% Y -s = g_old - -Xw = matrix(0, rows=nrow(X), cols=1) -debug_str = "# Iter, Obj" -iter = 0 -while(continue == 1 & iter < maxiterations) { - # minimizing primal obj along direction s - step_sz = 0 - Xd = X %*% s - wd = lambda * sum(w * s) - dd = lambda * sum(s * s) - continue1 = 1 - while(continue1 == 1){ - tmp_Xw = Xw + step_sz*Xd - out = 1 - Y * (tmp_Xw) - sv = ppred(out, 0, ">") - out = out * sv - g = wd + step_sz*dd - sum(out * Y * Xd) - h = dd + sum(Xd * sv * Xd) - step_sz = step_sz - g/h - if (g*g/h < 0.0000000001){ - continue1 = 0 - } - } - - #update weights - w = w + step_sz*s - Xw = Xw + step_sz*Xd - - out = 1 - Y * Xw - sv = ppred(out, 0, ">") - out = sv * out - obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w) - g_new = t(X) %*% (out * Y) - lambda * w - - print("ITER " + iter + ": OBJ=" + obj) - debug_str = append(debug_str, iter + "," + obj) - - tmp = sum(s * g_old) - if(step_sz*tmp < epsilon*obj){ - continue = 0 - } - - #non-linear CG step - be = sum(g_new * g_new)/sum(g_old * g_old) - s = be * s + g_new - g_old = g_new - - iter = iter + 1 -} - -extra_model_params = matrix(0, rows=4, cols=1) -extra_model_params[1,1] = positive_label -extra_model_params[2,1] = negative_label -extra_model_params[3,1] = intercept -extra_model_params[4,1] = dimensions - -w = t(append(t(w), t(extra_model_params))) -write(w, $model, format=cmdLine_fmt) - -write(debug_str, $Log) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Implements binary-class SVM with squared slack variables +# +# Example Usage: +# Assume L2SVM_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume epsilon = 0.001, lambda = 1, maxiterations = 100 +# +# hadoop jar SystemML.jar -f $L2SVM_HOME/l2-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y icpt=0 tol=0.001 reg=1 maxiter=100 model=$OUPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" +# +# Note about inputs: +# Assumes that labels (entries in Y) +# are set to either -1 or +1 +# or the result of recoding +# + +cmdLine_fmt = ifdef($fmt, "text") +cmdLine_icpt = ifdef($icpt, 0) +cmdLine_tol = ifdef($tol, 0.001) +cmdLine_reg = ifdef($reg, 1.0) +cmdLine_maxiter = ifdef($maxiter, 100) + +X = read($X) +Y = read($Y) + +if(nrow(X) < 2) + stop("Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows") + +check_min = min(Y) +check_max = max(Y) +num_min = sum(ppred(Y, check_min, "==")) +num_max = sum(ppred(Y, check_max, "==")) + +if(check_min == check_max) + stop("Stopping due to invalid inputs: Y seems to contain exactly one label") + +if(num_min + num_max != nrow(Y)) + stop("Stopping due to invalid inputs: Y seems to 
contain more than 2 labels") + +if(check_min != -1 | check_max != +1) + Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min) + +positive_label = check_max +negative_label = check_min + +continue = 1 + +intercept = cmdLine_icpt +if(intercept != 0 & intercept != 1) + stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1") + +epsilon = cmdLine_tol +if(epsilon < 0) + stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative") + +lambda = cmdLine_reg +if(lambda < 0) + stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative") + +maxiterations = cmdLine_maxiter +if(maxiterations < 1) + stop("Stopping due to invalid argument: Maximum iterations should be a positive integer") + +num_samples = nrow(X) +dimensions = ncol(X) + +if (intercept == 1) { + ones = matrix(1, rows=num_samples, cols=1) + X = append(X, ones); +} + +num_rows_in_w = dimensions +if(intercept == 1){ + num_rows_in_w = num_rows_in_w + 1 +} +w = matrix(0, rows=num_rows_in_w, cols=1) + +g_old = t(X) %*% Y +s = g_old + +Xw = matrix(0, rows=nrow(X), cols=1) +debug_str = "# Iter, Obj" +iter = 0 +while(continue == 1 & iter < maxiterations) { + # minimizing primal obj along direction s + step_sz = 0 + Xd = X %*% s + wd = lambda * sum(w * s) + dd = lambda * sum(s * s) + continue1 = 1 + while(continue1 == 1){ + tmp_Xw = Xw + step_sz*Xd + out = 1 - Y * (tmp_Xw) + sv = ppred(out, 0, ">") + out = out * sv + g = wd + step_sz*dd - sum(out * Y * Xd) + h = dd + sum(Xd * sv * Xd) + step_sz = step_sz - g/h + if (g*g/h < 0.0000000001){ + continue1 = 0 + } + } + + #update weights + w = w + step_sz*s + Xw = Xw + step_sz*Xd + + out = 1 - Y * Xw + sv = ppred(out, 0, ">") + out = sv * out + obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w) + g_new = t(X) %*% (out * Y) - lambda * w + + print("ITER " + iter + ": OBJ=" + obj) + debug_str = append(debug_str, iter + "," + obj) + + tmp = sum(s * g_old) + if(step_sz*tmp < 
epsilon*obj){ + continue = 0 + } + + #non-linear CG step + be = sum(g_new * g_new)/sum(g_old * g_old) + s = be * s + g_new + g_old = g_new + + iter = iter + 1 +} + +extra_model_params = matrix(0, rows=4, cols=1) +extra_model_params[1,1] = positive_label +extra_model_params[2,1] = negative_label +extra_model_params[3,1] = intercept +extra_model_params[4,1] = dimensions + +w = t(append(t(w), t(extra_model_params))) +write(w, $model, format=cmdLine_fmt) + +write(debug_str, $Log)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/m-svm-predict.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/m-svm-predict.dml b/scripts/algorithms/m-svm-predict.dml index 4d0c736..ba06cf6 100644 --- a/scripts/algorithms/m-svm-predict.dml +++ b/scripts/algorithms/m-svm-predict.dml @@ -1,84 +1,84 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# This script can be used to compute label predictions -# Meant for use with an SVM model (learnt using m-svm.dml) on a held out test set -# -# Given ground truth labels, the script will compute an -# accuracy (%) for the predictions -# -# Example Usage: -# hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=data Y=labels model=model scores=scores accuracy=accuracy confusion=confusion fmt="text" -# - -cmdLine_Y = ifdef($Y, " ") -cmdLine_confusion = ifdef($confusion, " ") -cmdLine_accuracy = ifdef($accuracy, " ") -cmdLine_scores = ifdef($scores, " ") -cmdLine_fmt = ifdef($fmt, "text") - -X = read($X); -W = read($model); - -dimensions = as.scalar(W[nrow(W),1]) -if(dimensions != ncol(X)) - stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions") - -intercept = as.scalar(W[nrow(W)-1,1]) -W = W[1:(nrow(W)-2),] - -N = nrow(X); -num_classes = ncol(W) -m=ncol(X); - -b = matrix(0, rows=1, cols=num_classes) -if (intercept == 1) - b = W[m+1,] - -ones = matrix(1, rows=N, cols=1) -scores = X %*% W[1:m,] + ones %*% b; - -if(cmdLine_scores != " ") - write(scores, cmdLine_scores, format=cmdLine_fmt); - -if(cmdLine_Y != " "){ - y = read(cmdLine_Y); - - if(min(y) < 1) - stop("Stopping due to invalid argument: Label vector (Y) must be recoded") - - pred = rowIndexMax(scores); - correct_percentage = sum(ppred(pred - y, 0, "==")) / N * 100; - - acc_str = "Accuracy (%): " + correct_percentage - print(acc_str) - if(cmdLine_accuracy != " ") - write(acc_str, cmdLine_accuracy) - - num_classes_ground_truth = max(y) - if(num_classes < num_classes_ground_truth) - num_classes = num_classes_ground_truth - - if(cmdLine_confusion != " "){ - confusion_mat = table(pred, y, num_classes, num_classes) - write(confusion_mat, cmdLine_confusion, format="csv") - } -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one 
+# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This script can be used to compute label predictions +# Meant for use with an SVM model (learnt using m-svm.dml) on a held out test set +# +# Given ground truth labels, the script will compute an +# accuracy (%) for the predictions +# +# Example Usage: +# hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=data Y=labels model=model scores=scores accuracy=accuracy confusion=confusion fmt="text" +# + +cmdLine_Y = ifdef($Y, " ") +cmdLine_confusion = ifdef($confusion, " ") +cmdLine_accuracy = ifdef($accuracy, " ") +cmdLine_scores = ifdef($scores, " ") +cmdLine_fmt = ifdef($fmt, "text") + +X = read($X); +W = read($model); + +dimensions = as.scalar(W[nrow(W),1]) +if(dimensions != ncol(X)) + stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions") + +intercept = as.scalar(W[nrow(W)-1,1]) +W = W[1:(nrow(W)-2),] + +N = nrow(X); +num_classes = ncol(W) +m=ncol(X); + +b = matrix(0, rows=1, cols=num_classes) +if (intercept == 1) + b = W[m+1,] + +ones = matrix(1, rows=N, cols=1) +scores = X %*% W[1:m,] + ones %*% b; + +if(cmdLine_scores != " ") + write(scores, cmdLine_scores, format=cmdLine_fmt); + +if(cmdLine_Y != " "){ + y = read(cmdLine_Y); + 
+ if(min(y) < 1) + stop("Stopping due to invalid argument: Label vector (Y) must be recoded") + + pred = rowIndexMax(scores); + correct_percentage = sum(ppred(pred - y, 0, "==")) / N * 100; + + acc_str = "Accuracy (%): " + correct_percentage + print(acc_str) + if(cmdLine_accuracy != " ") + write(acc_str, cmdLine_accuracy) + + num_classes_ground_truth = max(y) + if(num_classes < num_classes_ground_truth) + num_classes = num_classes_ground_truth + + if(cmdLine_confusion != " "){ + confusion_mat = table(pred, y, num_classes, num_classes) + write(confusion_mat, cmdLine_confusion, format="csv") + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/m-svm.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/m-svm.dml b/scripts/algorithms/m-svm.dml index c570872..560d46f 100644 --- a/scripts/algorithms/m-svm.dml +++ b/scripts/algorithms/m-svm.dml @@ -1,174 +1,174 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# Implements multiclass SVM with squared slack variables, -# learns one-against-the-rest binary-class classifiers -# -# Example Usage: -# Assume SVM_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume epsilon = 0.001, lambda=1.0, max_iterations = 100 -# -# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" -# - -cmdLine_fmt = ifdef($fmt, "text") -cmdLine_icpt = ifdef($icpt, 0) -cmdLine_tol = ifdef($tol, 0.001) -cmdLine_reg = ifdef($reg, 1.0) -cmdLine_maxiter = ifdef($maxiter, 100) - -print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter) - -X = read($X) - -if(nrow(X) < 2) - stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows") - -dimensions = ncol(X) - -Y = read($Y) - -if(nrow(X) != nrow(Y)) - stop("Stopping due to invalid argument: Numbers of rows in X and Y must match") - -intercept = cmdLine_icpt -if(intercept != 0 & intercept != 1) - stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1") - -min_y = min(Y) -if(min_y < 1) - stop("Stopping due to invalid argument: Label vector (Y) must be recoded") -num_classes = max(Y) -if(num_classes == 1) - stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier") -mod1 = Y %% 1 -mod1_should_be_nrow = sum(abs(ppred(mod1, 0, "=="))) -if(mod1_should_be_nrow != nrow(Y)) - stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels") - -epsilon = cmdLine_tol -if(epsilon < 0) - stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative") - -lambda = cmdLine_reg -if(lambda < 0) - stop("Stopping due to 
invalid argument: Regularization constant (reg) must be non-negative") - -max_iterations = cmdLine_maxiter -if(max_iterations < 1) - stop("Stopping due to invalid argument: Maximum iterations should be a positive integer") - -num_samples = nrow(X) -num_features = ncol(X) - -if (intercept == 1) { - ones = matrix(1, rows=num_samples, cols=1); - X = append(X, ones); -} - -num_rows_in_w = num_features -if(intercept == 1){ - num_rows_in_w = num_rows_in_w + 1 -} -w = matrix(0, rows=num_rows_in_w, cols=num_classes) - -debug_mat = matrix(-1, rows=max_iterations, cols=num_classes) -parfor(iter_class in 1:num_classes){ - Y_local = 2 * ppred(Y, iter_class, "==") - 1 - w_class = matrix(0, rows=num_features, cols=1) - if (intercept == 1) { - zero_matrix = matrix(0, rows=1, cols=1); - w_class = t(append(t(w_class), zero_matrix)); - } - - g_old = t(X) %*% Y_local - s = g_old - - Xw = matrix(0, rows=nrow(X), cols=1) - iter = 0 - continue = 1 - while(continue == 1) { - # minimizing primal obj along direction s - step_sz = 0 - Xd = X %*% s - wd = lambda * sum(w_class * s) - dd = lambda * sum(s * s) - continue1 = 1 - while(continue1 == 1){ - tmp_Xw = Xw + step_sz*Xd - out = 1 - Y_local * (tmp_Xw) - sv = ppred(out, 0, ">") - out = out * sv - g = wd + step_sz*dd - sum(out * Y_local * Xd) - h = dd + sum(Xd * sv * Xd) - step_sz = step_sz - g/h - if (g*g/h < 0.0000000001){ - continue1 = 0 - } - } - - #update weights - w_class = w_class + step_sz*s - Xw = Xw + step_sz*Xd - - out = 1 - Y_local * Xw - sv = ppred(out, 0, ">") - out = sv * out - obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class) - g_new = t(X) %*% (out * Y_local) - lambda * w_class - - tmp = sum(s * g_old) - - train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100 - print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc) - debug_mat[iter+1,iter_class] = obj - - if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){ - continue = 0 - } - - #non-linear CG 
step - be = sum(g_new * g_new)/sum(g_old * g_old) - s = be * s + g_new - g_old = g_new - - iter = iter + 1 - } - - w[,iter_class] = w_class -} - -extra_model_params = matrix(0, rows=2, cols=ncol(w)) -extra_model_params[1, 1] = intercept -extra_model_params[2, 1] = dimensions -w = t(append(t(w), t(extra_model_params))) -write(w, $model, format=cmdLine_fmt) - -debug_str = "# Class, Iter, Obj" -for(iter_class in 1:ncol(debug_mat)){ - for(iter in 1:nrow(debug_mat)){ - obj = castAsScalar(debug_mat[iter, iter_class]) - if(obj != -1) - debug_str = append(debug_str, iter_class + "," + iter + "," + obj) - } -} -write(debug_str, $Log) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Implements multiclass SVM with squared slack variables, +# learns one-against-the-rest binary-class classifiers +# +# Example Usage: +# Assume SVM_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume epsilon = 0.001, lambda=1.0, max_iterations = 100 +# +# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" +# + +cmdLine_fmt = ifdef($fmt, "text") +cmdLine_icpt = ifdef($icpt, 0) +cmdLine_tol = ifdef($tol, 0.001) +cmdLine_reg = ifdef($reg, 1.0) +cmdLine_maxiter = ifdef($maxiter, 100) + +print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter) + +X = read($X) + +if(nrow(X) < 2) + stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows") + +dimensions = ncol(X) + +Y = read($Y) + +if(nrow(X) != nrow(Y)) + stop("Stopping due to invalid argument: Numbers of rows in X and Y must match") + +intercept = cmdLine_icpt +if(intercept != 0 & intercept != 1) + stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1") + +min_y = min(Y) +if(min_y < 1) + stop("Stopping due to invalid argument: Label vector (Y) must be recoded") +num_classes = max(Y) +if(num_classes == 1) + stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier") +mod1 = Y %% 1 +mod1_should_be_nrow = sum(abs(ppred(mod1, 0, "=="))) +if(mod1_should_be_nrow != nrow(Y)) + stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels") + +epsilon = cmdLine_tol +if(epsilon < 0) + stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative") + +lambda = cmdLine_reg +if(lambda < 0) + stop("Stopping due to 
invalid argument: Regularization constant (reg) must be non-negative") + +max_iterations = cmdLine_maxiter +if(max_iterations < 1) + stop("Stopping due to invalid argument: Maximum iterations should be a positive integer") + +num_samples = nrow(X) +num_features = ncol(X) + +if (intercept == 1) { + ones = matrix(1, rows=num_samples, cols=1); + X = append(X, ones); +} + +num_rows_in_w = num_features +if(intercept == 1){ + num_rows_in_w = num_rows_in_w + 1 +} +w = matrix(0, rows=num_rows_in_w, cols=num_classes) + +debug_mat = matrix(-1, rows=max_iterations, cols=num_classes) +parfor(iter_class in 1:num_classes){ + Y_local = 2 * ppred(Y, iter_class, "==") - 1 + w_class = matrix(0, rows=num_features, cols=1) + if (intercept == 1) { + zero_matrix = matrix(0, rows=1, cols=1); + w_class = t(append(t(w_class), zero_matrix)); + } + + g_old = t(X) %*% Y_local + s = g_old + + Xw = matrix(0, rows=nrow(X), cols=1) + iter = 0 + continue = 1 + while(continue == 1) { + # minimizing primal obj along direction s + step_sz = 0 + Xd = X %*% s + wd = lambda * sum(w_class * s) + dd = lambda * sum(s * s) + continue1 = 1 + while(continue1 == 1){ + tmp_Xw = Xw + step_sz*Xd + out = 1 - Y_local * (tmp_Xw) + sv = ppred(out, 0, ">") + out = out * sv + g = wd + step_sz*dd - sum(out * Y_local * Xd) + h = dd + sum(Xd * sv * Xd) + step_sz = step_sz - g/h + if (g*g/h < 0.0000000001){ + continue1 = 0 + } + } + + #update weights + w_class = w_class + step_sz*s + Xw = Xw + step_sz*Xd + + out = 1 - Y_local * Xw + sv = ppred(out, 0, ">") + out = sv * out + obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class) + g_new = t(X) %*% (out * Y_local) - lambda * w_class + + tmp = sum(s * g_old) + + train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100 + print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc) + debug_mat[iter+1,iter_class] = obj + + if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){ + continue = 0 + } + + #non-linear CG 
step + be = sum(g_new * g_new)/sum(g_old * g_old) + s = be * s + g_new + g_old = g_new + + iter = iter + 1 + } + + w[,iter_class] = w_class +} + +extra_model_params = matrix(0, rows=2, cols=ncol(w)) +extra_model_params[1, 1] = intercept +extra_model_params[2, 1] = dimensions +w = t(append(t(w), t(extra_model_params))) +write(w, $model, format=cmdLine_fmt) + +debug_str = "# Class, Iter, Obj" +for(iter_class in 1:ncol(debug_mat)){ + for(iter in 1:nrow(debug_mat)){ + obj = castAsScalar(debug_mat[iter, iter_class]) + if(obj != -1) + debug_str = append(debug_str, iter_class + "," + iter + "," + obj) + } +} +write(debug_str, $Log) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/random-forest-predict.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/random-forest-predict.dml b/scripts/algorithms/random-forest-predict.dml index 2d99670..7bc6cd6 100644 --- a/scripts/algorithms/random-forest-predict.dml +++ b/scripts/algorithms/random-forest-predict.dml @@ -1,193 +1,193 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A RANDOM FOREST MODEL ON A HELD OUT TEST SET -# OR FOR COMPUTING THE OUT-OF-BAG ERROR ON THE TRAINING SET. -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- Location to read test feature matrix or training feature matrix for computing Out-Of-Bag error; -# note that X needs to be both recoded and dummy coded -# Y String " " Location to read true label matrix Y if requested; note that Y needs to be both recoded and dummy coded -# R String " " Location to read the matrix R which for each feature in X contains the following information -# - R[,1]: column ids -# - R[,2]: start indices -# - R[,3]: end indices -# If R is not provided by default all variables are assumed to be scale -# M String --- Location to read matrix M containing the learned tree i the following format -# - M[1,j]: id of node j (in a complete binary tree) -# - M[2,j]: tree id -# - M[3,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 -# - M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 -# - M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, -# otherwise the label that leaf node j is supposed to predict -# - M[6,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values -# stored in rows 7,8,... 
if j is categorical -# If j is a leaf node: number of misclassified samples reaching at node j -# - M[7:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[7,j] -# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 7,8,... -# depict the value subset chosen for j -# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 -# C String " " Location to read the counts matrix containing the number of times samples are chosen in each tree of the random forest -# P String --- Location to store the label predictions for X -# A String " " Location to store the test accuracy (%) for the prediction if requested -# OOB String " " If C is provided location to store the Out-Of-Bag (OOB) error of the learned model -# CM String " " Location to store the confusion matrix if requested -# fmt String "text" The output format of the output, such as "text" or "csv" -# --------------------------------------------------------------------------------------------- -# OUTPUT: -# 1- Matrix Y containing the predicted labels for X -# 2- Test accuracy if requested -# 3- Confusion matrix C if requested -# ------------------------------------------------------------------------------------------- -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f random-forest-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions -# A=OUTPUT_DIR/accurcay CM=OUTPUT_DIR/confusion fmt=csv - -fileX = $X; -fileM = $M; -fileP = $P; -fileY = ifdef ($Y, " "); -fileR = ifdef ($R, " "); -fileC = ifdef ($C, " "); -fileOOB = ifdef ($OOB, " "); -fileCM = ifdef ($CM, " "); -fileA = ifdef ($A, " "); -fmtO = ifdef ($fmt, "text"); -X = read (fileX); -M = read (fileM); - -num_records = nrow (X); -Y_predicted = matrix (0, rows = num_records, cols = 1); -num_trees = max (M[2,]); -num_labels = max (M[5,]); -num_nodes_per_tree = aggregate 
(target = t (M[2,]), groups = t (M[2,]), fn = "count"); -num_nodes_per_tree_cum = cumsum (num_nodes_per_tree); - -R_cat = matrix (0, rows = 1, cols = 1); -R_scale = matrix (0, rows = 1, cols = 1); - -if (fileR != " ") { - R = read (fileR); - dummy_coded = ppred (R[,2], R[,3], "!="); - R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows"); - R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows"); -} else { # only scale features available - R_scale = seq (1, ncol (X)); -} - -if (fileC != " ") { - C = read (fileC); - label_counts_oob = matrix (0, rows = num_records, cols = num_labels); -} - -label_counts = matrix (0, rows = num_records, cols = num_labels); -parfor (i in 1:num_records, check = 0) { - cur_sample = X[i,]; - cur_node_pos = 1; - # cur_node = 1; - cur_tree = 1; - start_ind = 1; - labels_found = FALSE; - while (!labels_found) { - - cur_feature = as.scalar (M[4,cur_node_pos]); - type_label = as.scalar (M[5,cur_node_pos]); - if (cur_feature == 0) { # leaf found - label_counts[i,type_label] = label_counts[i,type_label] + 1; - if (fileC != " ") { - if (as.scalar (C[i,cur_tree]) == 0) label_counts_oob[i,type_label] = label_counts_oob[i,type_label] + 1; - } - if (cur_tree < num_trees) { - cur_node_pos = as.scalar (num_nodes_per_tree_cum[cur_tree,]) + 1; - } else if (cur_tree == num_trees) { - labels_found = TRUE; - } - cur_tree = cur_tree + 1; - } else { - # determine type: 1 for scale, 2 for categorical - if (type_label == 1) { # scale feature - cur_start_ind = as.scalar (R_scale[cur_feature,]); - cur_value = as.scalar (cur_sample[,cur_start_ind]); - cur_split = as.scalar (M[7,cur_node_pos]); - if (cur_value < cur_split) { # go to left branch - cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]); - # cur_node = as.scalar (cur_M[1,cur_node_pos]); - } else { # go to right branch - cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1; - # cur_node = as.scalar (cur_M[1,cur_node_pos]); - } - } else if (type_label 
== 2) { # categorical feature - cur_start_ind = as.scalar (R_cat[cur_feature,1]); - cur_end_ind = as.scalar (R_cat[cur_feature,2]); - cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); - cur_offset = as.scalar (M[6,cur_node_pos]); - value_found = sum (ppred (M[7:(7 + cur_offset - 1),cur_node_pos], cur_value, "==")); - if (value_found >= 1) { # go to left branch - cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]); - # cur_node = as.scalar (cur_M[1,cur_node_pos]); - } else { # go to right branch - cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1; - # cur_node = as.scalar (cur_M[1,cur_node_pos]); - } - - } -}}} - -Y_predicted = rowIndexMax (label_counts); -write (Y_predicted, fileP, format = fmtO); - -if (fileY != " ") { - Y_dummy = read (fileY); - num_classes = ncol (Y_dummy); - Y = rowSums (Y_dummy * t (seq (1, num_classes))); - result = ppred (Y, Y_predicted, "=="); - result = sum (result); - accuracy = result / num_records * 100; - acc_str = "Accuracy (%): " + accuracy; - if (fileA != " ") { - write (acc_str, fileA, format = fmtO); - } else { - print (acc_str); - } - if (fileC != " ") { - oob_ind = ppred (rowSums (label_counts_oob), 0, ">") - label_counts_oob = removeEmpty (target = label_counts_oob, margin = "rows"); - num_oob = nrow (label_counts_oob); - Y_predicted_oob = rowIndexMax (label_counts_oob); - Y_oob = removeEmpty (target = Y * oob_ind, margin = "rows"); - result = ppred (Y_oob, Y_predicted_oob, "=="); - oob_error = (1 - (sum (result) / num_oob)) * 100; - oob_str = "Out-Of-Bag error (%): " + oob_error; - if (fileOOB != " ") { - write (oob_str, fileOOB, format = fmtO); - } else { - print (oob_str); - } - } - if (fileCM != " ") { - confusion_mat = table(Y_predicted, Y, num_classes, num_classes) - write(confusion_mat, fileCM, format = fmtO) - } -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A RANDOM FOREST MODEL ON A HELD OUT TEST SET +# OR FOR COMPUTING THE OUT-OF-BAG ERROR ON THE TRAINING SET. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read test feature matrix or training feature matrix for computing Out-Of-Bag error; +# note that X needs to be both recoded and dummy coded +# Y String " " Location to read true label matrix Y if requested; note that Y needs to be both recoded and dummy coded +# R String " " Location to read the matrix R which for each feature in X contains the following information +# - R[,1]: column ids +# - R[,2]: start indices +# - R[,3]: end indices +# If R is not provided by default all variables are assumed to be scale +# M String --- Location to read matrix M containing the learned tree in the following format +# - M[1,j]: id of node j (in a complete binary tree) +# - M[2,j]: tree id +# - M[3,j]: Offset (no.
of columns) to left child of j if j is an internal node, otherwise 0 +# - M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 +# - M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, +# otherwise the label that leaf node j is supposed to predict +# - M[6,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values +# stored in rows 7,8,... if j is categorical +# If j is a leaf node: number of misclassified samples reaching at node j +# - M[7:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[7,j] +# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 7,8,... +# depict the value subset chosen for j +# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 +# C String " " Location to read the counts matrix containing the number of times samples are chosen in each tree of the random forest +# P String --- Location to store the label predictions for X +# A String " " Location to store the test accuracy (%) for the prediction if requested +# OOB String " " If C is provided location to store the Out-Of-Bag (OOB) error of the learned model +# CM String " " Location to store the confusion matrix if requested +# fmt String "text" The output format of the output, such as "text" or "csv" +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# 1- Matrix Y containing the predicted labels for X +# 2- Test accuracy if requested +# 3- Confusion matrix C if requested +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f random-forest-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model 
P=OUTPUT_DIR/predictions +# A=OUTPUT_DIR/accuracy CM=OUTPUT_DIR/confusion fmt=csv + +fileX = $X; +fileM = $M; +fileP = $P; +fileY = ifdef ($Y, " "); +fileR = ifdef ($R, " "); +fileC = ifdef ($C, " "); +fileOOB = ifdef ($OOB, " "); +fileCM = ifdef ($CM, " "); +fileA = ifdef ($A, " "); +fmtO = ifdef ($fmt, "text"); +X = read (fileX); +M = read (fileM); + +num_records = nrow (X); +Y_predicted = matrix (0, rows = num_records, cols = 1); +num_trees = max (M[2,]); +num_labels = max (M[5,]); +num_nodes_per_tree = aggregate (target = t (M[2,]), groups = t (M[2,]), fn = "count"); +num_nodes_per_tree_cum = cumsum (num_nodes_per_tree); + +R_cat = matrix (0, rows = 1, cols = 1); +R_scale = matrix (0, rows = 1, cols = 1); + +if (fileR != " ") { + R = read (fileR); + dummy_coded = ppred (R[,2], R[,3], "!="); + R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows"); + R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows"); +} else { # only scale features available + R_scale = seq (1, ncol (X)); +} + +if (fileC != " ") { + C = read (fileC); + label_counts_oob = matrix (0, rows = num_records, cols = num_labels); +} + +label_counts = matrix (0, rows = num_records, cols = num_labels); +parfor (i in 1:num_records, check = 0) { + cur_sample = X[i,]; + cur_node_pos = 1; + # cur_node = 1; + cur_tree = 1; + start_ind = 1; + labels_found = FALSE; + while (!labels_found) { + + cur_feature = as.scalar (M[4,cur_node_pos]); + type_label = as.scalar (M[5,cur_node_pos]); + if (cur_feature == 0) { # leaf found + label_counts[i,type_label] = label_counts[i,type_label] + 1; + if (fileC != " ") { + if (as.scalar (C[i,cur_tree]) == 0) label_counts_oob[i,type_label] = label_counts_oob[i,type_label] + 1; + } + if (cur_tree < num_trees) { + cur_node_pos = as.scalar (num_nodes_per_tree_cum[cur_tree,]) + 1; + } else if (cur_tree == num_trees) { + labels_found = TRUE; + } + cur_tree = cur_tree + 1; + } else { + # determine type: 1 for scale, 2 for categorical + if
(type_label == 1) { # scale feature + cur_start_ind = as.scalar (R_scale[cur_feature,]); + cur_value = as.scalar (cur_sample[,cur_start_ind]); + cur_split = as.scalar (M[7,cur_node_pos]); + if (cur_value < cur_split) { # go to left branch + cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]); + # cur_node = as.scalar (cur_M[1,cur_node_pos]); + } else { # go to right branch + cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1; + # cur_node = as.scalar (cur_M[1,cur_node_pos]); + } + } else if (type_label == 2) { # categorical feature + cur_start_ind = as.scalar (R_cat[cur_feature,1]); + cur_end_ind = as.scalar (R_cat[cur_feature,2]); + cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); + cur_offset = as.scalar (M[6,cur_node_pos]); + value_found = sum (ppred (M[7:(7 + cur_offset - 1),cur_node_pos], cur_value, "==")); + if (value_found >= 1) { # go to left branch + cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]); + # cur_node = as.scalar (cur_M[1,cur_node_pos]); + } else { # go to right branch + cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1; + # cur_node = as.scalar (cur_M[1,cur_node_pos]); + } + + } +}}} + +Y_predicted = rowIndexMax (label_counts); +write (Y_predicted, fileP, format = fmtO); + +if (fileY != " ") { + Y_dummy = read (fileY); + num_classes = ncol (Y_dummy); + Y = rowSums (Y_dummy * t (seq (1, num_classes))); + result = ppred (Y, Y_predicted, "=="); + result = sum (result); + accuracy = result / num_records * 100; + acc_str = "Accuracy (%): " + accuracy; + if (fileA != " ") { + write (acc_str, fileA, format = fmtO); + } else { + print (acc_str); + } + if (fileC != " ") { + oob_ind = ppred (rowSums (label_counts_oob), 0, ">") + label_counts_oob = removeEmpty (target = label_counts_oob, margin = "rows"); + num_oob = nrow (label_counts_oob); + Y_predicted_oob = rowIndexMax (label_counts_oob); + Y_oob = removeEmpty (target = Y * oob_ind, margin = "rows"); + result = ppred (Y_oob, 
Y_predicted_oob, "=="); + oob_error = (1 - (sum (result) / num_oob)) * 100; + oob_str = "Out-Of-Bag error (%): " + oob_error; + if (fileOOB != " ") { + write (oob_str, fileOOB, format = fmtO); + } else { + print (oob_str); + } + } + if (fileCM != " ") { + confusion_mat = table(Y_predicted, Y, num_classes, num_classes) + write(confusion_mat, fileCM, format = fmtO) + } +}
