http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/l2-svm.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/l2-svm.dml b/scripts/algorithms/l2-svm.dml index 04b83b0..140ac5a 100644 --- a/scripts/algorithms/l2-svm.dml +++ b/scripts/algorithms/l2-svm.dml @@ -1,159 +1,159 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# Implements binary-class SVM with squared slack variables -# -# Example Usage: -# Assume L2SVM_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume epsilon = 0.001, lambda = 1, maxiterations = 100 -# -# hadoop jar SystemML.jar -f $L2SVM_HOME/l2-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y icpt=0 tol=0.001 reg=1 maxiter=100 model=$OUPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" -# -# Note about inputs: -# Assumes that labels (entries in Y) -# are set to either -1 or +1 -# or the result of recoding -# - -cmdLine_fmt = ifdef($fmt, "text") -cmdLine_icpt = ifdef($icpt, 0) -cmdLine_tol = ifdef($tol, 0.001) -cmdLine_reg = ifdef($reg, 1.0) -cmdLine_maxiter = ifdef($maxiter, 100) - -X = read($X) -Y = read($Y) - -if(nrow(X) < 2) - stop("Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows") - -check_min = min(Y) -check_max = max(Y) -num_min = sum(ppred(Y, check_min, "==")) -num_max = sum(ppred(Y, check_max, "==")) - -if(check_min == check_max) - stop("Stopping due to invalid inputs: Y seems to contain exactly one label") - -if(num_min + num_max != nrow(Y)) - stop("Stopping due to invalid inputs: Y seems to contain more than 2 labels") - -if(check_min != -1 | check_max != +1) - Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min) - -positive_label = check_max -negative_label = check_min - -continue = 1 - -intercept = cmdLine_icpt -if(intercept != 0 & intercept != 1) - stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1") - -epsilon = cmdLine_tol -if(epsilon < 0) - stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative") - -lambda = cmdLine_reg -if(lambda < 0) - stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative") - -maxiterations = cmdLine_maxiter 
-if(maxiterations < 1) - stop("Stopping due to invalid argument: Maximum iterations should be a positive integer") - -num_samples = nrow(X) -dimensions = ncol(X) - -if (intercept == 1) { - ones = matrix(1, rows=num_samples, cols=1) - X = append(X, ones); -} - -num_rows_in_w = dimensions -if(intercept == 1){ - num_rows_in_w = num_rows_in_w + 1 -} -w = matrix(0, rows=num_rows_in_w, cols=1) - -g_old = t(X) %*% Y -s = g_old - -Xw = matrix(0, rows=nrow(X), cols=1) -debug_str = "# Iter, Obj" -iter = 0 -while(continue == 1 & iter < maxiterations) { - # minimizing primal obj along direction s - step_sz = 0 - Xd = X %*% s - wd = lambda * sum(w * s) - dd = lambda * sum(s * s) - continue1 = 1 - while(continue1 == 1){ - tmp_Xw = Xw + step_sz*Xd - out = 1 - Y * (tmp_Xw) - sv = ppred(out, 0, ">") - out = out * sv - g = wd + step_sz*dd - sum(out * Y * Xd) - h = dd + sum(Xd * sv * Xd) - step_sz = step_sz - g/h - if (g*g/h < 0.0000000001){ - continue1 = 0 - } - } - - #update weights - w = w + step_sz*s - Xw = Xw + step_sz*Xd - - out = 1 - Y * Xw - sv = ppred(out, 0, ">") - out = sv * out - obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w) - g_new = t(X) %*% (out * Y) - lambda * w - - print("ITER " + iter + ": OBJ=" + obj) - debug_str = append(debug_str, iter + "," + obj) - - tmp = sum(s * g_old) - if(step_sz*tmp < epsilon*obj){ - continue = 0 - } - - #non-linear CG step - be = sum(g_new * g_new)/sum(g_old * g_old) - s = be * s + g_new - g_old = g_new - - iter = iter + 1 -} - -extra_model_params = matrix(0, rows=4, cols=1) -extra_model_params[1,1] = positive_label -extra_model_params[2,1] = negative_label -extra_model_params[3,1] = intercept -extra_model_params[4,1] = dimensions - -w = t(append(t(w), t(extra_model_params))) -write(w, $model, format=cmdLine_fmt) - -write(debug_str, $Log) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# Implements binary-class SVM with squared slack variables +# +# Example Usage: +# Assume L2SVM_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume epsilon = 0.001, lambda = 1, maxiterations = 100 +# +# hadoop jar SystemML.jar -f $L2SVM_HOME/l2-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/Y icpt=0 tol=0.001 reg=1 maxiter=100 model=$OUPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" +# +# Note about inputs: +# Assumes that labels (entries in Y) +# are set to either -1 or +1 +# or the result of recoding +# + +cmdLine_fmt = ifdef($fmt, "text") +cmdLine_icpt = ifdef($icpt, 0) +cmdLine_tol = ifdef($tol, 0.001) +cmdLine_reg = ifdef($reg, 1.0) +cmdLine_maxiter = ifdef($maxiter, 100) + +X = read($X) +Y = read($Y) + +if(nrow(X) < 2) + stop("Stopping due to invalid inputs: Not possible to learn a binary class classifier without at least 2 rows") + +check_min = min(Y) +check_max = max(Y) +num_min = sum(ppred(Y, check_min, "==")) +num_max = sum(ppred(Y, check_max, "==")) + +if(check_min == check_max) + stop("Stopping due to invalid inputs: Y seems to contain exactly one label") + +if(num_min + num_max != nrow(Y)) + stop("Stopping due to invalid inputs: Y seems to 
contain more than 2 labels") + +if(check_min != -1 | check_max != +1) + Y = 2/(check_max - check_min)*Y - (check_min + check_max)/(check_max - check_min) + +positive_label = check_max +negative_label = check_min + +continue = 1 + +intercept = cmdLine_icpt +if(intercept != 0 & intercept != 1) + stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1") + +epsilon = cmdLine_tol +if(epsilon < 0) + stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative") + +lambda = cmdLine_reg +if(lambda < 0) + stop("Stopping due to invalid argument: Regularization constant (reg) must be non-negative") + +maxiterations = cmdLine_maxiter +if(maxiterations < 1) + stop("Stopping due to invalid argument: Maximum iterations should be a positive integer") + +num_samples = nrow(X) +dimensions = ncol(X) + +if (intercept == 1) { + ones = matrix(1, rows=num_samples, cols=1) + X = append(X, ones); +} + +num_rows_in_w = dimensions +if(intercept == 1){ + num_rows_in_w = num_rows_in_w + 1 +} +w = matrix(0, rows=num_rows_in_w, cols=1) + +g_old = t(X) %*% Y +s = g_old + +Xw = matrix(0, rows=nrow(X), cols=1) +debug_str = "# Iter, Obj" +iter = 0 +while(continue == 1 & iter < maxiterations) { + # minimizing primal obj along direction s + step_sz = 0 + Xd = X %*% s + wd = lambda * sum(w * s) + dd = lambda * sum(s * s) + continue1 = 1 + while(continue1 == 1){ + tmp_Xw = Xw + step_sz*Xd + out = 1 - Y * (tmp_Xw) + sv = ppred(out, 0, ">") + out = out * sv + g = wd + step_sz*dd - sum(out * Y * Xd) + h = dd + sum(Xd * sv * Xd) + step_sz = step_sz - g/h + if (g*g/h < 0.0000000001){ + continue1 = 0 + } + } + + #update weights + w = w + step_sz*s + Xw = Xw + step_sz*Xd + + out = 1 - Y * Xw + sv = ppred(out, 0, ">") + out = sv * out + obj = 0.5 * sum(out * out) + lambda/2 * sum(w * w) + g_new = t(X) %*% (out * Y) - lambda * w + + print("ITER " + iter + ": OBJ=" + obj) + debug_str = append(debug_str, iter + "," + obj) + + tmp = sum(s * g_old) + if(step_sz*tmp < 
epsilon*obj){ + continue = 0 + } + + #non-linear CG step + be = sum(g_new * g_new)/sum(g_old * g_old) + s = be * s + g_new + g_old = g_new + + iter = iter + 1 +} + +extra_model_params = matrix(0, rows=4, cols=1) +extra_model_params[1,1] = positive_label +extra_model_params[2,1] = negative_label +extra_model_params[3,1] = intercept +extra_model_params[4,1] = dimensions + +w = t(append(t(w), t(extra_model_params))) +write(w, $model, format=cmdLine_fmt) + +write(debug_str, $Log)
http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/m-svm-predict.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/m-svm-predict.dml b/scripts/algorithms/m-svm-predict.dml index 4d0c736..ba06cf6 100644 --- a/scripts/algorithms/m-svm-predict.dml +++ b/scripts/algorithms/m-svm-predict.dml @@ -1,84 +1,84 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# This script can be used to compute label predictions -# Meant for use with an SVM model (learnt using m-svm.dml) on a held out test set -# -# Given ground truth labels, the script will compute an -# accuracy (%) for the predictions -# -# Example Usage: -# hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=data Y=labels model=model scores=scores accuracy=accuracy confusion=confusion fmt="text" -# - -cmdLine_Y = ifdef($Y, " ") -cmdLine_confusion = ifdef($confusion, " ") -cmdLine_accuracy = ifdef($accuracy, " ") -cmdLine_scores = ifdef($scores, " ") -cmdLine_fmt = ifdef($fmt, "text") - -X = read($X); -W = read($model); - -dimensions = as.scalar(W[nrow(W),1]) -if(dimensions != ncol(X)) - stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions") - -intercept = as.scalar(W[nrow(W)-1,1]) -W = W[1:(nrow(W)-2),] - -N = nrow(X); -num_classes = ncol(W) -m=ncol(X); - -b = matrix(0, rows=1, cols=num_classes) -if (intercept == 1) - b = W[m+1,] - -ones = matrix(1, rows=N, cols=1) -scores = X %*% W[1:m,] + ones %*% b; - -if(cmdLine_scores != " ") - write(scores, cmdLine_scores, format=cmdLine_fmt); - -if(cmdLine_Y != " "){ - y = read(cmdLine_Y); - - if(min(y) < 1) - stop("Stopping due to invalid argument: Label vector (Y) must be recoded") - - pred = rowIndexMax(scores); - correct_percentage = sum(ppred(pred - y, 0, "==")) / N * 100; - - acc_str = "Accuracy (%): " + correct_percentage - print(acc_str) - if(cmdLine_accuracy != " ") - write(acc_str, cmdLine_accuracy) - - num_classes_ground_truth = max(y) - if(num_classes < num_classes_ground_truth) - num_classes = num_classes_ground_truth - - if(cmdLine_confusion != " "){ - confusion_mat = table(pred, y, num_classes, num_classes) - write(confusion_mat, cmdLine_confusion, format="csv") - } -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one 
+# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# This script can be used to compute label predictions +# Meant for use with an SVM model (learnt using m-svm.dml) on a held out test set +# +# Given ground truth labels, the script will compute an +# accuracy (%) for the predictions +# +# Example Usage: +# hadoop jar SystemML.jar -f m-svm-predict.dml -nvargs X=data Y=labels model=model scores=scores accuracy=accuracy confusion=confusion fmt="text" +# + +cmdLine_Y = ifdef($Y, " ") +cmdLine_confusion = ifdef($confusion, " ") +cmdLine_accuracy = ifdef($accuracy, " ") +cmdLine_scores = ifdef($scores, " ") +cmdLine_fmt = ifdef($fmt, "text") + +X = read($X); +W = read($model); + +dimensions = as.scalar(W[nrow(W),1]) +if(dimensions != ncol(X)) + stop("Stopping due to invalid input: Model dimensions do not seem to match input data dimensions") + +intercept = as.scalar(W[nrow(W)-1,1]) +W = W[1:(nrow(W)-2),] + +N = nrow(X); +num_classes = ncol(W) +m=ncol(X); + +b = matrix(0, rows=1, cols=num_classes) +if (intercept == 1) + b = W[m+1,] + +ones = matrix(1, rows=N, cols=1) +scores = X %*% W[1:m,] + ones %*% b; + +if(cmdLine_scores != " ") + write(scores, cmdLine_scores, format=cmdLine_fmt); + +if(cmdLine_Y != " "){ + y = read(cmdLine_Y); + 
+ if(min(y) < 1) + stop("Stopping due to invalid argument: Label vector (Y) must be recoded") + + pred = rowIndexMax(scores); + correct_percentage = sum(ppred(pred - y, 0, "==")) / N * 100; + + acc_str = "Accuracy (%): " + correct_percentage + print(acc_str) + if(cmdLine_accuracy != " ") + write(acc_str, cmdLine_accuracy) + + num_classes_ground_truth = max(y) + if(num_classes < num_classes_ground_truth) + num_classes = num_classes_ground_truth + + if(cmdLine_confusion != " "){ + confusion_mat = table(pred, y, num_classes, num_classes) + write(confusion_mat, cmdLine_confusion, format="csv") + } +} http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/m-svm.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/m-svm.dml b/scripts/algorithms/m-svm.dml index c570872..560d46f 100644 --- a/scripts/algorithms/m-svm.dml +++ b/scripts/algorithms/m-svm.dml @@ -1,174 +1,174 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# Implements multiclass SVM with squared slack variables, -# learns one-against-the-rest binary-class classifiers -# -# Example Usage: -# Assume SVM_HOME is set to the home of the dml script -# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR -# Assume epsilon = 0.001, lambda=1.0, max_iterations = 100 -# -# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" -# - -cmdLine_fmt = ifdef($fmt, "text") -cmdLine_icpt = ifdef($icpt, 0) -cmdLine_tol = ifdef($tol, 0.001) -cmdLine_reg = ifdef($reg, 1.0) -cmdLine_maxiter = ifdef($maxiter, 100) - -print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter) - -X = read($X) - -if(nrow(X) < 2) - stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows") - -dimensions = ncol(X) - -Y = read($Y) - -if(nrow(X) != nrow(Y)) - stop("Stopping due to invalid argument: Numbers of rows in X and Y must match") - -intercept = cmdLine_icpt -if(intercept != 0 & intercept != 1) - stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1") - -min_y = min(Y) -if(min_y < 1) - stop("Stopping due to invalid argument: Label vector (Y) must be recoded") -num_classes = max(Y) -if(num_classes == 1) - stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier") -mod1 = Y %% 1 -mod1_should_be_nrow = sum(abs(ppred(mod1, 0, "=="))) -if(mod1_should_be_nrow != nrow(Y)) - stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels") - -epsilon = cmdLine_tol -if(epsilon < 0) - stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative") - -lambda = cmdLine_reg -if(lambda < 0) - stop("Stopping due to 
invalid argument: Regularization constant (reg) must be non-negative") - -max_iterations = cmdLine_maxiter -if(max_iterations < 1) - stop("Stopping due to invalid argument: Maximum iterations should be a positive integer") - -num_samples = nrow(X) -num_features = ncol(X) - -if (intercept == 1) { - ones = matrix(1, rows=num_samples, cols=1); - X = append(X, ones); -} - -num_rows_in_w = num_features -if(intercept == 1){ - num_rows_in_w = num_rows_in_w + 1 -} -w = matrix(0, rows=num_rows_in_w, cols=num_classes) - -debug_mat = matrix(-1, rows=max_iterations, cols=num_classes) -parfor(iter_class in 1:num_classes){ - Y_local = 2 * ppred(Y, iter_class, "==") - 1 - w_class = matrix(0, rows=num_features, cols=1) - if (intercept == 1) { - zero_matrix = matrix(0, rows=1, cols=1); - w_class = t(append(t(w_class), zero_matrix)); - } - - g_old = t(X) %*% Y_local - s = g_old - - Xw = matrix(0, rows=nrow(X), cols=1) - iter = 0 - continue = 1 - while(continue == 1) { - # minimizing primal obj along direction s - step_sz = 0 - Xd = X %*% s - wd = lambda * sum(w_class * s) - dd = lambda * sum(s * s) - continue1 = 1 - while(continue1 == 1){ - tmp_Xw = Xw + step_sz*Xd - out = 1 - Y_local * (tmp_Xw) - sv = ppred(out, 0, ">") - out = out * sv - g = wd + step_sz*dd - sum(out * Y_local * Xd) - h = dd + sum(Xd * sv * Xd) - step_sz = step_sz - g/h - if (g*g/h < 0.0000000001){ - continue1 = 0 - } - } - - #update weights - w_class = w_class + step_sz*s - Xw = Xw + step_sz*Xd - - out = 1 - Y_local * Xw - sv = ppred(out, 0, ">") - out = sv * out - obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class) - g_new = t(X) %*% (out * Y_local) - lambda * w_class - - tmp = sum(s * g_old) - - train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100 - print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc) - debug_mat[iter+1,iter_class] = obj - - if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){ - continue = 0 - } - - #non-linear CG 
step - be = sum(g_new * g_new)/sum(g_old * g_old) - s = be * s + g_new - g_old = g_new - - iter = iter + 1 - } - - w[,iter_class] = w_class -} - -extra_model_params = matrix(0, rows=2, cols=ncol(w)) -extra_model_params[1, 1] = intercept -extra_model_params[2, 1] = dimensions -w = t(append(t(w), t(extra_model_params))) -write(w, $model, format=cmdLine_fmt) - -debug_str = "# Class, Iter, Obj" -for(iter_class in 1:ncol(debug_mat)){ - for(iter in 1:nrow(debug_mat)){ - obj = castAsScalar(debug_mat[iter, iter_class]) - if(obj != -1) - debug_str = append(debug_str, iter_class + "," + iter + "," + obj) - } -} -write(debug_str, $Log) +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# +#------------------------------------------------------------- + +# Implements multiclass SVM with squared slack variables, +# learns one-against-the-rest binary-class classifiers +# +# Example Usage: +# Assume SVM_HOME is set to the home of the dml script +# Assume input and output directories are on hdfs as INPUT_DIR and OUTPUT_DIR +# Assume epsilon = 0.001, lambda=1.0, max_iterations = 100 +# +# hadoop jar SystemML.jar -f $SVM_HOME/m-svm.dml -nvargs X=$INPUT_DIR/X Y=$INPUT_DIR/y icpt=intercept tol=.001 reg=1.0 maxiter=100 model=$OUTPUT_DIR/w Log=$OUTPUT_DIR/Log fmt="text" +# + +cmdLine_fmt = ifdef($fmt, "text") +cmdLine_icpt = ifdef($icpt, 0) +cmdLine_tol = ifdef($tol, 0.001) +cmdLine_reg = ifdef($reg, 1.0) +cmdLine_maxiter = ifdef($maxiter, 100) + +print("icpt=" + cmdLine_icpt + " tol=" + cmdLine_tol + " reg=" + cmdLine_reg + " maxiter=" + cmdLine_maxiter) + +X = read($X) + +if(nrow(X) < 2) + stop("Stopping due to invalid inputs: Not possible to learn a classifier without at least 2 rows") + +dimensions = ncol(X) + +Y = read($Y) + +if(nrow(X) != nrow(Y)) + stop("Stopping due to invalid argument: Numbers of rows in X and Y must match") + +intercept = cmdLine_icpt +if(intercept != 0 & intercept != 1) + stop("Stopping due to invalid argument: Currently supported intercept options are 0 and 1") + +min_y = min(Y) +if(min_y < 1) + stop("Stopping due to invalid argument: Label vector (Y) must be recoded") +num_classes = max(Y) +if(num_classes == 1) + stop("Stopping due to invalid argument: Maximum label value is 1, need more than one class to learn a multi-class classifier") +mod1 = Y %% 1 +mod1_should_be_nrow = sum(abs(ppred(mod1, 0, "=="))) +if(mod1_should_be_nrow != nrow(Y)) + stop("Stopping due to invalid argument: Please ensure that Y contains (positive) integral labels") + +epsilon = cmdLine_tol +if(epsilon < 0) + stop("Stopping due to invalid argument: Tolerance (tol) must be non-negative") + +lambda = cmdLine_reg +if(lambda < 0) + stop("Stopping due to 
invalid argument: Regularization constant (reg) must be non-negative") + +max_iterations = cmdLine_maxiter +if(max_iterations < 1) + stop("Stopping due to invalid argument: Maximum iterations should be a positive integer") + +num_samples = nrow(X) +num_features = ncol(X) + +if (intercept == 1) { + ones = matrix(1, rows=num_samples, cols=1); + X = append(X, ones); +} + +num_rows_in_w = num_features +if(intercept == 1){ + num_rows_in_w = num_rows_in_w + 1 +} +w = matrix(0, rows=num_rows_in_w, cols=num_classes) + +debug_mat = matrix(-1, rows=max_iterations, cols=num_classes) +parfor(iter_class in 1:num_classes){ + Y_local = 2 * ppred(Y, iter_class, "==") - 1 + w_class = matrix(0, rows=num_features, cols=1) + if (intercept == 1) { + zero_matrix = matrix(0, rows=1, cols=1); + w_class = t(append(t(w_class), zero_matrix)); + } + + g_old = t(X) %*% Y_local + s = g_old + + Xw = matrix(0, rows=nrow(X), cols=1) + iter = 0 + continue = 1 + while(continue == 1) { + # minimizing primal obj along direction s + step_sz = 0 + Xd = X %*% s + wd = lambda * sum(w_class * s) + dd = lambda * sum(s * s) + continue1 = 1 + while(continue1 == 1){ + tmp_Xw = Xw + step_sz*Xd + out = 1 - Y_local * (tmp_Xw) + sv = ppred(out, 0, ">") + out = out * sv + g = wd + step_sz*dd - sum(out * Y_local * Xd) + h = dd + sum(Xd * sv * Xd) + step_sz = step_sz - g/h + if (g*g/h < 0.0000000001){ + continue1 = 0 + } + } + + #update weights + w_class = w_class + step_sz*s + Xw = Xw + step_sz*Xd + + out = 1 - Y_local * Xw + sv = ppred(out, 0, ">") + out = sv * out + obj = 0.5 * sum(out * out) + lambda/2 * sum(w_class * w_class) + g_new = t(X) %*% (out * Y_local) - lambda * w_class + + tmp = sum(s * g_old) + + train_acc = sum(ppred(Y_local*(X%*%w_class), 0, ">="))/num_samples*100 + print("For class " + iter_class + " iteration " + iter + " training accuracy: " + train_acc) + debug_mat[iter+1,iter_class] = obj + + if((step_sz*tmp < epsilon*obj) | (iter >= max_iterations-1)){ + continue = 0 + } + + #non-linear CG 
step + be = sum(g_new * g_new)/sum(g_old * g_old) + s = be * s + g_new + g_old = g_new + + iter = iter + 1 + } + + w[,iter_class] = w_class +} + +extra_model_params = matrix(0, rows=2, cols=ncol(w)) +extra_model_params[1, 1] = intercept +extra_model_params[2, 1] = dimensions +w = t(append(t(w), t(extra_model_params))) +write(w, $model, format=cmdLine_fmt) + +debug_str = "# Class, Iter, Obj" +for(iter_class in 1:ncol(debug_mat)){ + for(iter in 1:nrow(debug_mat)){ + obj = castAsScalar(debug_mat[iter, iter_class]) + if(obj != -1) + debug_str = append(debug_str, iter_class + "," + iter + "," + obj) + } +} +write(debug_str, $Log) http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/random-forest-predict.dml ---------------------------------------------------------------------- diff --git a/scripts/algorithms/random-forest-predict.dml b/scripts/algorithms/random-forest-predict.dml index 2d99670..7bc6cd6 100644 --- a/scripts/algorithms/random-forest-predict.dml +++ b/scripts/algorithms/random-forest-predict.dml @@ -1,193 +1,193 @@ -#------------------------------------------------------------- -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# -#------------------------------------------------------------- - -# -# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A RANDOM FOREST MODEL ON A HELD OUT TEST SET -# OR FOR COMPUTING THE OUT-OF-BAG ERROR ON THE TRAINING SET. -# -# INPUT PARAMETERS: -# --------------------------------------------------------------------------------------------- -# NAME TYPE DEFAULT MEANING -# --------------------------------------------------------------------------------------------- -# X String --- Location to read test feature matrix or training feature matrix for computing Out-Of-Bag error; -# note that X needs to be both recoded and dummy coded -# Y String " " Location to read true label matrix Y if requested; note that Y needs to be both recoded and dummy coded -# R String " " Location to read the matrix R which for each feature in X contains the following information -# - R[,1]: column ids -# - R[,2]: start indices -# - R[,3]: end indices -# If R is not provided by default all variables are assumed to be scale -# M String --- Location to read matrix M containing the learned tree i the following format -# - M[1,j]: id of node j (in a complete binary tree) -# - M[2,j]: tree id -# - M[3,j]: Offset (no. of columns) to left child of j if j is an internal node, otherwise 0 -# - M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 -# - M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, -# otherwise the label that leaf node j is supposed to predict -# - M[6,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values -# stored in rows 7,8,... 
if j is categorical -# If j is a leaf node: number of misclassified samples reaching at node j -# - M[7:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[7,j] -# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 7,8,... -# depict the value subset chosen for j -# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 -# C String " " Location to read the counts matrix containing the number of times samples are chosen in each tree of the random forest -# P String --- Location to store the label predictions for X -# A String " " Location to store the test accuracy (%) for the prediction if requested -# OOB String " " If C is provided location to store the Out-Of-Bag (OOB) error of the learned model -# CM String " " Location to store the confusion matrix if requested -# fmt String "text" The output format of the output, such as "text" or "csv" -# --------------------------------------------------------------------------------------------- -# OUTPUT: -# 1- Matrix Y containing the predicted labels for X -# 2- Test accuracy if requested -# 3- Confusion matrix C if requested -# ------------------------------------------------------------------------------------------- -# HOW TO INVOKE THIS SCRIPT - EXAMPLE: -# hadoop jar SystemML.jar -f random-forest-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model P=OUTPUT_DIR/predictions -# A=OUTPUT_DIR/accurcay CM=OUTPUT_DIR/confusion fmt=csv - -fileX = $X; -fileM = $M; -fileP = $P; -fileY = ifdef ($Y, " "); -fileR = ifdef ($R, " "); -fileC = ifdef ($C, " "); -fileOOB = ifdef ($OOB, " "); -fileCM = ifdef ($CM, " "); -fileA = ifdef ($A, " "); -fmtO = ifdef ($fmt, "text"); -X = read (fileX); -M = read (fileM); - -num_records = nrow (X); -Y_predicted = matrix (0, rows = num_records, cols = 1); -num_trees = max (M[2,]); -num_labels = max (M[5,]); -num_nodes_per_tree = aggregate 
(target = t (M[2,]), groups = t (M[2,]), fn = "count"); -num_nodes_per_tree_cum = cumsum (num_nodes_per_tree); - -R_cat = matrix (0, rows = 1, cols = 1); -R_scale = matrix (0, rows = 1, cols = 1); - -if (fileR != " ") { - R = read (fileR); - dummy_coded = ppred (R[,2], R[,3], "!="); - R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows"); - R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows"); -} else { # only scale features available - R_scale = seq (1, ncol (X)); -} - -if (fileC != " ") { - C = read (fileC); - label_counts_oob = matrix (0, rows = num_records, cols = num_labels); -} - -label_counts = matrix (0, rows = num_records, cols = num_labels); -parfor (i in 1:num_records, check = 0) { - cur_sample = X[i,]; - cur_node_pos = 1; - # cur_node = 1; - cur_tree = 1; - start_ind = 1; - labels_found = FALSE; - while (!labels_found) { - - cur_feature = as.scalar (M[4,cur_node_pos]); - type_label = as.scalar (M[5,cur_node_pos]); - if (cur_feature == 0) { # leaf found - label_counts[i,type_label] = label_counts[i,type_label] + 1; - if (fileC != " ") { - if (as.scalar (C[i,cur_tree]) == 0) label_counts_oob[i,type_label] = label_counts_oob[i,type_label] + 1; - } - if (cur_tree < num_trees) { - cur_node_pos = as.scalar (num_nodes_per_tree_cum[cur_tree,]) + 1; - } else if (cur_tree == num_trees) { - labels_found = TRUE; - } - cur_tree = cur_tree + 1; - } else { - # determine type: 1 for scale, 2 for categorical - if (type_label == 1) { # scale feature - cur_start_ind = as.scalar (R_scale[cur_feature,]); - cur_value = as.scalar (cur_sample[,cur_start_ind]); - cur_split = as.scalar (M[7,cur_node_pos]); - if (cur_value < cur_split) { # go to left branch - cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]); - # cur_node = as.scalar (cur_M[1,cur_node_pos]); - } else { # go to right branch - cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1; - # cur_node = as.scalar (cur_M[1,cur_node_pos]); - } - } else if (type_label 
== 2) { # categorical feature - cur_start_ind = as.scalar (R_cat[cur_feature,1]); - cur_end_ind = as.scalar (R_cat[cur_feature,2]); - cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); - cur_offset = as.scalar (M[6,cur_node_pos]); - value_found = sum (ppred (M[7:(7 + cur_offset - 1),cur_node_pos], cur_value, "==")); - if (value_found >= 1) { # go to left branch - cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]); - # cur_node = as.scalar (cur_M[1,cur_node_pos]); - } else { # go to right branch - cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1; - # cur_node = as.scalar (cur_M[1,cur_node_pos]); - } - - } -}}} - -Y_predicted = rowIndexMax (label_counts); -write (Y_predicted, fileP, format = fmtO); - -if (fileY != " ") { - Y_dummy = read (fileY); - num_classes = ncol (Y_dummy); - Y = rowSums (Y_dummy * t (seq (1, num_classes))); - result = ppred (Y, Y_predicted, "=="); - result = sum (result); - accuracy = result / num_records * 100; - acc_str = "Accuracy (%): " + accuracy; - if (fileA != " ") { - write (acc_str, fileA, format = fmtO); - } else { - print (acc_str); - } - if (fileC != " ") { - oob_ind = ppred (rowSums (label_counts_oob), 0, ">") - label_counts_oob = removeEmpty (target = label_counts_oob, margin = "rows"); - num_oob = nrow (label_counts_oob); - Y_predicted_oob = rowIndexMax (label_counts_oob); - Y_oob = removeEmpty (target = Y * oob_ind, margin = "rows"); - result = ppred (Y_oob, Y_predicted_oob, "=="); - oob_error = (1 - (sum (result) / num_oob)) * 100; - oob_str = "Out-Of-Bag error (%): " + oob_error; - if (fileOOB != " ") { - write (oob_str, fileOOB, format = fmtO); - } else { - print (oob_str); - } - } - if (fileCM != " ") { - confusion_mat = table(Y_predicted, Y, num_classes, num_classes) - write(confusion_mat, fileCM, format = fmtO) - } -} +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license 
agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT COMPUTES LABEL PREDICTIONS MEANT FOR USE WITH A RANDOM FOREST MODEL ON A HELD OUT TEST SET +# OR FOR COMPUTING THE OUT-OF-BAG ERROR ON THE TRAINING SET. +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read test feature matrix or training feature matrix for computing Out-Of-Bag error; +# note that X needs to be both recoded and dummy coded +# Y String " " Location to read true label matrix Y if requested; note that Y needs to be both recoded and dummy coded +# R String " " Location to read the matrix R which for each feature in X contains the following information +# - R[,1]: column ids +# - R[,2]: start indices +# - R[,3]: end indices +# If R is not provided by default all variables are assumed to be scale +# M String --- Location to read matrix M containing the learned tree in the following format +# - M[1,j]: id of node j (in a complete binary tree) +# - M[2,j]: tree id +# - M[3,j]: Offset (no.
of columns) to left child of j if j is an internal node, otherwise 0 +# - M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 +# - M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, +# otherwise the label that leaf node j is supposed to predict +# - M[6,j]: If j is an internal node: 1 if the feature chosen for j is scale, otherwise the size of the subset of values +# stored in rows 7,8,... if j is categorical +# If j is a leaf node: number of misclassified samples reaching at node j +# - M[7:,j]: If j is an internal node: Threshold the example's feature value is compared to is stored at M[7,j] +# if the feature chosen for j is scale, otherwise if the feature chosen for j is categorical rows 7,8,... +# depict the value subset chosen for j +# If j is a leaf node 1 if j is impure and the number of samples at j > threshold, otherwise 0 +# C String " " Location to read the counts matrix containing the number of times samples are chosen in each tree of the random forest +# P String --- Location to store the label predictions for X +# A String " " Location to store the test accuracy (%) for the prediction if requested +# OOB String " " If C is provided location to store the Out-Of-Bag (OOB) error of the learned model +# CM String " " Location to store the confusion matrix if requested +# fmt String "text" The output format of the output, such as "text" or "csv" +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# 1- Matrix Y containing the predicted labels for X +# 2- Test accuracy if requested +# 3- Confusion matrix C if requested +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f random-forest-predict.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=INPUT_DIR/model 
P=OUTPUT_DIR/predictions +# A=OUTPUT_DIR/accuracy CM=OUTPUT_DIR/confusion fmt=csv + +fileX = $X; +fileM = $M; +fileP = $P; +fileY = ifdef ($Y, " "); +fileR = ifdef ($R, " "); +fileC = ifdef ($C, " "); +fileOOB = ifdef ($OOB, " "); +fileCM = ifdef ($CM, " "); +fileA = ifdef ($A, " "); +fmtO = ifdef ($fmt, "text"); +X = read (fileX); +M = read (fileM); + +num_records = nrow (X); +Y_predicted = matrix (0, rows = num_records, cols = 1); +num_trees = max (M[2,]); +num_labels = max (M[5,]); +num_nodes_per_tree = aggregate (target = t (M[2,]), groups = t (M[2,]), fn = "count"); +num_nodes_per_tree_cum = cumsum (num_nodes_per_tree); + +R_cat = matrix (0, rows = 1, cols = 1); +R_scale = matrix (0, rows = 1, cols = 1); + +if (fileR != " ") { + R = read (fileR); + dummy_coded = ppred (R[,2], R[,3], "!="); + R_scale = removeEmpty (target = R[,2] * (1 - dummy_coded), margin = "rows"); + R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows"); +} else { # only scale features available + R_scale = seq (1, ncol (X)); +} + +if (fileC != " ") { + C = read (fileC); + label_counts_oob = matrix (0, rows = num_records, cols = num_labels); +} + +label_counts = matrix (0, rows = num_records, cols = num_labels); +parfor (i in 1:num_records, check = 0) { + cur_sample = X[i,]; + cur_node_pos = 1; + # cur_node = 1; + cur_tree = 1; + start_ind = 1; + labels_found = FALSE; + while (!labels_found) { + + cur_feature = as.scalar (M[4,cur_node_pos]); + type_label = as.scalar (M[5,cur_node_pos]); + if (cur_feature == 0) { # leaf found + label_counts[i,type_label] = label_counts[i,type_label] + 1; + if (fileC != " ") { + if (as.scalar (C[i,cur_tree]) == 0) label_counts_oob[i,type_label] = label_counts_oob[i,type_label] + 1; + } + if (cur_tree < num_trees) { + cur_node_pos = as.scalar (num_nodes_per_tree_cum[cur_tree,]) + 1; + } else if (cur_tree == num_trees) { + labels_found = TRUE; + } + cur_tree = cur_tree + 1; + } else { + # determine type: 1 for scale, 2 for categorical + if
(type_label == 1) { # scale feature + cur_start_ind = as.scalar (R_scale[cur_feature,]); + cur_value = as.scalar (cur_sample[,cur_start_ind]); + cur_split = as.scalar (M[7,cur_node_pos]); + if (cur_value < cur_split) { # go to left branch + cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]); + # cur_node = as.scalar (cur_M[1,cur_node_pos]); + } else { # go to right branch + cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1; + # cur_node = as.scalar (cur_M[1,cur_node_pos]); + } + } else if (type_label == 2) { # categorical feature + cur_start_ind = as.scalar (R_cat[cur_feature,1]); + cur_end_ind = as.scalar (R_cat[cur_feature,2]); + cur_value = as.scalar (rowIndexMax(cur_sample[,cur_start_ind:cur_end_ind])); + cur_offset = as.scalar (M[6,cur_node_pos]); + value_found = sum (ppred (M[7:(7 + cur_offset - 1),cur_node_pos], cur_value, "==")); + if (value_found >= 1) { # go to left branch + cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]); + # cur_node = as.scalar (cur_M[1,cur_node_pos]); + } else { # go to right branch + cur_node_pos = cur_node_pos + as.scalar (M[3,cur_node_pos]) + 1; + # cur_node = as.scalar (cur_M[1,cur_node_pos]); + } + + } +}}} + +Y_predicted = rowIndexMax (label_counts); +write (Y_predicted, fileP, format = fmtO); + +if (fileY != " ") { + Y_dummy = read (fileY); + num_classes = ncol (Y_dummy); + Y = rowSums (Y_dummy * t (seq (1, num_classes))); + result = ppred (Y, Y_predicted, "=="); + result = sum (result); + accuracy = result / num_records * 100; + acc_str = "Accuracy (%): " + accuracy; + if (fileA != " ") { + write (acc_str, fileA, format = fmtO); + } else { + print (acc_str); + } + if (fileC != " ") { + oob_ind = ppred (rowSums (label_counts_oob), 0, ">") + label_counts_oob = removeEmpty (target = label_counts_oob, margin = "rows"); + num_oob = nrow (label_counts_oob); + Y_predicted_oob = rowIndexMax (label_counts_oob); + Y_oob = removeEmpty (target = Y * oob_ind, margin = "rows"); + result = ppred (Y_oob, 
Y_predicted_oob, "=="); + oob_error = (1 - (sum (result) / num_oob)) * 100; + oob_str = "Out-Of-Bag error (%): " + oob_error; + if (fileOOB != " ") { + write (oob_str, fileOOB, format = fmtO); + } else { + print (oob_str); + } + } + if (fileCM != " ") { + confusion_mat = table(Y_predicted, Y, num_classes, num_classes) + write(confusion_mat, fileCM, format = fmtO) + } +}
