This is an automated email from the ASF dual-hosted git repository.
mboehm7 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/systemds.git
The following commit(s) were added to refs/heads/master by this push:
new 968f45a [SYSTEMDS-2844] Initial builtin function and cleanup randomForest
968f45a is described below
commit 968f45ac927c84455c76551b734cc2e56a4307ea
Author: Christof Jahn <[email protected]>
AuthorDate: Sun Apr 4 00:50:22 2021 +0200
[SYSTEMDS-2844] Initial builtin function and cleanup randomForest
DIA project WS2020/21
Closes #1204.
Co-authored-by: Maximilian Kammerer <[email protected]>
Co-authored-by: Philipp Diethard <[email protected]>
---
scripts/builtin/randomForest.dml | 1286 ++++++++++++++++++++
.../java/org/apache/sysds/common/Builtins.java | 1 +
.../functions/builtin/BuiltinRandomForestTest.java | 114 ++
.../scripts/functions/builtin/randomForest.dml | 34 +
4 files changed, 1435 insertions(+)
diff --git a/scripts/builtin/randomForest.dml b/scripts/builtin/randomForest.dml
new file mode 100644
index 0000000..3975fbe
--- /dev/null
+++ b/scripts/builtin/randomForest.dml
@@ -0,0 +1,1286 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#
+# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME            TYPE    DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X               Matrix  ---      Feature matrix X; note that X needs to be both recoded and dummy coded
+# Y               Matrix  ---      Label matrix Y; note that Y needs to be both recoded and dummy coded
+# R               Matrix  " "      Matrix which for each feature in X contains the following information
+#                                  - R[,1]: column ids TODO pass recoded and binned
+#                                  - R[,2]: start indices
+#                                  - R[,3]: end indices
+#                                  If R is not provided, by default all variables are assumed to be scale
+# bins            Int     20       Number of equi-height bins per scale feature to choose thresholds
+# depth           Int     25       Maximum depth of the learned tree
+# num_leaf        Int     10       Number of samples when splitting stops and a leaf node is added
+# num_samples     Int     3000     Number of samples at which point we switch to in-memory subtree building
+# num_trees       Int     10       Number of trees to be learned in the random forest model
+# subsamp_rate    Double  1.0      Parameter controlling the size of each tree in the forest; samples are selected from a
+#                                  Poisson distribution with parameter subsamp_rate (the default value is 1.0)
+# feature_subset  Double  0.5      Parameter that controls the number of features used as candidates for splitting at each
+#                                  tree node, as a power of the number of features in the dataset;
+#                                  by default the square root of the number of features (i.e., feature_subset = 0.5) is used
+#                                  at each tree node
+# impurity        String  "Gini"   Impurity measure: entropy or Gini (the default)
+# ---------------------------------------------------------------------------------------------
+
+# OUTPUT          TYPE    MEANING
+# ---------------------------------------------------------------------------------------------
+# M               Matrix  Matrix M containing the learned tree, where each column corresponds to a node
+#                         in the learned tree and each row contains the following information:
+#                         M[1,j]: id of node j (in a complete binary tree)
+#                         M[2,j]: tree id to which node j belongs
+#                         M[3,j]: Offset (no. of columns) to left child of j
+#                         M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
+#                         M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and
+#                                 2 for categorical features, otherwise the label that leaf node j is supposed to predict
+#                         M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the size of
+#                                 the subset of values stored in rows 7,8,... if j is categorical
+#                         M[7:,j]: Only applicable for internal nodes. The threshold the example's feature value is compared
+#                                  to is stored at M[7,j] if the feature chosen for j is scale; if the feature chosen for j
+#                                  is categorical, rows 7,8,... depict the value subset chosen for j
+# C               Matrix  Matrix C containing the number of times samples are chosen in each tree of the random forest
+# S_map           Matrix  Mappings from scale feature ids to global feature ids
+# C_map           Matrix  Mappings from categorical feature ids to global feature ids
+# ---------------------------------------------------------------------------------------------
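+#
+# Example usage (illustrative sketch only; the file names and the prior recoding/dummy coding
+# of X, Y, and R are assumptions, not part of this script):
+#   X = read("X.csv");   # recoded and dummy-coded features
+#   Y = read("Y.csv");   # dummy-coded labels
+#   R = read("R.csv");   # per-feature column id / start index / end index information
+#   [M, C, S_map, C_map] = randomForest(X=X, Y=Y, R=R, num_trees=10, impurity="Gini");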
+
+m_randomForest = function(Matrix[Double] X, Matrix[Double] Y, Matrix[Double] R,
+  Integer bins = 20, Integer depth = 25, Integer num_leaf = 10, Integer num_samples = 3000,
+  Integer num_trees = 1, Double subsamp_rate = 1.0, Double feature_subset = 0.5, String impurity = "Gini")
+  return(Matrix[Double] M, Matrix[Double] C, Matrix[Double] S_map, Matrix[Double] C_map)
+{
+ print("started random-forest ...")
+
+ num_bins = bins;
+ threshold = num_samples;
+ imp = impurity;
+ fpow = feature_subset;
+ rate = subsamp_rate;
+
+ M = matrix(0, rows = 10, cols = 10);
+
+ Y_bin = Y;
+ num_records = nrow (X);
+ num_classes = ncol (Y_bin);
+
+ # check if there is only one class label
+ Y_bin_sum = sum (colSums (Y_bin) == num_records);
+ if (Y_bin_sum == 1)
+ stop ("Y contains only one class label. No model will be learned!");
+ else if (Y_bin_sum > 1)
+    stop ("Y is not properly dummy coded. Multiple columns of Y contain only ones!")
+
+ # split data into X_scale and X_cat
+
+ if (length(R) != 0) {
+ R_tmp = order (target = R, by = 2); # sort by start indices
+ dummy_coded = (R_tmp[,2] != R_tmp[,3]);
+ R_scale = removeEmpty (target = R_tmp[,2:3] * (1 - dummy_coded), margin =
"rows");
+ R_cat = removeEmpty (target = R_tmp[,2:3] * dummy_coded, margin = "rows");
+ S_map = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R_tmp)),
margin = "rows");
+ C_map = removeEmpty (target = dummy_coded * seq (1, nrow (R_tmp)), margin
= "rows");
+
+ sum_dummy = sum (dummy_coded);
+ if (sum_dummy == nrow (R_tmp)) { # all features categorical
+ print ("All features categorical");
+ num_cat_features = nrow (R_cat);
+ num_scale_features = 0;
+ X_cat = X;
+ distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
+ distinct_values_max = max (distinct_values);
+ distinct_values_offset = cumsum (t (distinct_values));
+ distinct_values_overall = sum (distinct_values);
+ } else if (sum_dummy == 0) { # all features scale
+ print ("All features scale");
+ num_scale_features = ncol (X);
+ num_cat_features = 0;
+ X_scale = X;
+ distinct_values_max = 1;
+ } else { # some features scale some features categorical
+ num_cat_features = nrow (R_cat);
+ num_scale_features = nrow (R_scale);
+ distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
+ distinct_values_max = max (distinct_values);
+ distinct_values_offset = cumsum (t (distinct_values));
+ distinct_values_overall = sum (distinct_values);
+
+ W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1",
rows = 1, cols = 2);
+ W = matrix (W, rows = 2 * num_cat_features, cols = 1);
+ if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X))
+ W[2 * num_cat_features,] = 0;
+
+ last = (R_cat[,2] != ncol (X));
+ R_cat1 = (R_cat[,2] + 1) * last;
+ R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1;
+ R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);
+
+ col_tab = table (R_cat_vec, 1, W, ncol (X), 1);
+ col_ind = cumsum (col_tab);
+
+ col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin
= "rows");
+ col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)),
margin = "rows");
+ X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X),
nrow (col_ind_cat));
+ X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)),
ncol (X), nrow (col_ind_scale));
+ }
+ } else { # only scale features exist
+ print ("All features scale");
+ num_scale_features = ncol (X);
+ num_cat_features = 0;
+ X_scale = X;
+ distinct_values_max = 1;
+ }
+
+ if (num_scale_features > 0) {
+ print ("COMPUTING BINNING...");
+ bin_size = max (as.integer (num_records / num_bins), 1);
+ count_thresholds = matrix (0, rows = 1, cols = num_scale_features)
+ thresholds = matrix (0, rows = num_bins+1, cols = num_scale_features)
+ parfor(i1 in 1:num_scale_features) {
+ #approximate equi-height binning
+ bbin = seq(0, 1, 1/num_bins);
+ col_bins = quantile(X_scale[,i1], bbin)
+ count_thresholds[,i1] = num_bins; #TODO probe empty
+ thresholds[,i1] = col_bins;
+ }
+
+ print ("PREPROCESSING SCALE FEATURE MATRIX...");
+ min_num_bins = min (count_thresholds);
+ max_num_bins = max (count_thresholds);
+ total_num_bins = sum (count_thresholds);
+ cum_count_thresholds = t (cumsum (t (count_thresholds)));
+ X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
+ parfor (i2 in 1:num_scale_features, check = 0) {
+ Xi2 = X_scale[,i2];
+ count_threshold = as.scalar (count_thresholds[,i2]);
+ offset_feature = 1;
+ if (i2 > 1)
+ offset_feature = offset_feature + as.integer (as.scalar
(cum_count_thresholds[, (i2 - 1)]));
+ ti2 = t(thresholds[1:count_threshold, i2]);
+ X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] =
outer (Xi2, ti2, "<");
+ }
+ }
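+    # illustrative sketch of the expansion above (assumed toy values): with thresholds t = (2, 5, 8)
+    # for one scale feature, a sample value of 4 yields the indicator row outer(4, t, "<") = (0, 1, 1),
+    # i.e. one 0/1 column per bin threshold the value lies below; X_scale_ext stacks these blocks per feature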
+
+ num_features_total = num_scale_features + num_cat_features;
+ num_feature_samples = as.integer (floor (num_features_total ^ fpow));
+
+  #-------------------------------------------------------------------------------------
+
+ ##### INITIALIZATION
+  L = matrix (1, rows = num_records, cols = num_trees); # last visited node id for each training sample
+
+  # create matrix of counts (generated by Poisson distribution) storing how many times each sample appears in each tree
+  print ("COMPUTING COUNTS...");
+  C = rand (rows = num_records, cols = num_trees, pdf = "poisson", lambda = rate);
+ Ix_nonzero = (C != 0);
+ L = L * Ix_nonzero;
+ total_counts = sum (C);
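+  # note: with the default rate (lambda = 1.0) each tree effectively sees a Poisson-weighted bootstrap;
+  # any given sample is absent from a given tree with probability exp(-lambda), i.e. roughly 37% of
+  # samples drop out of each tree for the default rate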
+
+ # model
+ # LARGE leaf nodes
+ # NC_large[,1]: node id
+ # NC_large[,2]: tree id
+ # NC_large[,3]: class label
+ # NC_large[,4]: no. of misclassified samples
+  # NC_large[,5]: 1 if special leaf (impure and no. of samples at that leaf > threshold) or 0 otherwise
+ NC_large = matrix (0, rows = 5, cols = 1);
+
+ # SMALL leaf nodes
+ # same schema as for LARGE leaf nodes (to be initialized)
+ NC_small = matrix (0, rows = 5, cols = 1);
+
+ # LARGE internal nodes
+ # Q_large[,1]: node id
+ # Q_large[,2]: tree id
+ Q_large = matrix (0, rows = 2, cols = num_trees);
+ Q_large[1,] = matrix (1, rows = 1, cols = num_trees);
+ Q_large[2,] = t (seq (1, num_trees));
+
+ # SMALL internal nodes
+ # same schema as for LARGE internal nodes (to be initialized)
+ Q_small = matrix (0, rows = 2, cols = 1);
+
+ # F_large[,1]: feature
+ # F_large[,2]: type
+ # F_large[,3]: offset
+ F_large = matrix (0, rows = 3, cols = 1);
+
+ # same schema as for LARGE nodes
+ F_small = matrix (0, rows = 3, cols = 1);
+
+ # split points for LARGE internal nodes
+ S_large = matrix (0, rows = 1, cols = 1);
+
+ # split points for SMALL internal nodes
+ S_small = matrix (0, rows = 1, cols = 1);
+
+ # initialize queue
+ cur_nodes_large = matrix (1, rows = 2, cols = num_trees);
+ cur_nodes_large[2,] = t (seq (1, num_trees));
+
+ num_cur_nodes_large = num_trees;
+ num_cur_nodes_small = 0;
+ level = 0;
+
+ while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) {
+ level = level + 1;
+ print (" --- start level " + level + " --- ");
+
+ ##### PREPARE MODEL
+ if (num_cur_nodes_large > 0) { # LARGE nodes to process
+ cur_Q_large = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large);
+ cur_NC_large = matrix (0, rows = 5, cols = 2 * num_cur_nodes_large);
+ cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large);
+ cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large *
distinct_values_max);
+ cur_nodes_small = matrix (0, rows = 3, cols = 2 * num_cur_nodes_large);
+ }
+
+ ##### LOOP OVER LARGE NODES...
+ parfor (i6 in 1:num_cur_nodes_large, check = 0) {
+ cur_node = as.scalar (cur_nodes_large[1,i6]);
+ cur_tree = as.scalar (cur_nodes_large[2,i6]);
+
+ # select sample features WOR
+ feature_samples = sample (num_features_total, num_feature_samples);
+ feature_samples = order (target = feature_samples, by = 1);
+ num_scale_feature_samples = sum (feature_samples <= num_scale_features);
+ num_cat_feature_samples = num_feature_samples -
num_scale_feature_samples;
+
+ # --- find best split ---
+ # samples that reach cur_node
+ Ix = (L[,cur_tree] == cur_node);
+
+ cur_Y_bin = Y_bin * (Ix * C[,cur_tree]);
+ label_counts_overall = colSums (cur_Y_bin);
+ label_sum_overall = sum (label_counts_overall);
+ label_dist_overall = label_counts_overall / label_sum_overall;
+
+ if (imp == "entropy") {
+ label_dist_zero = (label_dist_overall == 0);
+ cur_impurity = - sum (label_dist_overall * log (label_dist_overall +
label_dist_zero)); # / log (2); # impurity before
+ } else { # imp == "Gini"
+ cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); #
impurity before
+ }
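+      # (cur_impurity is the impurity before the split: entropy = -sum(p * log(p)), Gini = sum(p * (1 - p)) = 1 - sum(p^2))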
+ best_scale_gain = 0;
+ best_cat_gain = 0;
+ if (num_scale_features > 0 & num_scale_feature_samples > 0) {
+ scale_feature_samples = feature_samples[1:num_scale_feature_samples,];
+
+ # main operation
+ label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext);
+
+ # compute left and right label distribution
+ label_sum_left = rowSums (label_counts_left_scale);
+ label_dist_left = label_counts_left_scale / label_sum_left;
+ if (imp == "entropy") {
+ label_dist_left = replace (target = label_dist_left, pattern = 0,
replacement = 1);
+ log_label_dist_left = log (label_dist_left); # / log (2)
+ impurity_left_scale = - rowSums (label_dist_left *
log_label_dist_left);
+ } else { # imp == "Gini"
+ impurity_left_scale = rowSums (label_dist_left * (1 -
label_dist_left));
+ }
+ #
+ label_counts_right_scale = - label_counts_left_scale +
label_counts_overall;
+ label_sum_right = rowSums (label_counts_right_scale);
+ label_dist_right = label_counts_right_scale / label_sum_right;
+ if (imp == "entropy") {
+ label_dist_right = replace (target = label_dist_right, pattern = 0,
replacement = 1);
+ log_label_dist_right = log (label_dist_right); # / log (2)
+ impurity_right_scale = - rowSums (label_dist_right *
log_label_dist_right);
+ } else { # imp == "Gini"
+ impurity_right_scale = rowSums (label_dist_right * (1 -
label_dist_right));
+ }
+
+ I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall )
* impurity_left_scale + ( label_sum_right / label_sum_overall ) *
impurity_right_scale);
+
+ I_gain_scale = replace (target = I_gain_scale, pattern = NaN,
replacement = 0);
+
+ # determine best feature to split on and the split value
+ feature_start_ind = matrix (0, rows = 1, cols = num_scale_features);
+ feature_start_ind[1,1] = 1;
+ if (num_scale_features > 1) {
+ feature_start_ind[1,2:num_scale_features] =
cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
+ }
+ max_I_gain_found = 0;
+ max_I_gain_found_ind = 0;
+ best_i = 0;
+
+ for (i in 1:num_scale_feature_samples) { # assuming feature_samples is
5x1
+ cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
+ cur_start_ind = as.scalar
(feature_start_ind[,cur_feature_samples_bin]);
+ cur_end_ind = as.scalar
(cum_count_thresholds[,cur_feature_samples_bin]);
+ I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
+ cur_max_I_gain = max (I_gain_portion);
+ cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion)));
+ if (cur_max_I_gain > max_I_gain_found) {
+ max_I_gain_found = cur_max_I_gain;
+ max_I_gain_found_ind = cur_max_I_gain_ind;
+ best_i = i;
+ }
+ }
+
+ best_scale_gain = max_I_gain_found;
+ max_I_gain_ind_scale = max_I_gain_found_ind;
+ best_scale_feature = 0;
+ if (best_i > 0) {
+ best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
+ }
+ best_scale_split = max_I_gain_ind_scale;
+ if (best_scale_feature > 1) {
+ best_scale_split = best_scale_split +
as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
+ }
+ }
+
+ if (num_cat_features > 0 & num_cat_feature_samples > 0){
+ cat_feature_samples = feature_samples[(num_scale_feature_samples +
1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
+
+ # initialization
+ split_values_bin = matrix (0, rows = 1, cols =
distinct_values_overall);
+ split_values = split_values_bin;
+ split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
+ I_gains = split_values_offset;
+ impurities_left = split_values_offset;
+ impurities_right = split_values_offset;
+ best_label_counts_left = matrix (0, rows = num_cat_features, cols =
num_classes);
+ best_label_counts_right = matrix (0, rows = num_cat_features, cols =
num_classes);
+
+ # main operation
+ label_counts = t (t (cur_Y_bin) %*% X_cat);
+
+ parfor (i9 in 1:num_cat_feature_samples, check = 0){
+ cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
+ start_ind = 1;
+ if (cur_cat_feature > 1)
+ start_ind = start_ind + as.scalar
(distinct_values_offset[(cur_cat_feature - 1),]);
+ offset = as.scalar (distinct_values[1,cur_cat_feature]);
+ cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
+ label_sum = rowSums (cur_label_counts);
+ label_dist = cur_label_counts / label_sum;
+ if (imp == "entropy") {
+ label_dist = replace (target = label_dist, pattern = 0,
replacement = 1);
+ log_label_dist = log (label_dist); # / log(2)
+ impurity_tmp = - rowSums (label_dist * log_label_dist);
+ impurity_tmp = replace (target = impurity_tmp, pattern = NaN,
replacement = 1/0);
+ } else { # imp == "Gini"
+ impurity_tmp = rowSums (label_dist * (1 - label_dist));
+ }
+
+ # sort cur feature by impurity
+ cur_distinct_values = seq (1, nrow (cur_label_counts));
+ cur_distinct_values_impurity = cbind (cur_distinct_values,
impurity_tmp);
+ cur_feature_sorted = order (target = cur_distinct_values_impurity,
by = 2, decreasing = FALSE);
+ P = table (cur_distinct_values, cur_feature_sorted); # permutation
matrix
+ label_counts_sorted = P %*% cur_label_counts;
+
+ # compute left and right label distribution
+ label_counts_left = cumsum (label_counts_sorted);
+
+ label_sum_left = rowSums (label_counts_left);
+ label_dist_left = label_counts_left / label_sum_left;
+ label_dist_left = replace (target = label_dist_left, pattern = NaN,
replacement = 1);
+ if (imp == "entropy") {
+ label_dist_left = replace (target = label_dist_left, pattern = 0,
replacement = 1);
+ log_label_dist_left = log (label_dist_left); # / log(2)
+ impurity_left = - rowSums (label_dist_left * log_label_dist_left);
+ } else { # imp == "Gini"
+ impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
+ }
+ #
+ label_counts_right = - label_counts_left + label_counts_overall;
+ label_sum_right = rowSums (label_counts_right);
+ label_dist_right = label_counts_right / label_sum_right;
+ label_dist_right = replace (target = label_dist_right, pattern =
NaN, replacement = 1);
+ if (imp == "entropy") {
+ label_dist_right = replace (target = label_dist_right, pattern =
0, replacement = 1);
+ log_label_dist_right = log (label_dist_right); # / log (2)
+ impurity_right = - rowSums (label_dist_right *
log_label_dist_right);
+ } else { # imp == "Gini"
+ impurity_right = rowSums (label_dist_right * (1 -
label_dist_right));
+ }
+ I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) *
impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);
+
+ Ix_label_sum_left_zero = (label_sum_left == 0);
+ Ix_label_sum_right_zero = (label_sum_right == 0);
+ Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
+ I_gain = I_gain * (1 - Ix_label_sum_zero);
+
+ I_gain[nrow (I_gain),] = 0; # last entry invalid
+
+ max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
+
+ split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t
(cur_feature_sorted[1:max_I_gain_ind,1]);
+ for (i10 in 1:max_I_gain_ind) {
+ ind = as.scalar (cur_feature_sorted[i10,1]);
+ if (ind == 1)
+ split_values_bin[1,start_ind] = 1.0;
+ else
+ split_values_bin[1,(start_ind + ind - 1)] = 1.0;
+ }
+ split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
+
+ I_gains[1,cur_cat_feature] = max (I_gain);
+
+ impurities_left[1,cur_cat_feature] = as.scalar
(impurity_left[max_I_gain_ind,]);
+ impurities_right[1,cur_cat_feature] = as.scalar
(impurity_right[max_I_gain_ind,]);
+ best_label_counts_left[cur_cat_feature,] =
label_counts_left[max_I_gain_ind,];
+ best_label_counts_right[cur_cat_feature,] =
label_counts_right[max_I_gain_ind,];
+ }
+
+ # determine best feature to split on and the split values
+ best_cat_feature = as.scalar (rowIndexMax (I_gains));
+ best_cat_gain = max (I_gains);
+ start_ind = 1;
+ if (best_cat_feature > 1)
+ start_ind = start_ind + as.scalar
(distinct_values_offset[(best_cat_feature - 1),]);
+ offset = as.scalar (distinct_values[1,best_cat_feature]);
+ best_split_values_bin = split_values_bin[1, start_ind:(start_ind +
offset - 1)];
+ }
+
+ # compare best scale feature to best cat. feature and pick the best one
+ if (num_scale_features > 0 & num_scale_feature_samples > 0 &
best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
+ # --- update model ---
+ cur_F_large[1,i6] = best_scale_feature;
+ cur_F_large[2,i6] = 1;
+ cur_F_large[3,i6] = 1;
+ cur_S_large[1,(i6 - 1) * distinct_values_max + 1] =
thresholds[max_I_gain_ind_scale, best_scale_feature];
+ left_child = 2 * (cur_node - 1) + 1 + 1;
+ right_child = 2 * (cur_node - 1) + 2 + 1;
+
+ # samples going to the left subtree
+ Ix_left = X_scale_ext[,best_scale_split];
+ Ix_left = Ix * Ix_left;
+ Ix_right = Ix * (1 - Ix_left);
+ L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
+ L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right *
right_child);
+ left_child_size = sum (Ix_left * C[,cur_tree]);
+ right_child_size = sum (Ix_right * C[,cur_tree]);
+
+ # check if left or right child is a leaf
+ left_pure = FALSE;
+ right_pure = FALSE;
+ cur_impurity_left = as.scalar(impurity_left_scale[best_scale_split,]);
# max_I_gain_ind_scale
+ cur_impurity_right =
as.scalar(impurity_right_scale[best_scale_split,]); # max_I_gain_ind_scale
+ if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level ==
depth)) &
+ (right_child_size <= num_leaf | cur_impurity_right == 0 | (level ==
depth)) |
+ (left_child_size <= threshold & right_child_size <= threshold &
(level == depth)) ) { # both left and right nodes are leaf
+
+ cur_label_counts_left = label_counts_left_scale[best_scale_split,];
# max_I_gain_ind_scale
+ cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
+ cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
+ cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax
(cur_label_counts_left)); # leaf class label
+ left_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max
(cur_label_counts_left);
+
+ cur_label_counts_right = label_counts_overall -
cur_label_counts_left;
+ cur_NC_large[1,(2 * i6)] = right_child;
+ cur_NC_large[2,(2 * i6)] = cur_tree;
+ cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax
(cur_label_counts_right)); # leaf class label
+ right_pure = TRUE;
+        # compute number of misclassified points
+        cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
+
+ } else if (left_child_size <= num_leaf | cur_impurity_left == 0 |
(level == depth) |
+ (left_child_size <= threshold & (level == depth))) {
+
+ cur_label_counts_left = label_counts_left_scale[best_scale_split,];
# max_I_gain_ind_scale
+ cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
+ cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
+ cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax
(cur_label_counts_left)); # leaf class label
+ left_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max
(cur_label_counts_left);
+
+ } else if (right_child_size <= num_leaf | cur_impurity_right == 0 |
(level == depth) |
+ (right_child_size <= threshold & (level == depth))) {
+
+ cur_label_counts_right =
label_counts_right_scale[best_scale_split,]; # max_I_gain_ind_scale
+ cur_NC_large[1,(2 * i6)] = right_child;
+ cur_NC_large[2,(2 * i6)] = cur_tree;
+ cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax
(cur_label_counts_right)); # leaf class label
+ right_pure = TRUE;
+        # compute number of misclassified points
+        cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
+ }
+ }
+ else if (num_cat_features > 0 & num_cat_feature_samples > 0 &
best_cat_gain > 0) {
+ # --- update model ---
+ cur_F_large[1,i6] = best_cat_feature;
+ cur_F_large[2,i6] = 2;
+ offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
+ S_start_ind = (i6 - 1) * distinct_values_max + 1;
+ cur_F_large[3,i6] = offset_nonzero;
+ cur_S_large[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] =
split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
+
+ left_child = 2 * (cur_node - 1) + 1 + 1;
+ right_child = 2 * (cur_node - 1) + 2 + 1;
+
+ # samples going to the left subtree
+ Ix_left = rowSums (X_cat[,start_ind:(start_ind + offset - 1)] *
best_split_values_bin);
+ Ix_left = (Ix_left >= 1);
+ Ix_left = Ix * Ix_left;
+ Ix_right = Ix * (1 - Ix_left);
+ L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
+ L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right *
right_child);
+ left_child_size = sum (Ix_left * C[,cur_tree]);
+ right_child_size = sum (Ix_right * C[,cur_tree]);
+
+ # check if left or right child is a leaf
+ left_pure = FALSE;
+ right_pure = FALSE;
+ cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
+ cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]);
+ if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level ==
depth)) &
+ (right_child_size <= num_leaf | cur_impurity_right == 0 | (level ==
depth)) |
+ (left_child_size <= threshold & right_child_size <= threshold &
(level == depth)) ) { # both left and right nodes are leaf
+
+ cur_label_counts_left = best_label_counts_left[best_cat_feature,];
+ cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
+ cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
+ cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax
(cur_label_counts_left)); # leaf class label
+ left_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max
(cur_label_counts_left);
+
+ cur_label_counts_right = label_counts_overall -
cur_label_counts_left;
+ cur_NC_large[1,(2 * i6)] = right_child;
+ cur_NC_large[2,(2 * i6)] = cur_tree;
+ cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax
(cur_label_counts_right)); # leaf class label
+ right_pure = TRUE;
+        # compute number of misclassified points
+        cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
+
+ } else if (left_child_size <= num_leaf | cur_impurity_left == 0 |
(level == depth) |
+ (left_child_size <= threshold & (level == depth))) {
+
+ cur_label_counts_left = best_label_counts_left[best_cat_feature,];
+ cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
+ cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
+ cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax
(cur_label_counts_left)); # leaf class label
+ left_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max
(cur_label_counts_left);
+
+ } else if (right_child_size <= num_leaf | cur_impurity_right == 0 |
(level == depth) |
+ (right_child_size <= threshold & (level == depth))) {
+
+ cur_label_counts_right = best_label_counts_right[best_cat_feature,];
+ cur_NC_large[1,(2 * i6)] = right_child;
+ cur_NC_large[2,(2 * i6)] = cur_tree;
+ cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax
(cur_label_counts_right)); # leaf class label
+ right_pure = TRUE;
+        # compute number of misclassified points
+        cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
+
+ }
+ } else {
+      print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
+ right_pure = TRUE;
+ left_pure = TRUE;
+ cur_NC_large[1,(2 * (i6 - 1) + 1)] = cur_node;
+ cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
+ class_label = as.scalar (rowIndexMax (label_counts_overall));
+ cur_NC_large[3,(2 * (i6 - 1) + 1)] = class_label;
+ cur_NC_large[4,(2 * (i6 - 1) + 1)] = label_sum_overall - max
(label_counts_overall);
+ cur_NC_large[5,(2 * (i6 - 1) + 1)] = 1; # special leaf
+ }
+
+ # add nodes to Q
+ if (!left_pure) {
+ if (left_child_size > threshold) {
+ cur_Q_large[1,(2 * (i6 - 1)+ 1)] = left_child;
+ cur_Q_large[2,(2 * (i6 - 1)+ 1)] = cur_tree;
+ } else {
+ cur_nodes_small[1,(2 * (i6 - 1)+ 1)] = left_child;
+ cur_nodes_small[2,(2 * (i6 - 1)+ 1)] = left_child_size;
+ cur_nodes_small[3,(2 * (i6 - 1)+ 1)] = cur_tree;
+ }
+ }
+ if (!right_pure) {
+ if (right_child_size > threshold) {
+ cur_Q_large[1,(2 * i6)] = right_child;
+ cur_Q_large[2,(2 * i6)] = cur_tree;
+ } else{
+ cur_nodes_small[1,(2 * i6)] = right_child;
+ cur_nodes_small[2,(2 * i6)] = right_child_size;
+ cur_nodes_small[3,(2 * i6)] = cur_tree;
+ }
+ }
+ }
+
+ ##### PREPARE MODEL FOR LARGE NODES
+ if (num_cur_nodes_large > 0) {
+ cur_Q_large = removeEmpty (target = cur_Q_large, margin = "cols");
+ if (as.scalar (cur_Q_large[1,1]) != 0) Q_large = cbind (Q_large,
cur_Q_large);
+ cur_NC_large = removeEmpty (target = cur_NC_large, margin = "cols");
+ if (as.scalar (cur_NC_large[1,1]) != 0) NC_large = cbind (NC_large,
cur_NC_large);
+
+ cur_F_large = removeEmpty (target = cur_F_large, margin = "cols");
+ if (as.scalar (cur_F_large[1,1]) != 0) F_large = cbind (F_large,
cur_F_large);
+ cur_S_large = removeEmpty (target = cur_S_large, margin = "cols");
+ if (as.scalar (cur_S_large[1,1]) != 0) S_large = cbind (S_large,
cur_S_large);
+
+ num_cur_nodes_large_pre = 2 * num_cur_nodes_large;
+ if (as.scalar (cur_Q_large[1,1]) == 0) {
+ num_cur_nodes_large = 0;
+ } else {
+ cur_nodes_large = cur_Q_large;
+ num_cur_nodes_large = ncol (cur_Q_large);
+ }
+ }
+
+ ##### PREPARE MODEL FOR SMALL NODES
+ cur_nodes_small_nonzero = removeEmpty (target = cur_nodes_small, margin =
"cols");
+ if (as.scalar (cur_nodes_small_nonzero[1,1]) != 0) { # if SMALL nodes exist
+ num_cur_nodes_small = ncol (cur_nodes_small_nonzero);
+ }
+
+ if (num_cur_nodes_small > 0) { # SMALL nodes to process
+ reserve_len = sum (2 ^ (ceil (log (cur_nodes_small_nonzero[2,]) / log
(2)))) + num_cur_nodes_small;
+ cur_Q_small = matrix (0, rows = 2, cols = reserve_len);
+ cur_F_small = matrix (0, rows = 3, cols = reserve_len);
+ cur_NC_small = matrix (0, rows = 5, cols = reserve_len);
+ cur_S_small = matrix (0, rows = 1, cols = reserve_len *
distinct_values_max); # split values of the best feature
+ }
+
+ ##### LOOP OVER SMALL NODES...
+ for (i7 in 1:num_cur_nodes_small) {
+ cur_node_small = as.scalar (cur_nodes_small_nonzero[1,i7]);
+ cur_tree_small = as.scalar (cur_nodes_small_nonzero[3,i7]);
+ # build dataset for SMALL node
+ Ix = (L[,cur_tree_small] == cur_node_small);
+ #print(as.scalar(Ix[0,0]));
+ if (num_scale_features > 0)
+ X_scale_ext_small = removeEmpty (target = X_scale_ext, margin =
"rows", select = Ix);
+ if (num_cat_features > 0)
+ X_cat_small = removeEmpty (target = X_cat, margin = "rows", select =
Ix);
+
+ L_small = removeEmpty (target = L * Ix, margin = "rows");
+ C_small = removeEmpty (target = C * Ix, margin = "rows");
+ Y_bin_small = removeEmpty (target = Y_bin * Ix, margin = "rows");
+
+ # compute offset
+ offsets = cumsum (t (2 ^ ceil (log (cur_nodes_small_nonzero[2,]) / log
(2))));
+ start_ind_global = 1;
+ if (i7 > 1)
+ start_ind_global = start_ind_global + as.scalar (offsets[(i7 - 1),]);
+ start_ind_S_global = 1;
+ if (i7 > 1)
+ start_ind_S_global = start_ind_S_global + (as.scalar (offsets[(i7 -
1),]) * distinct_values_max);
+
+ Q = matrix (0, rows = 2, cols = 1);
+ Q[1,1] = cur_node_small;
+ Q[2,1] = cur_tree_small;
+ F = matrix (0, rows = 3, cols = 1);
+ NC = matrix (0, rows = 5, cols = 1);
+ S = matrix (0, rows = 1, cols = 1);
+ cur_nodes_ = matrix (cur_node_small, rows = 2, cols = 1);
+ cur_nodes_[1,1] = cur_node_small;
+ cur_nodes_[2,1] = cur_tree_small;
+ num_cur_nodes = 1;
+ level_ = level;
+
+ while (num_cur_nodes > 0 & level_ < depth) {
+ level_ = level_ + 1;
+ cur_Q = matrix (0, rows = 2, cols = 2 * num_cur_nodes);
+ cur_F = matrix (0, rows = 3, cols = num_cur_nodes);
+ cur_NC = matrix (0, rows = 5, cols = 2 * num_cur_nodes);
+ cur_S = matrix (0, rows = 1, cols = num_cur_nodes *
distinct_values_max);
+
+ parfor (i8 in 1:num_cur_nodes, check = 0) {
+ cur_node = as.scalar (cur_nodes_[1,i8]);
+ cur_tree = as.scalar (cur_nodes_[2,i8]);
+
+ # select sample features WOR
+ feature_samples = sample (num_features_total, num_feature_samples);
+ feature_samples = order (target = feature_samples, by = 1);
+ num_scale_feature_samples = sum (feature_samples <=
num_scale_features);
+ num_cat_feature_samples = num_feature_samples -
num_scale_feature_samples;
+
+ # --- find best split ---
+ # samples that reach cur_node
+ Ix = (L_small[,cur_tree] == cur_node);
+ cur_Y_bin = Y_bin_small * (Ix * C_small[,cur_tree]);
+ label_counts_overall = colSums (cur_Y_bin);
+
+ label_sum_overall = sum (label_counts_overall);
+ label_dist_overall = label_counts_overall / label_sum_overall;
+ if (imp == "entropy") {
+ label_dist_zero = (label_dist_overall == 0);
+ cur_impurity = - sum (label_dist_overall * log (label_dist_overall
+ label_dist_zero)); # / log (2);
+ } else { # imp == "Gini"
+ cur_impurity = sum (label_dist_overall * (1 - label_dist_overall));
+ }
+ best_scale_gain = 0;
+ best_cat_gain = 0;
+
+ if (num_scale_features > 0 & num_scale_feature_samples > 0) {
+ scale_feature_samples =
feature_samples[1:num_scale_feature_samples,];
+
+ # main operation
+ label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext_small);
+
+ # compute left and right label distribution
+ label_sum_left = rowSums (label_counts_left_scale);
+ label_dist_left = label_counts_left_scale / label_sum_left;
+ if (imp == "entropy") {
+ label_dist_left = replace (target = label_dist_left, pattern =
0, replacement = 1);
+ log_label_dist_left = log (label_dist_left); # / log (2)
+ impurity_left_scale = - rowSums (label_dist_left *
log_label_dist_left);
+ } else { # imp == "Gini"
+ impurity_left_scale = rowSums (label_dist_left * (1 -
label_dist_left));
+ }
+ #
+ label_counts_right_scale = - label_counts_left_scale +
label_counts_overall;
+ label_sum_right = rowSums (label_counts_right_scale);
+ label_dist_right = label_counts_right_scale / label_sum_right;
+ if (imp == "entropy") {
+ label_dist_right = replace (target = label_dist_right, pattern =
0, replacement = 1);
+ log_label_dist_right = log (label_dist_right); # log (2)
+ impurity_right_scale = - rowSums (label_dist_right *
log_label_dist_right);
+ } else { # imp == "Gini"
+ impurity_right_scale = rowSums (label_dist_right * (1 -
label_dist_right));
+ }
+ I_gain_scale = cur_impurity - ( ( label_sum_left /
label_sum_overall ) * impurity_left_scale + ( label_sum_right /
label_sum_overall ) * impurity_right_scale);
+
+ I_gain_scale = replace (target = I_gain_scale, pattern = NaN,
replacement = 0);
+
+ # determine best feature to split on and the split value
+ feature_start_ind = matrix (0, rows = 1, cols =
num_scale_features);
+ feature_start_ind[1,1] = 1;
+ if (num_scale_features > 1) {
+ feature_start_ind[1,2:num_scale_features] =
cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
+ }
+ max_I_gain_found = 0;
+ max_I_gain_found_ind = 0;
+ best_i = 0;
+
+ for (i in 1:num_scale_feature_samples) { # assuming
feature_samples is 5x1
+ cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
+ cur_start_ind = as.scalar
(feature_start_ind[,cur_feature_samples_bin]);
+ cur_end_ind = as.scalar
(cum_count_thresholds[,cur_feature_samples_bin]);
+ I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
+ cur_max_I_gain = max (I_gain_portion);
+ cur_max_I_gain_ind = as.scalar (rowIndexMax (t
(I_gain_portion)));
+ if (cur_max_I_gain > max_I_gain_found) {
+ max_I_gain_found = cur_max_I_gain;
+ max_I_gain_found_ind = cur_max_I_gain_ind;
+ best_i = i;
+ }
+ }
+
+ best_scale_gain = max_I_gain_found;
+ max_I_gain_ind_scale = max_I_gain_found_ind;
+ best_scale_feature = 0;
+ if (best_i > 0) {
+ best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
+ }
+ best_scale_split = max_I_gain_ind_scale;
+ if (best_scale_feature > 1) {
+ best_scale_split = best_scale_split +
as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]);
+ }
+ }
+
+ if (num_cat_features > 0 & num_cat_feature_samples > 0){
+ cat_feature_samples = feature_samples[(num_scale_feature_samples +
1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
+
+ # initialization
+ split_values_bin = matrix (0, rows = 1, cols =
distinct_values_overall);
+ split_values = split_values_bin;
+ split_values_offset = matrix (0, rows = 1, cols =
num_cat_features);
+ I_gains = split_values_offset;
+ impurities_left = split_values_offset;
+ impurities_right = split_values_offset;
+ best_label_counts_left = matrix (0, rows = num_cat_features, cols
= num_classes);
+ best_label_counts_right = matrix (0, rows = num_cat_features, cols
= num_classes);
+
+ # main operation
+ label_counts = t (t (cur_Y_bin) %*% X_cat_small);
+
+ parfor (i9 in 1:num_cat_feature_samples, check = 0){
+ cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
+ start_ind = 1;
+ if (cur_cat_feature > 1)
+ start_ind = start_ind + as.scalar
(distinct_values_offset[(cur_cat_feature - 1),]);
+ offset = as.scalar (distinct_values[1,cur_cat_feature]);
+ cur_label_counts = label_counts[start_ind:(start_ind + offset -
1),];
+ label_sum = rowSums (cur_label_counts);
+ label_dist = cur_label_counts / label_sum;
+ if (imp == "entropy") {
+ label_dist = replace (target = label_dist, pattern = 0,
replacement = 1);
+ log_label_dist = log (label_dist); # / log(2)
+ impurity_tmp = - rowSums (label_dist * log_label_dist);
+ impurity_tmp = replace (target = impurity_tmp, pattern = NaN,
replacement = 1/0);
+ } else { # imp == "Gini"
+ impurity_tmp = rowSums (label_dist * (1 - label_dist));
+ }
+
+ # sort cur feature by impurity
+ cur_distinct_values = seq (1, nrow (cur_label_counts));
+ cur_distinct_values_impurity = cbind (cur_distinct_values,
impurity_tmp);
+ cur_feature_sorted = order (target =
cur_distinct_values_impurity, by = 2, decreasing = FALSE);
+ P = table (cur_distinct_values, cur_feature_sorted); #
permutation matrix
+ label_counts_sorted = P %*% cur_label_counts;
+
+ # compute left and right label distribution
+ label_counts_left = cumsum (label_counts_sorted);
+
+ label_sum_left = rowSums (label_counts_left);
+ label_dist_left = label_counts_left / label_sum_left;
+ label_dist_left = replace (target = label_dist_left, pattern =
NaN, replacement = 1);
+ if (imp == "entropy") {
+ label_dist_left = replace (target = label_dist_left, pattern =
0, replacement = 1);
+ log_label_dist_left = log (label_dist_left); # / log(2)
+ impurity_left = - rowSums (label_dist_left *
log_label_dist_left);
+ } else { # imp == "Gini"
+ impurity_left = rowSums (label_dist_left * (1 -
label_dist_left));
+ }
+ #
+ label_counts_right = - label_counts_left + label_counts_overall;
+ label_sum_right = rowSums (label_counts_right);
+ label_dist_right = label_counts_right / label_sum_right;
+ label_dist_right = replace (target = label_dist_right, pattern =
NaN, replacement = 1);
+ if (imp == "entropy") {
+ label_dist_right = replace (target = label_dist_right, pattern
= 0, replacement = 1);
+ log_label_dist_right = log (label_dist_right); # / log (2)
+ impurity_right = - rowSums (label_dist_right *
log_label_dist_right);
+ } else { # imp == "Gini"
+ impurity_right = rowSums (label_dist_right * (1 -
label_dist_right));
+ }
+ I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall )
* impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right);
+
+ Ix_label_sum_left_zero = (label_sum_left == 0);
+ Ix_label_sum_right_zero = (label_sum_right == 0);
+ Ix_label_sum_zero = Ix_label_sum_left_zero *
Ix_label_sum_right_zero;
+ I_gain = I_gain * (1 - Ix_label_sum_zero);
+
+ I_gain[nrow (I_gain),] = 0; # last entry invalid
+ max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
+
+ split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t
(cur_feature_sorted[1:max_I_gain_ind,1]);
+ for (i10 in 1:max_I_gain_ind) {
+ ind = as.scalar (cur_feature_sorted[i10,1]);
+ if (ind == 1)
+ split_values_bin[1,start_ind] = 1.0;
+ else
+ split_values_bin[1,(start_ind + ind - 1)] = 1.0;
+ }
+ split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
+
+ I_gains[1,cur_cat_feature] = max (I_gain);
+
+ impurities_left[1,cur_cat_feature] = as.scalar
(impurity_left[max_I_gain_ind,]);
+ impurities_right[1,cur_cat_feature] = as.scalar
(impurity_right[max_I_gain_ind,]);
+ best_label_counts_left[cur_cat_feature,] =
label_counts_left[max_I_gain_ind,];
+ best_label_counts_right[cur_cat_feature,] =
label_counts_right[max_I_gain_ind,];
+ }
+
+ # determine best feature to split on and the split values
+ best_cat_feature = as.scalar (rowIndexMax (I_gains));
+ best_cat_gain = max (I_gains);
+ start_ind = 1;
+ if (best_cat_feature > 1) {
+ start_ind = start_ind + as.scalar
(distinct_values_offset[(best_cat_feature - 1),]);
+ }
+ offset = as.scalar (distinct_values[1,best_cat_feature]);
+ best_split_values_bin = split_values_bin[1, start_ind:(start_ind +
offset - 1)];
+ }
+
+ # compare best scale feature to best cat. feature and pick the best
one
+ if (num_scale_features > 0 & num_scale_feature_samples > 0 &
best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
+ # --- update model ---
+ cur_F[1,i8] = best_scale_feature;
+ cur_F[2,i8] = 1;
+ cur_F[3,i8] = 1;
+ cur_S[1,(i8 - 1) * distinct_values_max + 1] =
thresholds[max_I_gain_ind_scale, best_scale_feature];
+
+ left_child = 2 * (cur_node - 1) + 1 + 1;
+ right_child = 2 * (cur_node - 1) + 2 + 1;
+
+ # samples going to the left subtree
+ Ix_left = X_scale_ext_small[, best_scale_split];
+ Ix_left = Ix * Ix_left;
+ Ix_right = Ix * (1 - Ix_left);
+ L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left
* left_child);
+ L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) +
(Ix_right * right_child);
+ left_child_size = sum (Ix_left * C_small[,cur_tree]);
+ right_child_size = sum (Ix_right * C_small[,cur_tree]);
+
+ # check if left or right child is a leaf
+ left_pure = FALSE;
+ right_pure = FALSE;
+ cur_impurity_left =
as.scalar(impurity_left_scale[best_scale_split,]);
+ cur_impurity_right =
as.scalar(impurity_right_scale[best_scale_split,]);
+ if ( (left_child_size <= num_leaf | cur_impurity_left == 0 |
level_ == depth) &
+ (right_child_size <= num_leaf | cur_impurity_right == 0 |
level_ == depth) ) { # both left and right nodes are leaf
+
+ cur_label_counts_left =
label_counts_left_scale[best_scale_split,];
+ cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
+ cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
+ cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax
(cur_label_counts_left)); # leaf class label
+ left_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max
(cur_label_counts_left);
+
+ cur_label_counts_right = label_counts_overall -
cur_label_counts_left;
+ cur_NC[1,(2 * i8)] = right_child;
+ cur_NC[2,(2 * i8)] = cur_tree;
+ cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax
(cur_label_counts_right)); # leaf class label
+ right_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC[4,(2 * i8)] = right_child_size - max
(cur_label_counts_right);
+
+ } else if (left_child_size <= num_leaf | cur_impurity_left == 0 |
level_ == depth) {
+
+ cur_label_counts_left =
label_counts_left_scale[best_scale_split,];
+ cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
+ cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
+ cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax
(cur_label_counts_left)); # leaf class label
+ left_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max
(cur_label_counts_left);
+
+ } else if (right_child_size <= num_leaf | cur_impurity_right == 0
| level_ == depth) {
+
+ cur_label_counts_right =
label_counts_right_scale[best_scale_split,];
+ cur_NC[1,(2 * i8)] = right_child;
+ cur_NC[2,(2 * i8)] = cur_tree;
+ cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax
(cur_label_counts_right)); # leaf class label
+ right_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC[4,(2 * i8)] = right_child_size - max
(cur_label_counts_right);
+ }
+ } else if (num_cat_features > 0 & num_cat_feature_samples > 0 &
best_cat_gain > 0) {
+
+ # --- update model ---
+ cur_F[1,i8] = best_cat_feature;
+ cur_F[2,i8] = 2;
+ offset_nonzero = as.scalar
(split_values_offset[1,best_cat_feature]);
+ S_start_ind = (i8 - 1) * distinct_values_max + 1;
+ cur_F[3,i8] = offset_nonzero;
+ cur_S[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] =
split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
+
+ left_child = 2 * (cur_node - 1) + 1 + 1;
+ right_child = 2 * (cur_node - 1) + 2 + 1;
+
+ # samples going to the left subtree
+ Ix_left = rowSums (X_cat_small[,start_ind:(start_ind + offset -
1)] * best_split_values_bin);
+ Ix_left = (Ix_left >= 1);
+ Ix_left = Ix * Ix_left;
+ Ix_right = Ix * (1 - Ix_left);
+ L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left
* left_child);
+ L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) +
(Ix_right * right_child);
+ left_child_size = sum (Ix_left * C_small[,cur_tree]);
+ right_child_size = sum (Ix_right * C_small[,cur_tree]);
+
+ # check if left or right child is a leaf
+ left_pure = FALSE;
+ right_pure = FALSE;
+ cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]);
+ cur_impurity_right =
as.scalar(impurities_right[,best_cat_feature]);
+ if ( (left_child_size <= num_leaf | cur_impurity_left == 0 |
level_ == depth) &
+ (right_child_size <= num_leaf | cur_impurity_right == 0 |
level_ == depth) ) { # both left and right nodes are leaf
+
+ cur_label_counts_left =
best_label_counts_left[best_cat_feature,];
+ cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
+ cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
+ cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax
(cur_label_counts_left)); # leaf class label
+ left_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max
(cur_label_counts_left);
+
+ cur_label_counts_right = label_counts_overall -
cur_label_counts_left;
+ cur_NC[1,(2 * i8)] = right_child;
+ cur_NC[2,(2 * i8)] = cur_tree;
+ cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax
(cur_label_counts_right)); # leaf class label
+ right_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC[4,(2 * i8)] = right_child_size - max
(cur_label_counts_right);
+
+ } else if (left_child_size <= num_leaf | cur_impurity_left == 0 |
level_ == depth) {
+
+ cur_label_counts_left =
best_label_counts_left[best_cat_feature,];
+ cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
+ cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
+ cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax
(cur_label_counts_left)); # leaf class label
+ left_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max
(cur_label_counts_left);
+
+ } else if (right_child_size <= num_leaf | cur_impurity_right == 0
| level_ == depth) {
+ cur_label_counts_right =
best_label_counts_right[best_cat_feature,];
+ cur_NC[1,(2 * i8)] = right_child;
+ cur_NC[2,(2 * i8)] = cur_tree;
+ cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax
(cur_label_counts_right)); # leaf class label
+ right_pure = TRUE;
+ # compute number of misclassified points
+ cur_NC[4,(2 * i8)] = right_child_size - max
(cur_label_counts_right);
+ }
+ } else {
+          print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
+ right_pure = TRUE;
+ left_pure = TRUE;
+ cur_NC[1,(2 * (i8 - 1) + 1)] = cur_node;
+ cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
+ class_label = as.scalar (rowIndexMax (label_counts_overall));
+ cur_NC[3,(2 * (i8 - 1) + 1)] = class_label;
+ cur_NC[4,(2 * (i8 - 1) + 1)] = label_sum_overall - max
(label_counts_overall);
+ cur_NC[5,(2 * (i8 - 1) + 1)] = 1; # special leaf
+ }
+
+ # add nodes to Q
+ if (!left_pure)
+ cur_Q[,(2 * (i8 - 1)+ 1)] = as.matrix(list(left_child,cur_tree));
+ if (!right_pure)
+ cur_Q[,(2 * i8)] = as.matrix(list(right_child, cur_tree));
+ }
+
+ cur_Q = removeEmpty (target = cur_Q, margin = "cols");
+ Q = cbind (Q, cur_Q);
+ NC = cbind (NC, cur_NC);
+ F = cbind (F, cur_F);
+ S = cbind (S, cur_S);
+
+ num_cur_nodes_pre = 2 * num_cur_nodes;
+ if (as.scalar (cur_Q[1,1]) == 0) {
+ num_cur_nodes = 0;
+ } else {
+ cur_nodes_ = cur_Q;
+ num_cur_nodes = ncol (cur_Q);
+ }
+ }
+
+ cur_Q_small[,start_ind_global:(start_ind_global + ncol (Q) - 1)] = Q;
+ cur_NC_small[,start_ind_global:(start_ind_global + ncol (NC) - 1)] = NC;
+ cur_F_small[,start_ind_global:(start_ind_global + ncol (F) - 1)] = F;
+ cur_S_small[,start_ind_S_global:(start_ind_S_global + ncol (S) - 1)] = S;
+ }
+
+ ##### PREPARE MODEL FOR SMALL NODES
+ if (num_cur_nodes_small > 0) { # small nodes already processed
+ cur_Q_small = removeEmpty (target = cur_Q_small, margin = "cols");
+ if (as.scalar (cur_Q_small[1,1]) != 0) Q_small = cbind (Q_small,
cur_Q_small);
+ cur_NC_small = removeEmpty (target = cur_NC_small, margin = "cols");
+ if (as.scalar (cur_NC_small[1,1]) != 0) NC_small = cbind (NC_small,
cur_NC_small);
+
+ cur_F_small = removeEmpty (target = cur_F_small, margin = "cols"); #
+ if (as.scalar (cur_F_small[1,1]) != 0) F_small = cbind (F_small,
cur_F_small);
+ cur_S_small = removeEmpty (target = cur_S_small, margin = "cols"); #
+ if (as.scalar (cur_S_small[1,1]) != 0) S_small = cbind (S_small,
cur_S_small);
+
+ num_cur_nodes_small = 0; # reset
+ }
+    print (" --- end level " + level + ", remaining no. of LARGE nodes to expand " + num_cur_nodes_large + " --- ");
+ }
+
+ #### prepare model
+ print ("PREPARING MODEL...")
+ ### large nodes
+ if (as.scalar (Q_large[1,1]) == 0 & ncol (Q_large) > 1)
+ Q_large = Q_large[,2:ncol (Q_large)];
+ if (as.scalar (NC_large[1,1]) == 0 & ncol (NC_large) > 1)
+ NC_large = NC_large[,2:ncol (NC_large)];
+ if (as.scalar (S_large[1,1]) == 0 & ncol (S_large) > 1)
+ S_large = S_large[,2:ncol (S_large)];
+ if (as.scalar (F_large[1,1]) == 0 & ncol (F_large) > 1)
+ F_large = F_large[,2:ncol (F_large)];
+
+ ### small nodes
+ if (as.scalar (Q_small[1,1]) == 0 & ncol (Q_small) > 1)
+ Q_small = Q_small[,2:ncol (Q_small)];
+ if (as.scalar (NC_small[1,1]) == 0 & ncol (NC_small) > 1)
+ NC_small = NC_small[,2:ncol (NC_small)];
+ if (as.scalar (S_small[1,1]) == 0 & ncol (S_small) > 1)
+ S_small = S_small[,2:ncol (S_small)];
+ if (as.scalar (F_small[1,1]) == 0 & ncol (F_small) > 1)
+ F_small = F_small[,2:ncol (F_small)];
+
+ # check for special leaves and if there are any remove them from Q_large and
Q_small
+ special_large_leaves_ind = NC_large[5,];
+ num_special_large_leaf = sum (special_large_leaves_ind);
+ if (num_special_large_leaf > 0) {
+    print ("PROCESSING " + num_special_large_leaf + " SPECIAL LARGE LEAVES...");
+ special_large_leaves = removeEmpty (target = NC_large[1:2,] *
special_large_leaves_ind, margin = "cols");
+ large_internal_ind = 1 - colSums (outer (t (special_large_leaves[1,]),
Q_large[1,], "==") * outer (t (special_large_leaves[2,]), Q_large[2,], "=="));
+ Q_large = removeEmpty (target = Q_large * large_internal_ind, margin =
"cols");
+ F_large = removeEmpty (target = F_large, margin = "cols"); # remove
special leaves from F
+ }
+
+ special_small_leaves_ind = NC_small[5,];
+ num_special_small_leaf = sum (special_small_leaves_ind);
+ if (num_special_small_leaf > 0) {
+    print ("PROCESSING " + num_special_small_leaf + " SPECIAL SMALL LEAVES...");
+ special_small_leaves = removeEmpty (target = NC_small[1:2,] *
special_small_leaves_ind, margin = "cols");
+ small_internal_ind = 1 - colSums (outer (t (special_small_leaves[1,]),
Q_small[1,], "==") * outer (t (special_small_leaves[2,]), Q_small[2,], "=="));
+ Q_small = removeEmpty (target = Q_small * small_internal_ind, margin =
"cols");
+ F_small = removeEmpty (target = F_small, margin = "cols"); # remove
special leaves from F
+ }
+
+ # model corresponding to large internal nodes
+ no_large_internal_node = FALSE;
+ if (as.scalar (Q_large[1,1]) != 0) {
+ print ("PROCESSING LARGE INTERNAL NODES...");
+ num_large_internal = ncol (Q_large);
+ max_offset = max (max (F_large[3,]), max (F_small[3,]));
+ M1_large = matrix (0, rows = 6 + max_offset, cols = num_large_internal);
+ M1_large[1:2,] = Q_large;
+ M1_large[4:6,] = F_large;
+ # process S_large
+ cum_offsets_large = cumsum (t (F_large[3,]));
+ parfor (it in 1:num_large_internal, check = 0) {
+ start_ind = 1;
+ if (it > 1)
+ start_ind = start_ind + as.scalar (cum_offsets_large[(it - 1),]);
+ offset = as.scalar (F_large[3,it]);
+ M1_large[7:(7 + offset - 1),it] = t (S_large[1,start_ind:(start_ind +
offset - 1)]);
+ }
+ } else {
+ print ("No LARGE internal nodes available");
+ no_large_internal_node = TRUE;
+ }
+
+ # model corresponding to small internal nodes
+ no_small_internal_node = FALSE;
+ if (as.scalar (Q_small[1,1]) != 0) {
+ print ("PROCESSING SMALL INTERNAL NODES...");
+ num_small_internal = ncol (Q_small);
+ M1_small = matrix (0, rows = 6 + max_offset, cols = num_small_internal);
+ M1_small[1:2,] = Q_small;
+ M1_small[4:6,] = F_small;
+ # process S_small
+ cum_offsets_small = cumsum (t (F_small[3,]));
+ parfor (it in 1:num_small_internal, check = 0) {
+ start_ind = 1;
+ if (it > 1)
+ start_ind = start_ind + as.scalar (cum_offsets_small[(it - 1),]);
+ offset = as.scalar (F_small[3,it]);
+ M1_small[7:(7 + offset - 1),it] = t (S_small[1,start_ind:(start_ind +
offset - 1)]);
+ }
+ } else {
+ print ("No SMALL internal nodes available");
+ no_small_internal_node = TRUE;
+ }
+
+ # model corresponding to large leaf nodes
+ no_large_leaf_node = FALSE;
+ if (as.scalar (NC_large[1,1]) != 0) {
+ print ("PROCESSING LARGE LEAF NODES...");
+ num_large_leaf = ncol (NC_large);
+ M2_large = matrix (0, rows = 6 + max_offset, cols = num_large_leaf);
+ M2_large[1:2,] = NC_large[1:2,];
+ M2_large[5:7,] = NC_large[3:5,];
+ } else {
+ print ("No LARGE leaf nodes available");
+ no_large_leaf_node = TRUE;
+ }
+
+ # model corresponding to small leaf nodes
+ no_small_leaf_node = FALSE;
+ if (as.scalar (NC_small[1,1]) != 0) {
+ print ("PROCESSING SMALL LEAF NODES...");
+ num_small_leaf = ncol (NC_small);
+ M2_small = matrix (0, rows = 6 + max_offset, cols = num_small_leaf);
+ M2_small[1:2,] = NC_small[1:2,];
+ M2_small[5:7,] = NC_small[3:5,];
+ } else {
+ print ("No SMALL leaf nodes available");
+ no_small_leaf_node = TRUE;
+ }
+
+ if (no_large_internal_node)
+ M1 = M1_small;
+ else if (no_small_internal_node)
+ M1 = M1_large;
+ else
+ M1 = cbind (M1_large, M1_small);
+
+ if (no_large_leaf_node)
+ M2 = M2_small;
+ else if (no_small_leaf_node)
+ M2 = M2_large;
+ else
+ M2 = cbind (M2_large, M2_small);
+
+ M = cbind (M1, M2);
+ M = t (order (target = t (M), by = 1)); # sort by node id
+ M = t (order (target = t (M), by = 2)); # sort by tree id
+
+ # removing redundant subtrees
+ if (ncol (M) > 1) {
+ print ("CHECKING FOR REDUNDANT SUBTREES...");
+ red_leaf = TRUE;
+ process_red_subtree = FALSE;
+ invalid_node_ind = matrix (0, rows = 1, cols = ncol (M));
+ while (red_leaf & ncol (M) > 1) {
+ leaf_ind = (M[4,] == 0);
+ labels = M[5,] * leaf_ind;
+ tree_ids = M[2,];
+ parent_ids = floor (M[1,] /2);
+ cond1 = (labels[,1:(ncol (M) - 1)] == labels[,2:ncol (M)]); # sibling
leaves with same label
+ cond2 = (parent_ids[,1:(ncol (M) - 1)] == parent_ids[,2:ncol (M)]); #
same parents
+ cond3 = (tree_ids[,1:(ncol (M) - 1)] == tree_ids[,2:ncol (M)]); # same
tree
+ red_leaf_ind = cond1 * cond2 * cond3 * leaf_ind[,2:ncol (M)];
+
+ if (sum (red_leaf_ind) > 0) { # if redundant subtrees exist
+ red_leaf_ids = M[1:2,2:ncol (M)] * red_leaf_ind;
+ red_leaf_ids_nonzero = removeEmpty (target = red_leaf_ids, margin =
"cols");
+ parfor (it in 1:ncol (red_leaf_ids_nonzero), check = 0){
+ cur_right_leaf_id = as.scalar (red_leaf_ids_nonzero[1,it]);
+ cur_parent_id = floor (cur_right_leaf_id / 2);
+ cur_tree_id = as.scalar (red_leaf_ids_nonzero[2,it]);
+          cur_right_leaf_pos = as.scalar (rowIndexMax ((M[1,] == cur_right_leaf_id) * (M[2,] == cur_tree_id)));
+          cur_parent_pos = as.scalar (rowIndexMax ((M[1,] == cur_parent_id) * (M[2,] == cur_tree_id)));
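+          # collapse the redundant split: copy the leaf's payload into the parent column
+          # (turning it into a leaf) and mark both sibling leaves as invalid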
+ M[3:nrow (M), cur_parent_pos] = M[3:nrow (M), cur_right_leaf_pos];
+ M[4,cur_right_leaf_pos] = -1;
+ M[4,cur_right_leaf_pos - 1] = -1;
+ invalid_node_ind[1,cur_right_leaf_pos] = 1;
+ invalid_node_ind[1,cur_right_leaf_pos - 1] = 1;
+ }
+ process_red_subtree = TRUE;
+ } else {
+ red_leaf = FALSE;
+ }
+ }
+
+ if (process_red_subtree) {
+ print ("REMOVING REDUNDANT SUBTREES...");
+ valid_node_ind = (invalid_node_ind == 0);
+ M = removeEmpty (target = M * valid_node_ind, margin = "cols");
+ }
+ }
+
+ internal_ind = (M[4,] > 0);
+ internal_ids = M[1:2,] * internal_ind;
+ internal_ids_nonzero = removeEmpty (target = internal_ids, margin = "cols");
+ if (as.scalar (internal_ids_nonzero[1,1]) > 0) { # if internal nodes exist
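+    # for each internal node (id a1), locate its left child (id a2 = 2 * a1) in the
+    # sorted matrix and store the column distance to it in row 3 of the model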
+ a1 = internal_ids_nonzero[1,];
+ a2 = internal_ids_nonzero[1,] * 2;
+ vcur_tree_id = internal_ids_nonzero[2,];
+    pos_a1 = rowIndexMax( outer(t(a1), M[1,], "==") * outer(t(vcur_tree_id), M[2,], "==") );
+    pos_a2 = rowIndexMax( outer(t(a2), M[1,], "==") * outer(t(vcur_tree_id), M[2,], "==") );
+ M[3,] = t(table(pos_a1, 1, pos_a2 - pos_a1, ncol(M), 1));
+ }
+ else {
+ print ("All trees in the random forest contain only one leaf!");
+ }
+}
diff --git a/src/main/java/org/apache/sysds/common/Builtins.java b/src/main/java/org/apache/sysds/common/Builtins.java
index 34b5cc5..897aadc 100644
--- a/src/main/java/org/apache/sysds/common/Builtins.java
+++ b/src/main/java/org/apache/sysds/common/Builtins.java
@@ -196,6 +196,7 @@ public enum Builtins {
PROD("prod", false),
QR("qr", false, ReturnType.MULTI_RETURN),
QUANTILE("quantile", false),
+ RANDOM_FOREST("randomForest", true),
RANGE("range", false),
RBIND("rbind", false),
REMOVE("remove", false, ReturnType.MULTI_RETURN),
diff --git a/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinRandomForestTest.java b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinRandomForestTest.java
new file mode 100644
index 0000000..a68d828
--- /dev/null
+++ b/src/test/java/org/apache/sysds/test/functions/builtin/BuiltinRandomForestTest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.sysds.test.functions.builtin;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.apache.sysds.common.Types.ExecMode;
+import org.apache.sysds.test.AutomatedTestBase;
+import org.apache.sysds.test.TestConfiguration;
+import org.apache.sysds.test.TestUtils;
+
+@RunWith(value = Parameterized.class)
[email protected]
+public class BuiltinRandomForestTest extends AutomatedTestBase
+{
+ private final static String TEST_NAME = "RandomForest";
+ private final static String TEST_DIR = "functions/builtin/";
+	private static final String TEST_CLASS_DIR = TEST_DIR + BuiltinRandomForestTest.class.getSimpleName() + "/";
+
+ @Override
+ public void setUp() {
+ TestUtils.clearAssertionInformation();
+		addTestConfiguration(TEST_NAME, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME, new String[]{"C"}));
+ }
+
+ @Parameterized.Parameter()
+ public int rows;
+ @Parameterized.Parameter(1)
+ public int cols;
+ @Parameterized.Parameter(2)
+ public int bins;
+ @Parameterized.Parameter(3)
+ public int depth;
+ @Parameterized.Parameter(4)
+ public int num_leaf;
+ @Parameterized.Parameter(5)
+ public int num_trees;
+ @Parameterized.Parameter(6)
+ public String impurity;
+
+ @Parameterized.Parameters
+ public static Collection<Object[]> data() {
+ return Arrays.asList(new Object[][] {
+			//TODO fix randomForest script (currently indexing issues)
+ {2000, 7, 4, 7, 10, 1, "Gini"},
+ {2000, 7, 4, 7, 10, 1, "entropy"},
+ {2000, 7, 4, 3, 5, 3, "Gini"},
+ {2000, 7, 4, 3, 5, 3, "entropy"},
+ });
+ }
+
+ @Ignore
+ @Test
+ public void testRandomForestSinglenode() {
+ runRandomForestTest(ExecMode.SINGLE_NODE);
+ }
+
+ @Ignore
+ @Test
+ public void testRandomForestHybrid() {
+ runRandomForestTest(ExecMode.HYBRID);
+ }
+
+ private void runRandomForestTest(ExecMode mode)
+ {
+ ExecMode platformOld = setExecMode(mode);
+
+ try
+ {
+ loadTestConfiguration(getTestConfiguration(TEST_NAME));
+
+ String HOME = SCRIPT_DIR + TEST_DIR;
+ fullDMLScriptName = HOME + TEST_NAME + ".dml";
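+			// positional args map to $1..$8 in randomForest.dml:
+			// X, Y, bins, depth, num_leaf, num_trees, impurity, output B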
+ programArgs = new String[]{"-args",
+ input("X"), input("Y"), String.valueOf(bins),
+ String.valueOf(depth), String.valueOf(num_leaf),
+				String.valueOf(num_trees), impurity, output("B") };
+
+ //generate actual datasets
+			double[][] X = getRandomMatrix(rows, cols, 0, 1, 0.7, 7);
+			double[][] Y = TestUtils.round(getRandomMatrix(rows, 1, 1, 5, 1, 3));
+ writeInputMatrixWithMTD("X", X, false);
+ writeInputMatrixWithMTD("Y", Y, false);
+
+ runTest(true, false, null, -1);
+ }
+ finally {
+ rtplatform = platformOld;
+ }
+ }
+}
diff --git a/src/test/scripts/functions/builtin/randomForest.dml b/src/test/scripts/functions/builtin/randomForest.dml
new file mode 100644
index 0000000..e7b2b71
--- /dev/null
+++ b/src/test/scripts/functions/builtin/randomForest.dml
@@ -0,0 +1,34 @@
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+X = read($1);
+Y = read($2);
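+# R: per-feature metadata (column id, start/end indices); a matrix of ones is used
+# here, presumably treating all features as scale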
+R = matrix(1, rows=ncol(X), cols = 3);
+bins = $3;
+depth = $4;
+num_leafs = $5;
+num_trees = $6;
+impurity = $7;
+
+[M, C, S_map, C_map] = randomForest(X=X, Y=Y, R=R, bins=bins, depth=depth,
+ num_leaf=num_leafs, num_trees=num_trees, impurity=impurity);
+
+write(M, $8);