http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/random-forest.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/random-forest.dml b/scripts/algorithms/random-forest.dml
index 7bdc1fb..b68d711 100644
--- a/scripts/algorithms/random-forest.dml
+++ b/scripts/algorithms/random-forest.dml
@@ -1,1375 +1,1375 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-# 
-#   http://www.apache.org/licenses/LICENSE-2.0
-# 
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#  
-# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME            TYPE     DEFAULT  MEANING
-# ---------------------------------------------------------------------------------------------
-# X               String   ---      Location to read the feature matrix X; note that X needs to be both recoded and dummy coded
-# Y               String   ---      Location to read the label matrix Y; note that Y needs to be both recoded and dummy coded
-# R               String   " "      Location to read the matrix R which for each feature in X contains the following information
-#                                       - R[,1]: column ids
-#                                       - R[,2]: start indices
-#                                       - R[,3]: end indices
-#                                   If R is not provided, by default all features are assumed to be scale
-# bins            Int      20       Number of equi-height bins per scale feature used to choose thresholds
-# depth           Int      25       Maximum depth of the learned trees
-# num_leaf        Int      10       Number of samples below which splitting stops and a leaf node is added
-# num_samples     Int      3000     Number of samples at which point we switch to in-memory subtree building
-# num_trees       Int      10       Number of trees to be learned in the random forest model
-# subsamp_rate    Double   1.0      Parameter controlling the size of each tree in the forest; samples are selected from a
-#                                   Poisson distribution with parameter subsamp_rate (the default value is 1.0)
-# feature_subset  Double   0.5      Parameter controlling the number of features used as split candidates at each tree node,
-#                                   as a power of the number of features in the dataset;
-#                                   by default the square root of the number of features (i.e., feature_subset = 0.5) is used at each node
-# impurity        String   "Gini"   Impurity measure: entropy or Gini (the default)
-# M               String   ---      Location to write matrix M containing the learned trees
-# C               String   " "      Location to write matrix C containing the number of times samples are chosen in each tree of the random forest
-# S_map           String   " "      Location to write the mappings from scale feature ids to global feature ids
-# C_map           String   " "      Location to write the mappings from categorical feature ids to global feature ids
-# fmt             String   "text"   The output format of the model (matrix M), such as "text" or "csv"
-# ---------------------------------------------------------------------------------------------
-# OUTPUT: 
-# Matrix M where each column corresponds to a node in the learned trees and each row contains the following information:
-#	M[1,j]: id of node j (in a complete binary tree)
-#	M[2,j]: tree id to which node j belongs
-#	M[3,j]: offset (no. of columns) to the left child of j
-#	M[4,j]: feature index of the feature that node j looks at, if j is an internal node; otherwise 0
-#	M[5,j]: type of the feature that node j looks at, if j is an internal node: 1 for scale and 2 for categorical features;
-#	        otherwise the label that leaf node j is supposed to predict
-#	M[6,j]: 1 if j is an internal node and the feature chosen for j is scale; otherwise the size of the subset of values
-#	        stored in rows 7,8,... if j is categorical
-#	M[7:,j]: only applicable to internal nodes. The threshold an example's feature value is compared to is stored at M[7,j]
-#	         if the feature chosen for j is scale; if the feature chosen for j is categorical, rows 7,8,... depict the
-#	         value subset chosen for j
-------------------------------------------------------------------------------------------
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f random-forest.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model
-#                         bins=20 depth=25 num_leaf=10 num_samples=3000 num_trees=10 impurity=Gini fmt=csv
-
-
-# External function for equi-height binning
-binning = externalFunction (Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef)
-	implemented in (classname="org.apache.sysml.udf.lib.BinningWrapper", exectype="mem")
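-
-# A minimal pure-DML sketch of what the binning UDF is assumed to compute, inferred
-# from its usage below (the actual logic lives in the Java class above; binning_sketch
-# is a hypothetical illustration and is not called by the algorithm): given a sorted
-# column A, pick every binsize-th value as a threshold and report how many bins exist.
-binning_sketch = function (Matrix[Double] A, Integer binsize, Integer numbins)
-	return (Matrix[Double] B, Integer numbinsdef)
-{
-	n = nrow (A);
-	B = matrix (0, rows = numbins + 1, cols = 1);
-	for (k in 1:numbins) {
-		pos = min (k * binsize, n); # clamp to the column length
-		B[k,1] = A[pos,1];
-	}
-	B[numbins + 1,1] = A[n,1]; # upper boundary: the column maximum
-	numbinsdef = numbins; # the real UDF additionally collapses duplicate boundaries
-}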
-
-       
-# Default values of some parameters    
-fileR = ifdef ($R, " ");
-fileC = ifdef ($C, " ");
-fileS_map = ifdef ($S_map, " ");
-fileC_map = ifdef ($C_map, " ");
-fileM = $M;    
-num_bins = ifdef($bins, 20); 
-depth = ifdef($depth, 25);
-num_leaf = ifdef($num_leaf, 10);
-num_trees = ifdef($num_trees, 10); 
-threshold = ifdef ($num_samples, 3000);
-imp = ifdef($impurity, "Gini");
-rate = ifdef ($subsamp_rate, 1);
-fpow = ifdef ($feature_subset, 0.5);
-fmtO = ifdef($fmt, "text");
-
-X = read($X);
-Y_bin = read($Y);
-num_records = nrow (X);
-num_classes = ncol (Y_bin);
-
-# check if there is only one class label
-Y_bin_sum = sum (ppred (colSums (Y_bin), num_records, "=="));
-if (Y_bin_sum == 1) {
-       stop ("Y contains only one class label. No model will be learned!");
-} else if (Y_bin_sum > 1) {
-       stop ("Y is not properly dummy coded. Multiple columns of Y contain 
only ones!")
-}
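-
-# e.g., for three classes and four samples a properly recoded and dummy-coded Y is
-#   1 0 0
-#   0 0 1
-#   0 1 0
-#   1 0 0
-# i.e., exactly one 1 per row, and no single column of all ones (that would trip the checks above)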
-
-# split data into X_scale and X_cat
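-# e.g., for a scale feature stored in column 1 of X and a dummy-coded categorical
-# feature with three values stored in columns 2-4, R is
-#   1 1 1
-#   2 2 4
-# (a row with start index == end index marks a scale feature below)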
-if (fileR != " ") {
-	R = read (fileR);
-	R = order (target = R, by = 2); # sort by start indices
-	dummy_coded = ppred (R[,2], R[,3], "!=");
-	R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows");
-	R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
-	if (fileS_map != " ") {
-		scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows");
-		write (scale_feature_mapping, fileS_map, format = fmtO);
-	}
-	if (fileC_map != " ") {
-		cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows");
-		write (cat_feature_mapping, fileC_map, format = fmtO);
-	}
-	sum_dummy = sum (dummy_coded);
-	if (sum_dummy == nrow (R)) { # all features categorical
-		print ("All features categorical");
-		num_cat_features = nrow (R_cat);
-		num_scale_features = 0;
-		X_cat = X;
-		distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
-		distinct_values_max = max (distinct_values);
-		distinct_values_offset = cumsum (t (distinct_values));
-		distinct_values_overall = sum (distinct_values);
-	} else if (sum_dummy == 0) { # all features scale
-		print ("All features scale");
-		num_scale_features = ncol (X);
-		num_cat_features = 0;
-		X_scale = X;
-		distinct_values_max = 1;
-	} else { # some features scale, some categorical
-		num_cat_features = nrow (R_cat);
-		num_scale_features = nrow (R_scale);
-		distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
-		distinct_values_max = max (distinct_values);
-		distinct_values_offset = cumsum (t (distinct_values));
-		distinct_values_overall = sum (distinct_values);
-
-		W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2);
-		W = matrix (W, rows = 2 * num_cat_features, cols = 1);
-		if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) {
-			W[2 * num_cat_features,] = 0;
-		}
-
-		last = ppred (R_cat[,2], ncol (X), "!=");
-		R_cat1 = (R_cat[,2] + 1) * last;
-		R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1;
-		R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);
-
-		col_tab = table (R_cat_vec, 1, W, ncol (X), 1);
-		col_ind = cumsum (col_tab);
-
-		col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows");
-		col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows");
-		X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat));
-		X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale));
-	}
-} else { # only scale features exist
-	print ("All features scale");
-	num_scale_features = ncol (X);
-	num_cat_features = 0;
-	X_scale = X;
-	distinct_values_max = 1;
-}
-
-if (num_scale_features > 0) {
-
-       print ("COMPUTING BINNING...");
-       bin_size = max (as.integer (num_records / num_bins), 1);
-	count_thresholds = matrix (0, rows = 1, cols = num_scale_features);
-	thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features);
-	parfor (i1 in 1:num_scale_features) {
-		col = order (target = X_scale[,i1], by = 1, decreasing = FALSE);
-		[col_bins, num_bins_defined] = binning (col, bin_size, num_bins);
-               count_thresholds[,i1] = num_bins_defined;
-               thresholds[,i1] = col_bins;     
-       }
-       
-       print ("PREPROCESSING SCALE FEATURE MATRIX...");
-       min_num_bins = min (count_thresholds);
-       max_num_bins = max (count_thresholds);
-       total_num_bins = sum (count_thresholds);
-       cum_count_thresholds = t (cumsum (t (count_thresholds)));
-       X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
-	parfor (i2 in 1:num_scale_features, check = 0) {
-		Xi2 = X_scale[,i2];
-		count_threshold = as.scalar (count_thresholds[,i2]);
-		offset_feature = 1;
-		if (i2 > 1) {
-			offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)]));
-		}
-
-		ti2 = t (thresholds[1:count_threshold, i2]);
-		X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<");
-	}
-}
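-
-# e.g., a scale value x = 4 with thresholds t = (2, 5, 9) is encoded in X_scale_ext
-# as the indicator row (x < 2, x < 5, x < 9) = (0, 1, 1), so the left-split label
-# counts for all candidate thresholds reduce to one product t(Y) %*% X_scale_ext below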
-
-num_features_total = num_scale_features + num_cat_features;
-num_feature_samples = as.integer (floor (num_features_total ^ fpow));
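-# e.g., with 100 features in total and feature_subset = 0.5, floor (100 ^ 0.5) = 10
-# features are sampled as split candidates at each tree node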
-
-##### INITIALIZATION
-L = matrix (1, rows = num_records, cols = num_trees); # last visited node id for each training sample
-
-# create matrix of counts (generated by a Poisson distribution) storing how many times each sample appears in each tree
-print ("COMPUTING COUNTS...");
-C = rand (rows = num_records, cols = num_trees, pdf = "poisson", lambda = rate);
-Ix_nonzero = ppred (C, 0, "!=");
-L = L * Ix_nonzero;
-total_counts = sum (C);
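-
-# with lambda = 1 this mimics bootstrap sampling: a sample appears k times in a tree
-# with probability exp (-1) / factorial (k), so about 37% of samples (k = 0) are left
-# out of any given tree; their last-visited-node entry in L was zeroed out above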
-
-
-# model
-# LARGE leaf nodes
-# NC_large[,1]: node id
-# NC_large[,2]: tree id
-# NC_large[,3]: class label
-# NC_large[,4]: no. of misclassified samples 
-# NC_large[,5]: 1 if special leaf (impure and no. of samples at that leaf > threshold) or 0 otherwise 
-NC_large = matrix (0, rows = 5, cols = 1); 
-
-# SMALL leaf nodes 
-# same schema as for LARGE leaf nodes (to be initialized)
-NC_small = matrix (0, rows = 5, cols = 1); 
-
-# LARGE internal nodes
-# Q_large[,1]: node id
-# Q_large[,2]: tree id
-Q_large = matrix (0, rows = 2, cols = num_trees); 
-Q_large[1,] = matrix (1, rows = 1, cols = num_trees);
-Q_large[2,] = t (seq (1, num_trees));
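-# initially the queue holds the root (node id 1) of every tree:
-# Q_large = [1 1 ... 1 ; 1 2 ... num_trees]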
-
-# SMALL internal nodes
-# same schema as for LARGE internal nodes (to be initialized)
-Q_small = matrix (0, rows = 2, cols = 1); 
-
-# F_large[,1]: feature
-# F_large[,2]: type
-# F_large[,3]: offset 
-F_large = matrix (0, rows = 3, cols = 1);
-
-# same schema as for LARGE nodes
-F_small = matrix (0, rows = 3, cols = 1); 
-
-# split points for LARGE internal nodes
-S_large = matrix (0, rows = 1, cols = 1);
-
-# split points for SMALL internal nodes 
-S_small = matrix (0, rows = 1, cols = 1); 
-
-# initialize queue
-cur_nodes_large = matrix (1, rows = 2, cols = num_trees);
-cur_nodes_large[2,] = t (seq (1, num_trees));
-
-num_cur_nodes_large = num_trees;
-num_cur_nodes_small = 0;
-level = 0;
-
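-# each pass of the loop below expands one level of all trees breadth-first; child
-# nodes whose (Poisson-weighted) sample count is at most num_samples (variable
-# threshold) are handed to the in-memory SMALL-node path further down
-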
-while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) {
-       
-       level = level + 1;
-       print (" --- start level " + level + " --- ");
-       
-       ##### PREPARE MODEL
-       if (num_cur_nodes_large > 0) { # LARGE nodes to process
-		cur_Q_large = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large);
-		cur_NC_large = matrix (0, rows = 5, cols = 2 * num_cur_nodes_large);
-		cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large);
-		cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max);
-		cur_nodes_small = matrix (0, rows = 3, cols = 2 * num_cur_nodes_large);
-       }
-
-       ##### LOOP OVER LARGE NODES...
-       parfor (i6 in 1:num_cur_nodes_large, check = 0) { 
-       
-               cur_node = as.scalar (cur_nodes_large[1,i6]);
-               cur_tree = as.scalar (cur_nodes_large[2,i6]);
-                       
-		# select candidate features, sampled without replacement (WOR)
-		feature_samples = sample (num_features_total, num_feature_samples);
-		feature_samples = order (target = feature_samples, by = 1);
-		num_scale_feature_samples = sum (ppred (feature_samples, num_scale_features, "<="));
-		num_cat_feature_samples = num_feature_samples - num_scale_feature_samples;
-               
-               # --- find best split ---
-               # samples that reach cur_node 
-               Ix = ppred (L[,cur_tree], cur_node, "==");              
-               
-               cur_Y_bin = Y_bin * (Ix * C[,cur_tree]);
-               label_counts_overall = colSums (cur_Y_bin);
-               label_sum_overall = sum (label_counts_overall);
-               label_dist_overall = label_counts_overall / label_sum_overall;
-
-		if (imp == "entropy") {
-			label_dist_zero = ppred (label_dist_overall, 0, "==");
-			cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); # impurity before
-		} else { # imp == "Gini"
-			cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before
-		}
-               best_scale_gain = 0;
-               best_cat_gain = 0;
-		if (num_scale_features > 0 & num_scale_feature_samples > 0) {
-
-			scale_feature_samples = feature_samples[1:num_scale_feature_samples,];
-
-			# main operation
-			label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext);
-
-			# compute left and right label distribution
-			label_sum_left = rowSums (label_counts_left_scale);
-			label_dist_left = label_counts_left_scale / label_sum_left;
-			if (imp == "entropy") {
-				label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
-				log_label_dist_left = log (label_dist_left); # / log (2)
-				impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
-			} else { # imp == "Gini"
-				impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
-			}
-			#
-			label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
-			label_sum_right = rowSums (label_counts_right_scale);
-			label_dist_right = label_counts_right_scale / label_sum_right;
-			if (imp == "entropy") {
-				label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
-				log_label_dist_right = log (label_dist_right); # / log (2)
-				impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
-			} else { # imp == "Gini"
-				impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
-			}
-
-			I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale );
-
-			I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
-
-			# determine best feature to split on and the split value
-			feature_start_ind = matrix (0, rows = 1, cols = num_scale_features);
-			feature_start_ind[1,1] = 1;
-			if (num_scale_features > 1) {
-				feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
-			}
-			max_I_gain_found = 0;
-			max_I_gain_found_ind = 0;
-			best_i = 0;
-
-			for (i in 1:num_scale_feature_samples) {
-				cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
-				cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]);
-				cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]);
-				I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
-				cur_max_I_gain = max (I_gain_portion);
-				cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion)));
-				if (cur_max_I_gain > max_I_gain_found) {
-					max_I_gain_found = cur_max_I_gain;
-					max_I_gain_found_ind = cur_max_I_gain_ind;
-					best_i = i;
-				}
-			}
-
-			best_scale_gain = max_I_gain_found;
-			max_I_gain_ind_scale = max_I_gain_found_ind;
-			best_scale_feature = 0;
-			if (best_i > 0) {
-				best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
-			}
-			best_scale_split = max_I_gain_ind_scale;
-			if (best_scale_feature > 1) {
-				best_scale_split = best_scale_split + as.scalar (cum_count_thresholds[,(best_scale_feature - 1)]);
-			}
-		}
-       
-		if (num_cat_features > 0 & num_cat_feature_samples > 0) {
-
-			cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
-
-			# initialization
-			split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
-			split_values = split_values_bin;
-			split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
-			I_gains = split_values_offset;
-			impurities_left = split_values_offset;
-			impurities_right = split_values_offset;
-			best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
-			best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
-
-			# main operation
-			label_counts = t (t (cur_Y_bin) %*% X_cat);
-
-			parfor (i9 in 1:num_cat_feature_samples, check = 0) {
-
-				cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
-				start_ind = 1;
-				if (cur_cat_feature > 1) {
-					start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]);
-				}
-				offset = as.scalar (distinct_values[1,cur_cat_feature]);
-
-				cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
-
-				label_sum = rowSums (cur_label_counts);
-				label_dist = cur_label_counts / label_sum;
-				if (imp == "entropy") {
-					label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
-					log_label_dist = log (label_dist); # / log(2)
-					impurity = - rowSums (label_dist * log_label_dist);
-					impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
-				} else { # imp == "Gini"
-					impurity = rowSums (label_dist * (1 - label_dist));
-				}
-
-				# sort cur feature by impurity
-				cur_distinct_values = seq (1, nrow (cur_label_counts));
-				cur_distinct_values_impurity = append (cur_distinct_values, impurity);
-				cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
-				P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
-				label_counts_sorted = P %*% cur_label_counts;
-
-				# compute left and right label distribution
-				label_counts_left = cumsum (label_counts_sorted);
-
-				label_sum_left = rowSums (label_counts_left);
-				label_dist_left = label_counts_left / label_sum_left;
-				label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1);
-				if (imp == "entropy") {
-					label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
-					log_label_dist_left = log (label_dist_left); # / log(2)
-					impurity_left = - rowSums (label_dist_left * log_label_dist_left);
-				} else { # imp == "Gini"
-					impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
-				}
-				#
-				label_counts_right = - label_counts_left + label_counts_overall;
-				label_sum_right = rowSums (label_counts_right);
-				label_dist_right = label_counts_right / label_sum_right;
-				label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1);
-				if (imp == "entropy") {
-					label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
-					log_label_dist_right = log (label_dist_right); # / log (2)
-					impurity_right = - rowSums (label_dist_right * log_label_dist_right);
-				} else { # imp == "Gini"
-					impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
-				}
-				I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right );
-
-				Ix_label_sum_left_zero = ppred (label_sum_left, 0, "==");
-				Ix_label_sum_right_zero = ppred (label_sum_right, 0, "==");
-				Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
-				I_gain = I_gain * (1 - Ix_label_sum_zero);
-
-				I_gain[nrow (I_gain),] = 0; # last entry invalid
-
-				max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
-
-				split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
-				for (i10 in 1:max_I_gain_ind) {
-					ind = as.scalar (cur_feature_sorted[i10,1]);
-					if (ind == 1) {
-						split_values_bin[1,start_ind] = 1.0;
-					} else {
-						split_values_bin[1,(start_ind + ind - 1)] = 1.0;
-					}
-				}
-				split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
-
-				I_gains[1,cur_cat_feature] = max (I_gain);
-
-				impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]);
-				impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]);
-				best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,];
-				best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,];
-			}
-
-			# determine best feature to split on and the split values
-			best_cat_feature = as.scalar (rowIndexMax (I_gains));
-			best_cat_gain = max (I_gains);
-			start_ind = 1;
-			if (best_cat_feature > 1) {
-				start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
-			}
-			offset = as.scalar (distinct_values[1,best_cat_feature]);
-			best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];
-		}
-       
-		# compare best scale feature to best cat. feature and pick the best one
-		if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
-                       
-                       # --- update model ---
-                       cur_F_large[1,i6] = best_scale_feature;
-                       cur_F_large[2,i6] = 1;
-                       cur_F_large[3,i6] = 1;                  
-			cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature];
-                               
-                       left_child = 2 * (cur_node - 1) + 1 + 1;
-                       right_child = 2 * (cur_node - 1) + 2 + 1;
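-			# complete-binary-tree indexing: node j has children 2j and 2j + 1,
-			# e.g., the root (id 1) has children 2 and 3, node 3 has children 6 and 7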
-                                       
-			# samples going to the left subtree
-			Ix_left = X_scale_ext[,best_scale_split];
-			Ix_left = Ix * Ix_left;
-			Ix_right = Ix * (1 - Ix_left);
-
-			L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
-			L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
-
-			left_child_size = sum (Ix_left * C[,cur_tree]);
-			right_child_size = sum (Ix_right * C[,cur_tree]);
-                       
-			# check if the left or right child is a leaf
-			left_pure = FALSE;
-			right_pure = FALSE;
-			cur_impurity_left = as.scalar (impurity_left_scale[best_scale_split,]);
-			cur_impurity_right = as.scalar (impurity_right_scale[best_scale_split,]);
-			if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) &
-			     (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) |
-			     (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaves
-
-				cur_label_counts_left = label_counts_left_scale[best_scale_split,];
-				cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
-				cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
-				cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
-				left_pure = TRUE;
-				# compute number of misclassified points
-				cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
-
-				cur_label_counts_right = label_counts_overall - cur_label_counts_left;
-				cur_NC_large[1,(2 * i6)] = right_child;
-				cur_NC_large[2,(2 * i6)] = cur_tree;
-				cur_NC_large[3,(2 * i6)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
-				right_pure = TRUE;
-				# compute number of misclassified points
-				cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
-
-			} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) |
-				   (left_child_size <= threshold & (level == depth))) {
-
-				cur_label_counts_left = label_counts_left_scale[best_scale_split,];
-				cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
-				cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
-				cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
-				left_pure = TRUE;
-				# compute number of misclassified points
-				cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
-
-			} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) |
-				   (right_child_size <= threshold & (level == depth))) {
-
-				cur_label_counts_right = label_counts_right_scale[best_scale_split,];
-				cur_NC_large[1,(2 * i6)] = right_child;
-				cur_NC_large[2,(2 * i6)] = cur_tree;
-				cur_NC_large[3,(2 * i6)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
-				right_pure = TRUE;
-				# compute number of misclassified points
-				cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
-
-			}
-               
-		} else if (num_cat_features > 0 & num_cat_feature_samples > 0 & best_cat_gain > 0) {
-
-			# --- update model ---
-			cur_F_large[1,i6] = best_cat_feature;
-			cur_F_large[2,i6] = 2;
-			offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
-			S_start_ind = (i6 - 1) * distinct_values_max + 1;
-			cur_F_large[3,i6] = offset_nonzero;
-			cur_S_large[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
-
-			left_child = 2 * (cur_node - 1) + 1 + 1;
-			right_child = 2 * (cur_node - 1) + 2 + 1;
-
-			# samples going to the left subtree
-			Ix_left = rowSums (X_cat[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
-			Ix_left = ppred (Ix_left, 1, ">=");
-			Ix_left = Ix * Ix_left;
-			Ix_right = Ix * (1 - Ix_left);
-
-			L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
-			L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
-
-			left_child_size = sum (Ix_left * C[,cur_tree]);
-			right_child_size = sum (Ix_right * C[,cur_tree]);
-
-			# check if the left or right child is a leaf
-			left_pure = FALSE;
-			right_pure = FALSE;
-			cur_impurity_left = as.scalar (impurities_left[,best_cat_feature]);
-			cur_impurity_right = as.scalar (impurities_right[,best_cat_feature]);
-			if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) &
-			     (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) |
-			     (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaves
-
-				cur_label_counts_left = best_label_counts_left[best_cat_feature,];
-				cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
-				cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
-				cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
-				left_pure = TRUE;
-				# compute number of misclassified points
-				cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
-
-				cur_label_counts_right = label_counts_overall - cur_label_counts_left;
-				cur_NC_large[1,(2 * i6)] = right_child;
-				cur_NC_large[2,(2 * i6)] = cur_tree;
-				cur_NC_large[3,(2 * i6)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
-				right_pure = TRUE;
-				# compute number of misclassified points
-				cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
-
-			} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) |
-				   (left_child_size <= threshold & (level == depth))) {
-
-				cur_label_counts_left = best_label_counts_left[best_cat_feature,];
-				cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
-				cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
-				cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
-				left_pure = TRUE;
-				# compute number of misclassified points
-				cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
-
-			} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) |
-				   (right_child_size <= threshold & (level == depth))) {
-
-				cur_label_counts_right = best_label_counts_right[best_cat_feature,];
-				cur_NC_large[1,(2 * i6)] = right_child;
-				cur_NC_large[2,(2 * i6)] = cur_tree;
-				cur_NC_large[3,(2 * i6)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
-				right_pure = TRUE;
-				# compute number of misclassified points
-				cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
-
-			}
-               } else {
-
-			print ("NUMBER OF SAMPLES AT NODE " + cur_node + " IN TREE " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
-			right_pure = TRUE;
-			left_pure = TRUE;
-			cur_NC_large[1,(2 * (i6 - 1) + 1)] = cur_node;
-			cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
-			class_label = as.scalar (rowIndexMax (label_counts_overall));
-			cur_NC_large[3,(2 * (i6 - 1) + 1)] = class_label;
-			cur_NC_large[4,(2 * (i6 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
-			cur_NC_large[5,(2 * (i6 - 1) + 1)] = 1; # special leaf
-
-               }
-               
-               # add nodes to Q
-		if (!left_pure) {
-			if (left_child_size > threshold) {
-				cur_Q_large[1,(2 * (i6 - 1) + 1)] = left_child;
-				cur_Q_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
-			} else {
-				cur_nodes_small[1,(2 * (i6 - 1) + 1)] = left_child;
-				cur_nodes_small[2,(2 * (i6 - 1) + 1)] = left_child_size;
-				cur_nodes_small[3,(2 * (i6 - 1) + 1)] = cur_tree;
-			}
-		}
-		if (!right_pure) {
-			if (right_child_size > threshold) {
-				cur_Q_large[1,(2 * i6)] = right_child;
-				cur_Q_large[2,(2 * i6)] = cur_tree;
-			} else {
-				cur_nodes_small[1,(2 * i6)] = right_child;
-				cur_nodes_small[2,(2 * i6)] = right_child_size;
-				cur_nodes_small[3,(2 * i6)] = cur_tree;
-			}
-		}
-       }
-       
-       ##### PREPARE MODEL FOR LARGE NODES
-       if (num_cur_nodes_large > 0) {
-		cur_Q_large = removeEmpty (target = cur_Q_large, margin = "cols");
-		if (as.scalar (cur_Q_large[1,1]) != 0) Q_large = append (Q_large, cur_Q_large);
-		cur_NC_large = removeEmpty (target = cur_NC_large, margin = "cols");
-		if (as.scalar (cur_NC_large[1,1]) != 0) NC_large = append (NC_large, cur_NC_large);
-
-		cur_F_large = removeEmpty (target = cur_F_large, margin = "cols");
-		if (as.scalar (cur_F_large[1,1]) != 0) F_large = append (F_large, cur_F_large);
-		cur_S_large = removeEmpty (target = cur_S_large, margin = "cols");
-		if (as.scalar (cur_S_large[1,1]) != 0) S_large = append (S_large, cur_S_large);
-
-		num_cur_nodes_large_pre = 2 * num_cur_nodes_large;
-		if (as.scalar (cur_Q_large[1,1]) == 0) {
-			num_cur_nodes_large = 0;
-		} else {
-			cur_nodes_large = cur_Q_large;
-			num_cur_nodes_large = ncol (cur_Q_large);
-		}
-       }
-
-	##### PREPARE MODEL FOR SMALL NODES
-	cur_nodes_small_nonzero = removeEmpty (target = cur_nodes_small, margin = "cols");
-	if (as.scalar (cur_nodes_small_nonzero[1,1]) != 0) { # if SMALL nodes exist
-		num_cur_nodes_small = ncol (cur_nodes_small_nonzero);
-	}
-
-	if (num_cur_nodes_small > 0) { # SMALL nodes to process
-		reserve_len = sum (2 ^ (ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))) + num_cur_nodes_small;
-		cur_Q_small = matrix (0, rows = 2, cols = reserve_len);
-		cur_F_small = matrix (0, rows = 3, cols = reserve_len);
-		cur_NC_small = matrix (0, rows = 5, cols = reserve_len);
-		cur_S_small = matrix (0, rows = 1, cols = reserve_len * distinct_values_max); # split values of the best feature
-	}
-
-       ##### LOOP OVER SMALL NODES...
-       parfor (i7 in 1:num_cur_nodes_small, check = 0) { 
-       
-               cur_node_small = as.scalar (cur_nodes_small_nonzero[1,i7]);     
-               cur_tree_small = as.scalar (cur_nodes_small_nonzero[3,i7]);
-               
-               # build dataset for SMALL node
-		Ix = ppred (L[,cur_tree_small], cur_node_small, "==");
-		if (num_scale_features > 0) {
-			X_scale_ext_small = removeEmpty (target = X_scale_ext, margin = "rows", select = Ix);
-		}
-		if (num_cat_features > 0) {
-			X_cat_small = removeEmpty (target = X_cat, margin = "rows", select = Ix);
-		}
-
-		L_small = removeEmpty (target = L * Ix, margin = "rows");
-		C_small = removeEmpty (target = C * Ix, margin = "rows");
-		Y_bin_small = removeEmpty (target = Y_bin * Ix, margin = "rows");
-
-		# compute offset
-		offsets = cumsum (t (2 ^ ceil (log (cur_nodes_small_nonzero[2,]) / log (2))));
-		start_ind_global = 1;
-		if (i7 > 1) {
-			start_ind_global = start_ind_global + as.scalar (offsets[(i7 - 1),]);
-		}
-		start_ind_S_global = 1;
-		if (i7 > 1) {
-			start_ind_S_global = start_ind_S_global + (as.scalar (offsets[(i7 - 1),]) * distinct_values_max);
-		}
-                       
-               Q = matrix (0, rows = 2, cols = 1); 
-               Q[1,1] = cur_node_small;
-               Q[2,1] = cur_tree_small;
-               F = matrix (0, rows = 3, cols = 1); 
-               NC = matrix (0, rows = 5, cols = 1); 
-               S = matrix (0, rows = 1, cols = 1); 
-
-               cur_nodes_ = matrix (cur_node_small, rows = 2, cols = 1);
-               cur_nodes_[1,1] = cur_node_small;
-               cur_nodes_[2,1] = cur_tree_small;
-               
-               num_cur_nodes = 1;
-               level_ = level;
-               while (num_cur_nodes > 0 & level_ < depth) {
-                       
-                       level_ = level_ + 1;
-                               
-			cur_Q = matrix (0, rows = 2, cols = 2 * num_cur_nodes);
-			cur_F = matrix (0, rows = 3, cols = num_cur_nodes);
-			cur_NC = matrix (0, rows = 5, cols = 2 * num_cur_nodes);
-			cur_S = matrix (0, rows = 1, cols = num_cur_nodes * distinct_values_max);
-               
-                       parfor (i8 in 1:num_cur_nodes, check = 0) { 
-                       
-                               cur_node = as.scalar (cur_nodes_[1,i8]);        
-                               cur_tree = as.scalar (cur_nodes_[2,i8]);
-                               
-				# select candidate features, sampled without replacement (WOR)
-				feature_samples = sample (num_features_total, num_feature_samples);
-				feature_samples = order (target = feature_samples, by = 1);
-				num_scale_feature_samples = sum (ppred (feature_samples, num_scale_features, "<="));
-				num_cat_feature_samples = num_feature_samples - num_scale_feature_samples;
-               
-                               # --- find best split ---
-                               # samples that reach cur_node 
-				Ix = ppred (L_small[,cur_tree], cur_node, "==");
-				cur_Y_bin = Y_bin_small * (Ix * C_small[,cur_tree]);
-				label_counts_overall = colSums (cur_Y_bin);
-
-				label_sum_overall = sum (label_counts_overall);
-				label_dist_overall = label_counts_overall / label_sum_overall;
-				if (imp == "entropy") {
-					label_dist_zero = ppred (label_dist_overall, 0, "==");
-					cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2);
-				} else { # imp == "Gini"
-					cur_impurity = sum (label_dist_overall * (1 - label_dist_overall));
-				}
-				best_scale_gain = 0;
-				best_cat_gain = 0;
-				if (num_scale_features > 0 & num_scale_feature_samples > 0) {
-                                       
-					scale_feature_samples = feature_samples[1:num_scale_feature_samples,];
-
-					# main operation
-					label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext_small);
-
-					# compute left and right label distribution
-					label_sum_left = rowSums (label_counts_left_scale);
-					label_dist_left = label_counts_left_scale / label_sum_left;
-					if (imp == "entropy") {
-						label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
-						log_label_dist_left = log (label_dist_left); # / log (2)
-						impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
-					} else { # imp == "Gini"
-						impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
-					}
-					#
-					label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
-					label_sum_right = rowSums (label_counts_right_scale);
-					label_dist_right = label_counts_right_scale / label_sum_right;
-					if (imp == "entropy") {
-						label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
-						log_label_dist_right = log (label_dist_right); # / log (2)
-						impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
-					} else { # imp == "Gini"
-						impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
-					}
-					I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale );
-
-					I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
-
-					# determine best feature to split on and the split value
-					feature_start_ind = matrix (0, rows = 1, cols = num_scale_features);
-					feature_start_ind[1,1] = 1;
-					if (num_scale_features > 1) {
-						feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
-					}
-					max_I_gain_found = 0;
-					max_I_gain_found_ind = 0;
-					best_i = 0;
-
-					for (i in 1:num_scale_feature_samples) {
-						cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
-						cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]);
-						cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]);
-						I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
-						cur_max_I_gain = max (I_gain_portion);
-						cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion)));
-						if (cur_max_I_gain > max_I_gain_found) {
-							max_I_gain_found = cur_max_I_gain;
-							max_I_gain_found_ind = cur_max_I_gain_ind;
-							best_i = i;
-						}
-					}
-
-					best_scale_gain = max_I_gain_found;
-					max_I_gain_ind_scale = max_I_gain_found_ind;
-					best_scale_feature = 0;
-					if (best_i > 0) {
-						best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
-					}
-					best_scale_split = max_I_gain_ind_scale;
-					if (best_scale_feature > 1) {
-						best_scale_split = best_scale_split + as.scalar (cum_count_thresholds[,(best_scale_feature - 1)]);
-					}
-				}
-                               
-				if (num_cat_features > 0 & num_cat_feature_samples > 0) {
-
-					cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
-
-					# initialization
-					split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
-					split_values = split_values_bin;
-					split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
-					I_gains = split_values_offset;
-					impurities_left = split_values_offset;
-					impurities_right = split_values_offset;
-					best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
-					best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
-
-					# main operation
-					label_counts = t (t (cur_Y_bin) %*% X_cat_small);
-
-					parfor (i9 in 1:num_cat_feature_samples, check = 0) {
-
-						cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
-						start_ind = 1;
-						if (cur_cat_feature > 1) {
-							start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]);
-						}
-						offset = as.scalar (distinct_values[1,cur_cat_feature]);
-
-						cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
-
-						label_sum = rowSums (cur_label_counts);
-						label_dist = cur_label_counts / label_sum;
-						if (imp == "entropy") {
-							label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
-							log_label_dist = log (label_dist); # / log(2)
-							impurity = - rowSums (label_dist * log_label_dist);
-							impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
-						} else { # imp == "Gini"
-							impurity = rowSums (label_dist * (1 - label_dist));
-						}
-
-						# sort cur feature by impurity
-						cur_distinct_values = seq (1, nrow (cur_label_counts));
-						cur_distinct_values_impurity = append (cur_distinct_values, impurity);
-						cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
-						P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
-						label_counts_sorted = P %*% cur_label_counts;
-
-						# compute left and right label distribution
-                                               label_counts_left = cumsum 
(label_counts_sorted);
-                       
-                                               label_sum_left = rowSums 
(label_counts_left);
-                                               label_dist_left = 
label_counts_left / label_sum_left;
-                                               label_dist_left = replace 
(target = label_dist_left, pattern = "NaN", replacement = 1);
-                                               if (imp == "entropy") {
-                                                       label_dist_left = 
replace (target = label_dist_left, pattern = 0, replacement = 1);
-                                                       log_label_dist_left = 
log (label_dist_left); # / log(2)
-                                                       impurity_left = - 
rowSums (label_dist_left * log_label_dist_left);
-                                               } else { # imp == "Gini"
-                                                       impurity_left = rowSums 
(label_dist_left * (1 - label_dist_left));                                      
-                                               }
-                                               #
-                                               label_counts_right = - 
label_counts_left + label_counts_overall;
-                                               label_sum_right = rowSums 
(label_counts_right);
-                                               label_dist_right = 
label_counts_right / label_sum_right;
-                                               label_dist_right = replace 
(target = label_dist_right, pattern = "NaN", replacement = 1);
-                                               if (imp == "entropy") {
-                                                       label_dist_right = 
replace (target = label_dist_right, pattern = 0, replacement = 1);
-                                                       log_label_dist_right = 
log (label_dist_right); # / log (2)
-                                                       impurity_right = - 
rowSums (label_dist_right * log_label_dist_right);                   
-                                               } else { # imp == "Gini"
-                                                       impurity_right = 
rowSums (label_dist_right * (1 - label_dist_right));                            
       
-                                               }
-                                               I_gain = cur_impurity - ( ( 
label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / 
label_sum_overall ) * impurity_right);
-                       
-                                               Ix_label_sum_left_zero = ppred 
(label_sum_left, 0, "==");
-                                               Ix_label_sum_right_zero = ppred 
(label_sum_right, 0, "==");
-                                               Ix_label_sum_zero = 
Ix_label_sum_left_zero * Ix_label_sum_right_zero;
-                                               I_gain = I_gain * (1 - 
Ix_label_sum_zero);      
-                               
-                                               I_gain[nrow (I_gain),] = 0; # 
last entry invalid
-                       
-                                               max_I_gain_ind = as.scalar 
(rowIndexMax (t (I_gain)));
-
-                                               split_values[1, 
start_ind:(start_ind + max_I_gain_ind - 1)] = t 
(cur_feature_sorted[1:max_I_gain_ind,1]);
-                                               for (i10 in 1:max_I_gain_ind) {
-                                                       ind = as.scalar 
(cur_feature_sorted[i10,1]);
-                                                       if (ind == 1) {
-                                                               
split_values_bin[1,start_ind] = 1.0; 
-                                                       } else {
-                                                               
split_values_bin[1,(start_ind + ind - 1)] = 1.0;
-                                                       }
-                                               }
-                                               
split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
-                       
-                                               I_gains[1,cur_cat_feature] = 
max (I_gain);
-               
-                                               
impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]);
-                                               
impurities_right[1,cur_cat_feature] = as.scalar 
(impurity_right[max_I_gain_ind,]);
-                                               
best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,];
-                                               
best_label_counts_right[cur_cat_feature,] = 
label_counts_right[max_I_gain_ind,];                                
-                                       }
-                       
-                                       # determine best feature to split on 
and the split values
-                                       best_cat_feature = as.scalar 
(rowIndexMax (I_gains));
-                                       best_cat_gain = max (I_gains);
-                                       start_ind = 1;
-                                       if (best_cat_feature > 1) {
-                                               start_ind = start_ind + 
as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
-                                       }
-                                       offset = as.scalar 
(distinct_values[1,best_cat_feature]);
-                                       best_split_values_bin = 
split_values_bin[1, start_ind:(start_ind + offset - 1)];
-                               }
-
-				# compare best scale feature to best cat. feature and pick the best one
-				if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
-
-					# --- update model ---
-					cur_F[1,i8] = best_scale_feature;
-					cur_F[2,i8] = 1;
-					cur_F[3,i8] = 1;
-					cur_S[1,(i8 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature];
-
-					left_child = 2 * (cur_node - 1) + 1 + 1;
-					right_child = 2 * (cur_node - 1) + 2 + 1;
-
-					# samples going to the left subtree
-					Ix_left = X_scale_ext_small[, best_scale_split];
-					Ix_left = Ix * Ix_left;
-					Ix_right = Ix * (1 - Ix_left);
-
-					L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
-					L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
-
-					left_child_size = sum (Ix_left * C_small[,cur_tree]);
-					right_child_size = sum (Ix_right * C_small[,cur_tree]);
-
-					# check if left or right child is a leaf
-					left_pure = FALSE;
-					right_pure = FALSE;
-					cur_impurity_left = as.scalar (impurity_left_scale[best_scale_split,]);
-					cur_impurity_right = as.scalar (impurity_right_scale[best_scale_split,]);
-					if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) &
-					   (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) ) { # both left and right nodes are leaf
-
-						cur_label_counts_left = label_counts_left_scale[best_scale_split,];
-						cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
-						cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
-						cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
-						left_pure = TRUE;
-						# compute number of misclassified points
-						cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
-
-						cur_label_counts_right = label_counts_overall - cur_label_counts_left;
-						cur_NC[1,(2 * i8)] = right_child;
-						cur_NC[2,(2 * i8)] = cur_tree;
-						cur_NC[3,(2 * i8)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
-						right_pure = TRUE;
-						# compute number of misclassified points
-						cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
-
-					} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) {
-
-						cur_label_counts_left = label_counts_left_scale[best_scale_split,];
-						cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
-						cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
-						cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
-						left_pure = TRUE;
-						# compute number of misclassified points
-						cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
-
-					} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) {
-
-						cur_label_counts_right = label_counts_right_scale[best_scale_split,];
-						cur_NC[1,(2 * i8)] = right_child;
-						cur_NC[2,(2 * i8)] = cur_tree;
-						cur_NC[3,(2 * i8)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
-						right_pure = TRUE;
-						# compute number of misclassified points
-						cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
-
-					}
-
-				} else if (num_cat_features > 0 & num_cat_feature_samples > 0 & best_cat_gain > 0) {
-
-					# --- update model ---
-					cur_F[1,i8] = best_cat_feature;
-					cur_F[2,i8] = 2;
-					offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]);
-					S_start_ind = (i8 - 1) * distinct_values_max + 1;
-					cur_F[3,i8] = offset_nonzero;
-					cur_S[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)];
-
-					left_child = 2 * (cur_node - 1) + 1 + 1;
-					right_child = 2 * (cur_node - 1) + 2 + 1;
-
-					# samples going to the left subtree
-					Ix_left = rowSums (X_cat_small[,start_ind:(start_ind + offset - 1)] * best_split_values_bin);
-					Ix_left = ppred (Ix_left, 1, ">=");
-					Ix_left = Ix * Ix_left;
-					Ix_right = Ix * (1 - Ix_left);
-
-					L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
-					L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
-
-					left_child_size = sum (Ix_left * C_small[,cur_tree]);
-					right_child_size = sum (Ix_right * C_small[,cur_tree]);
-
-					# check if left or right child is a leaf
-					left_pure = FALSE;
-					right_pure = FALSE;
-					cur_impurity_left = as.scalar (impurities_left[,best_cat_feature]);
-					cur_impurity_right = as.scalar (impurities_right[,best_cat_feature]);
-					if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) &
-					   (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) ) { # both left and right nodes are leaf
-
-						cur_label_counts_left = best_label_counts_left[best_cat_feature,];
-						cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
-						cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
-						cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
-						left_pure = TRUE;
-						# compute number of misclassified points
-						cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
-
-						cur_label_counts_right = label_counts_overall - cur_label_counts_left;
-						cur_NC[1,(2 * i8)] = right_child;
-						cur_NC[2,(2 * i8)] = cur_tree;
-						cur_NC[3,(2 * i8)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
-						right_pure = TRUE;
-						# compute number of misclassified points
-						cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
-
-					} else if (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) {
-
-						cur_label_counts_left = best_label_counts_left[best_cat_feature,];
-						cur_NC[1,(2 * (i8 - 1) + 1)] = left_child;
-						cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
-						cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
-						left_pure = TRUE;
-						# compute number of misclassified points
-						cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
-
-					} else if (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) {
-						cur_label_counts_right = best_label_counts_right[best_cat_feature,];
-						cur_NC[1,(2 * i8)] = right_child;
-						cur_NC[2,(2 * i8)] = cur_tree;
-						cur_NC[3,(2 * i8)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
-						right_pure = TRUE;
-						# compute number of misclassified points
-						cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right);
-
-					}
-				} else {
-
-					print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". THIS NODE IS DECLARED AS LEAF!");
-					right_pure = TRUE;
-					left_pure = TRUE;
-					cur_NC[1,(2 * (i8 - 1) + 1)] = cur_node;
-					cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree;
-					class_label = as.scalar (rowIndexMax (label_counts_overall));
-					cur_NC[3,(2 * (i8 - 1) + 1)] = class_label;
-					cur_NC[4,(2 * (i8 - 1) + 1)] = label_sum_overall - max (label_counts_overall);
-					cur_NC[5,(2 * (i8 - 1) + 1)] = 1; # special leaf
-
-				}
-
-				# add nodes to Q
-				if (!left_pure) {
-					cur_Q[1,(2 * (i8 - 1) + 1)] = left_child;
-					cur_Q[2,(2 * (i8 - 1) + 1)] = cur_tree;
-				}
-				if (!right_pure) {
-					cur_Q[1,(2 * i8)] = right_child;
-					cur_Q[2,(2 * i8)] = cur_tree;
-				}
-			}
-
-			cur_Q = removeEmpty (target = cur_Q, margin = "cols");
-			Q = append (Q, cur_Q);
-			NC = append (NC, cur_NC);
-			F = append (F, cur_F);
-			S = append (S, cur_S);
-
-			num_cur_nodes_pre = 2 * num_cur_nodes;
-			if (as.scalar (cur_Q[1,1]) == 0) {
-				num_cur_nodes = 0;
-			} else {
-				cur_nodes_ = cur_Q;
-				num_cur_nodes = ncol (cur_Q);
-			}
-		}
-
-		cur_Q_small[,start_ind_global:(start_ind_global + ncol (Q) - 1)] = Q;
-		cur_NC_small[,start_ind_global:(start_ind_global + ncol (NC) - 1)] = NC;
-		cur_F_small[,start_ind_global:(start_ind_global + ncol (F) - 1)] = F;
-		cur_S_small[,start_ind_S_global:(start_ind_S_global + ncol (S) - 1)] = S;
-	}
-
-	##### PREPARE MODEL FOR SMALL NODES
-	if (num_cur_nodes_small > 0) { # small nodes already processed
-		cur_Q_small = removeEmpty (target = cur_Q_small, margin = "cols");
-		if (as.scalar (cur_Q_small[1,1]) != 0) Q_small = append (Q_small, cur_Q_small);
-		cur_NC_small = removeEmpty (target = cur_NC_small, margin = "cols");
-		if (as.scalar (cur_NC_small[1,1]) != 0) NC_small = append (NC_small, cur_NC_small);
-
-		cur_F_small = removeEmpty (target = cur_F_small, margin = "cols");
-		if (as.scalar (cur_F_small[1,1]) != 0) F_small = append (F_small, cur_F_small);
-		cur_S_small = removeEmpty (target = cur_S_small, margin = "cols");
-		if (as.scalar (cur_S_small[1,1]) != 0) S_small = append (S_small, cur_S_small);
-
-		num_cur_nodes_small = 0; # reset
-	}
-
-	print (" --- end level " + level + ", remaining no. of LARGE nodes to expand " + num_cur_nodes_large + " --- ");
-}
-
-#### prepare model
-print ("PREPARING MODEL...");
-### large nodes
-if (as.scalar (Q_large[1,1]) == 0 & ncol (Q_large) > 1) {
-	Q_large = Q_large[,2:ncol (Q_large)];
-}
-if (as.scalar (NC_large[1,1]) == 0 & ncol (NC_large) > 1) {
-	NC_large = NC_large[,2:ncol (NC_large)];
-}
-if (as.scalar (S_large[1,1]) == 0 & ncol (S_large) > 1) {
-	S_large = S_large[,2:ncol (S_large)];
-}
-if (as.scalar (F_large[1,1]) == 0 & ncol (F_large) > 1) {
-	F_large = F_large[,2:ncol (F_large)];
-}
-### small nodes
-if (as.scalar (Q_small[1,1]) == 0 & ncol (Q_small) > 1) {
-	Q_small = Q_small[,2:ncol (Q_small)];
-}
-if (as.scalar (NC_small[1,1]) == 0 & ncol (NC_small) > 1) {
-	NC_small = NC_small[,2:ncol (NC_small)];
-}
-if (as.scalar (S_small[1,1]) == 0 & ncol (S_small) > 1) {
-	S_small = S_small[,2:ncol (S_small)];
-}
-if (as.scalar (F_small[1,1]) == 0 & ncol (F_small) > 1) {
-	F_small = F_small[,2:ncol (F_small)];
-}
-
-# check for special leaves and if there are any remove them from Q_large and Q_small
-special_large_leaves_ind = NC_large[5,];
-num_special_large_leaf = sum (special_large_leaves_ind);
-if (num_special_large_leaf > 0) {
-	print ("PROCESSING " + num_special_large_leaf + " SPECIAL LARGE LEAVES...");
-	special_large_leaves = removeEmpty (target = NC_large[1:2,] * special_large_leaves_ind, margin = "cols");
-	large_internal_ind = 1 - colSums (outer (t (special_large_leaves[1,]), Q_large[1,], "==") * outer (t (special_large_leaves[2,]), Q_large[2,], "=="));
-	Q_large = removeEmpty (target = Q_large * large_internal_ind, margin = "cols");
-	F_large = removeEmpty (target = F_large, margin = "cols"); # remove special leaves from F
-}
-
-special_small_leaves_ind = NC_small[5,];
-num_special_small_leaf = sum (special_small_leaves_ind);
-if (num_special_small_leaf > 0) {
-	print ("PROCESSING " + num_special_small_leaf + " SPECIAL SMALL LEAVES...");
-	special_small_leaves = removeEmpty (target = NC_small[1:2,] * special_small_leaves_ind, margin = "cols");
-	small_internal_ind = 1 - colSums (outer (t (special_small_leaves[1,]), Q_small[1,], "==") * outer (t (special_small_leaves[2,]), Q_small[2,], "=="));
-	Q_small = removeEmpty (target = Q_small * small_internal_ind, margin = "cols");
-	F_small = removeEmpty (target = F_small, margin = "cols"); # remove special leaves from F
-}
-
-# model corresponding to large internal nodes
-no_large_internal_node = FALSE;
-if (as.scalar (Q_large[1,1]) != 0) {
-	print ("PROCESSING LARGE INTERNAL NODES...");
-	num_large_internal = ncol (Q_large);
-	max_offset = max (max (F_large[3,]), max (F_small[3,]));
-	M1_large = matrix (0, rows = 6 + max_offset, cols = num_large_internal);
-	M1_large[1:2,] = Q_large;
-	M1_large[4:6,] = F_large;
-	# process S_large
-	cum_offsets_large = cumsum (t (F_large[3,]));
-	parfor (it in 1:num_large_internal, check = 0) {
-		start_ind = 1;
-		if (it > 1) {
-			start_ind = start_ind + as.scalar (cum_offsets_large[(it - 1),]);
-		}
-		offset = as.scalar (F_large[3,it]);
-		M1_large[7:(7 + offset - 1),it] = t (S_large[1,start_ind:(start_ind + offset - 1)]);
-	}
-} else {
-	print ("No LARGE internal nodes available");
-	no_large_internal_node = TRUE;
-}
-
-# model corresponding to small internal nodes
-no_small_internal_node = FALSE;
-if (as.scalar (Q_small[1,1]) != 0) {
-	print ("PROCESSING SMALL INTERNAL NODES...");
-	num_small_internal = ncol (Q_small);
-	M1_small = matrix (0, rows = 6 + max_offset, cols = num_small_internal);
-	M1_small[1:2,] = Q_small;
-	M1_small[4:6,] = F_small;
-	# process S_small
-	cum_offsets_small = cumsum (t (F_small[3,]));
-	parfor (it in 1:num_small_internal, check = 0) {
-		start_ind = 1;
-		if (it > 1) {
-			start_ind = start_ind + as.scalar (cum_offsets_small[(it - 1),]);
-		}
-		offset = as.scalar (F_small[3,it]);
-		M1_small[7:(7 + offset - 1),it] = t (S_small[1,start_ind:(start_ind + offset - 1)]);
-	}
-} else {
-	print ("No SMALL internal nodes available");
-	no_small_internal_node = TRUE;
-}
-
-# model corresponding to large leaf nodes
-no_large_leaf_node = FALSE;
-if (as.scalar (NC_large[1,1]) != 0) {
-	print ("PROCESSING LARGE LEAF NODES...");
-	num_large_leaf = ncol (NC_large);
-	M2_large = matrix (0, rows = 6 + max_offset, cols = num_large_leaf);
-	M2_large[1:2,] = NC_large[1:2,];
-	M2_large[5:7,] = NC_large[3:5,];
-} else {
-	print ("No LARGE leaf nodes available");
-	no_large_leaf_node = TRUE;
-}
-
-# model corresponding to small leaf nodes
-no_small_leaf_node = FALSE;
-if (as.scalar (NC_small[1,1]) != 0) {
-	print ("PROCESSING SMALL LEAF NODES...");
-	num_small_leaf = ncol (NC_small);
-	M2_small = matrix (0, rows = 6 + max_offset, cols = num_small_leaf);
-	M2_small[1:2,] = NC_small[1:2,];
-	M2_small[5:7,] = NC_small[3:5,];
-} else {
-	print ("No SMALL leaf nodes available");
-	no_small_leaf_node = TRUE;
-}
-
-if (no_large_internal_node) {
-       M1 = M1_small;
-} else if (no_small_internal_node) {
-       M1 = M1_large;
-} else {
-       M1 = append (M1_large, M1_small);
-}
-
-if (no_large_leaf_node) {
-       M2 = M2_small;
-} else if (no_small_leaf_node) {
-       M2 = M2_large;
-} else {
-       M2 = append (M2_large, M2_small);
-}
-
-M = append (M1, M2);
-M = t (order (target = t (M), by = 1)); # sort by node id
-M = t (order (target = t (M), by = 2)); # sort by tree id
-
-
-# removing redundant subtrees
-if (ncol (M) > 1) {
-       print ("CHECKING FOR REDUNDANT SUBTREES...");
-       red_leaf = TRUE;
-       process_red_subtree = FALSE;
-       invalid_node_ind = matrix (0, rows = 1, cols = ncol (M));
-       while (red_leaf & ncol (M) > 1) {
-		leaf_ind = ppred (M[4,], 0, "==");
-		labels = M[5,] * leaf_ind;
-		tree_ids = M[2,];
-		parent_ids = floor (M[1,] / 2);
-		cond1 = ppred (labels[,1:(ncol (M) - 1)], labels[,2:ncol (M)], "=="); # sibling leaves with same label
-		cond2 = ppred (parent_ids[,1:(ncol (M) - 1)], parent_ids[,2:ncol (M)], "=="); # same parents
-		cond3 = ppred (tree_ids[,1:(ncol (M) - 1)], tree_ids[,2:ncol (M)], "=="); # same tree
-		red_leaf_ind = cond1 * cond2 * cond3 * leaf_ind[,2:ncol (M)];
-
-		if (sum (red_leaf_ind) > 0) { # if redundant subtrees exist
-			red_leaf_ids = M[1:2,2:ncol (M)] * red_leaf_ind;
-			red_leaf_ids_nonzero = removeEmpty (target = red_leaf_ids, margin = "cols");
-			parfor (it in 1:ncol (red_leaf_ids_nonzero), check = 0){
-				cur_right_leaf_id = as.scalar (red_leaf_ids_nonzero[1,it]);
-				cur_parent_id = floor (cur_right_leaf_id / 2);
-				cur_tree_id = as.scalar (red_leaf_ids_nonzero[2,it]);
-				cur_right_leaf_pos = as.scalar (rowIndexMax (ppred (M[1,], cur_right_leaf_id, "==") * ppred (M[2,], cur_tree_id, "==")));
-				cur_parent_pos = as.scalar (rowIndexMax (ppred (M[1,], cur_parent_id, "==") * ppred (M[2,], cur_tree_id, "==")));
-				M[3:nrow (M), cur_parent_pos] = M[3:nrow (M), cur_right_leaf_pos];
-				M[4,cur_right_leaf_pos] = -1;
-				M[4,cur_right_leaf_pos - 1] = -1;
-				invalid_node_ind[1,cur_right_leaf_pos] = 1;
-				invalid_node_ind[1,cur_right_leaf_pos - 1] = 1;
-			}
-			process_red_subtree = TRUE;
-		} else {
-			red_leaf = FALSE;
-		}
-	}
-
-	if (process_red_subtree) {
-		print ("REMOVING REDUNDANT SUBTREES...");
-		valid_node_ind = ppred (invalid_node_ind, 0, "==");
-		M = removeEmpty (target = M * valid_node_ind, margin = "cols");
-	}
-}
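
[Editorial note: two adjacent columns of M are collapsed above only when they are sibling leaves, i.e. same tree, same parent id floor(id / 2), and the same predicted label. A minimal standalone DML sketch of that test on toy values of my own choosing, not taken from the script:]

    ids     = matrix ("2 3 4 5", rows = 1, cols = 4);   # node ids within one tree (hypothetical)
    labels  = matrix ("1 1 2 2", rows = 1, cols = 4);   # leaf labels (hypothetical)
    parents = floor (ids / 2);                          # 1 1 2 2
    # adjacent columns with equal labels AND equal parents are redundant sibling pairs
    red = ppred (labels[,1:3], labels[,2:4], "==") * ppred (parents[,1:3], parents[,2:4], "==");
    # red = (1 0 1): node pairs (2,3) and (4,5) would each be merged into their parent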
-
-internal_ind = ppred (M[4,], 0, ">");
-internal_ids = M[1:2,] * internal_ind;
-internal_ids_nonzero = removeEmpty (target = internal_ids, margin = "cols");
-if (as.scalar (internal_ids_nonzero[1,1]) > 0) { # if internal nodes exist
-	a1 = internal_ids_nonzero[1,];
-	a2 = internal_ids_nonzero[1,] * 2;
-	vcur_tree_id = internal_ids_nonzero[2,];
-	pos_a1 = rowIndexMax (outer (t (a1), M[1,], "==") * outer (t (vcur_tree_id), M[2,], "=="));
-	pos_a2 = rowIndexMax (outer (t (a2), M[1,], "==") * outer (t (vcur_tree_id), M[2,], "=="));
-	M[3,] = t (table (pos_a1, 1, pos_a2 - pos_a1, ncol (M), 1));
-} else {
-	print ("All trees in the random forest contain only one leaf!");
-}
-
-if (fileC != " ") {
-       write (C, fileC, format = fmtO);
-}
-write (M, fileM, format = fmtO);
+#-------------------------------------------------------------
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+# 
+#   http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+#-------------------------------------------------------------
+
+#  
+# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES
+#
+# INPUT PARAMETERS:
+# ---------------------------------------------------------------------------------------------
+# NAME            TYPE     DEFAULT  MEANING
+# ---------------------------------------------------------------------------------------------
+# X               String   ---      Location to read feature matrix X; note that X needs to be both recoded and dummy coded
+# Y               String   ---      Location to read label matrix Y; note that Y needs to be both recoded and dummy coded
+# R               String   " "      Location to read the matrix R which for each feature in X contains the following information
+#                                     - R[,1]: column ids
+#                                     - R[,2]: start indices
+#                                     - R[,3]: end indices
+#                                   If R is not provided, by default all variables are assumed to be scale
+# bins            Int      20       Number of equiheight bins per scale feature to choose thresholds
+# depth           Int      25       Maximum depth of the learned tree
+# num_leaf        Int      10       Number of samples when splitting stops and a leaf node is added
+# num_samples     Int      3000     Number of samples at which point we switch to in-memory subtree building
+# num_trees       Int      10       Number of trees to be learned in the random forest model
+# subsamp_rate    Double   1.0      Parameter controlling the size of each tree in the forest; samples are selected from a
+#                                   Poisson distribution with parameter subsamp_rate (the default value is 1.0)
+# feature_subset  Double   0.5      Parameter that controls the number of features used as candidates for splitting at each tree node,
+#                                   as a power of the number of features in the dataset;
+#                                   by default the square root of the number of features (i.e., feature_subset = 0.5) is used at each tree node
+# impurity        String   "Gini"   Impurity measure: entropy or Gini (the default)
+# M               String   ---      Location to write matrix M containing the learned tree
+# C               String   " "      Location to write matrix C containing the number of times samples are chosen in each tree of the random forest
+# S_map           String   " "      Location to write the mappings from scale feature ids to global feature ids
+# C_map           String   " "      Location to write the mappings from categorical feature ids to global feature ids
+# fmt             String   "text"   The output format of the model (matrix M), such as "text" or "csv"
+# ---------------------------------------------------------------------------------------------
+# OUTPUT:
+# Matrix M where each column corresponds to a node in the learned tree and each row contains the following information:
+#   M[1,j]: id of node j (in a complete binary tree)
+#   M[2,j]: tree id to which node j belongs
+#   M[3,j]: Offset (no. of columns) to left child of j
+#   M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
+#   M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
+#           otherwise the label that leaf node j is supposed to predict
+#   M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the size of the subset of values
+#           stored in rows 7,8,... if j is categorical
+#   M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is compared to is stored at M[7,j]
+#            if the feature chosen for j is scale; if the feature chosen for j is categorical, rows 7,8,... depict the value subset chosen for j
+# -------------------------------------------------------------------------------------------
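
[Editorial note: to make the column layout above concrete, a hedged illustration with invented values, not produced by any run: one tree whose root (node id 1) splits on scale feature 4 at threshold 0.7 and ends in two leaves predicting labels 1 and 2 would be stored as three columns along the lines of:]

    #           root   left leaf   right leaf
    # M[1,j]:    1        2           3        node id
    # M[2,j]:    1        1           1        tree id
    # M[3,j]:    1        0           0        offset (no. of columns) to left child
    # M[4,j]:    4        0           0        feature index (0 for leaves)
    # M[5,j]:    1        1           2        feature type for internal nodes; leaf label otherwise
    # M[6,j]:    1        0           0        1 = scale split
    # M[7,j]:    0.7      0           0        threshold of the scale split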
+# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
+# hadoop jar SystemML.jar -f random-forest.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model
+#   bins=20 depth=25 num_leaf=10 num_samples=3000 num_trees=10 impurity=Gini fmt=csv
+
+
+# External function for binning
+binning = externalFunction(Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef)
+	implemented in (classname="org.apache.sysml.udf.lib.BinningWrapper", exectype="mem")
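
[Editorial note: the binning UDF is implemented in Java, so its exact behavior is not visible in this script. Assuming it returns every bin_size-th value of the sorted column as an equi-height bin boundary, a rough pure-DML stand-in could look like the sketch below; this is a guess at the wrapper's semantics, not its actual code:]

    # hypothetical equivalent of binning (col, bin_size, num_bins);
    # assumes col is sorted ascending and num_bins * bin_size <= nrow (col)
    pos = seq (bin_size, num_bins * bin_size, bin_size);          # sampled row positions
    P = table (seq (1, nrow (pos)), pos, nrow (pos), nrow (col)); # row-selection matrix
    B = P %*% col;                                                # candidate thresholds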
+
+       
+# Default values of some parameters    
+fileR = ifdef ($R, " ");
+fileC = ifdef ($C, " ");
+fileS_map = ifdef ($S_map, " ");
+fileC_map = ifdef ($C_map, " ");
+fileM = $M;    
+num_bins = ifdef($bins, 20); 
+depth = ifdef($depth, 25);
+num_leaf = ifdef($num_leaf, 10);
+num_trees = ifdef($num_trees, 1); 
+threshold = ifdef ($num_samples, 3000);
+imp = ifdef($impurity, "Gini");
+rate = ifdef ($subsamp_rate, 1);
+fpow = ifdef ($feature_subset, 0.5);
+fmtO = ifdef($fmt, "text");
+
+X = read($X);
+Y_bin = read($Y);
+num_records = nrow (X);
+num_classes = ncol (Y_bin);
+
+# check if there is only one class label
+Y_bin_sum = sum (ppred (colSums (Y_bin), num_records, "=="));
+if (Y_bin_sum == 1) {
+       stop ("Y contains only one class label. No model will be learned!");
+} else if (Y_bin_sum > 1) {
+	stop ("Y is not properly dummy coded. Multiple columns of Y contain only ones!")
+}
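
[Editorial note: for intuition about this check, a properly dummy-coded Y has one 0/1 column per class, so a column summing to num_records means every sample carries that single label. A tiny self-contained illustration with invented data:]

    Y_toy = matrix ("1 0 0 1 1 0", rows = 3, cols = 2);               # 3 samples, 2 classes
    n_degenerate = sum (ppred (colSums (Y_toy), nrow (Y_toy), "=="));
    # n_degenerate is 0 here; a value of 1 would trigger the stop() above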
+
+# split data into X_scale and X_cat
+if (fileR != " ") {
+       R = read (fileR);
+       R = order (target = R, by = 2); # sort by start indices
+	dummy_coded = ppred (R[,2], R[,3], "!=");
+	R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows");
+	R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
+	if (fileS_map != " ") {
+		scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows");
+		write (scale_feature_mapping, fileS_map, format = fmtO);
+	}
+	if (fileC_map != " ") {
+		cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows");
+		write (cat_feature_mapping, fileC_map, format = fmtO);
+	}
+	sum_dummy = sum (dummy_coded);
+	if (sum_dummy == nrow (R)) { # all features categorical
+		print ("All features categorical");
+		num_cat_features = nrow (R_cat);
+		num_scale_features = 0;
+		X_cat = X;
+		distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
+		distinct_values_max = max (distinct_values);
+		distinct_values_offset = cumsum (t (distinct_values));
+		distinct_values_overall = sum (distinct_values);
+	} else if (sum_dummy == 0) { # all features scale
+		print ("All features scale");
+		num_scale_features = ncol (X);
+		num_cat_features = 0;
+		X_scale = X;
+		distinct_values_max = 1;
+	} else { # some features scale some features categorical
+		num_cat_features = nrow (R_cat);
+		num_scale_features = nrow (R_scale);
+		distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
+		distinct_values_max = max (distinct_values);
+		distinct_values_offset = cumsum (t (distinct_values));
+		distinct_values_overall = sum (distinct_values);
+
+		W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2);
+		W = matrix (W, rows = 2 * num_cat_features, cols = 1);
+		if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) {
+			W[2 * num_cat_features,] = 0;
+		}
+
+		last = ppred (R_cat[,2], ncol (X), "!=");
+		R_cat1 = (R_cat[,2] + 1) * last;
+		R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1;
+		R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);
+
+		col_tab = table (R_cat_vec, 1, W, ncol (X), 1);
+		col_ind = cumsum (col_tab);
+
+		col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows");
+		col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows");
+		X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat));
+		X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale));
+	}
+} else { # only scale features exist
+       print ("All features scale");
+       num_scale_features = ncol (X);
+       num_cat_features = 0;
+       X_scale = X;
+       distinct_values_max = 1;
+}      
+
+if (num_scale_features > 0) {
+
+       print ("COMPUTING BINNING...");
+       bin_size = max (as.integer (num_records / num_bins), 1);
+       count_thresholds = matrix (0, rows = 1, cols = num_scale_features)
+       thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features)
+       parfor(i1 in 1:num_scale_features) { 
+               col = order (target = X_scale[,i1], by = 1, decreasing = FALSE);
+		[col_bins, num_bins_defined] = binning (col, bin_size, num_bins);
+		count_thresholds[,i1] = num_bins_defined;
+		thresholds[,i1] = col_bins;
+	}
+
+	print ("PREPROCESSING SCALE FEATURE MATRIX...");
+	min_num_bins = min (count_thresholds);
+	max_num_bins = max (count_thresholds);
+	total_num_bins = sum (count_thresholds);
+	cum_count_thresholds = t (cumsum (t (count_thresholds)));
+	X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
+	parfor (i2 in 1:num_scale_features, check = 0) {
+		Xi2 = X_scale[,i2];
+		count_threshold = as.scalar (count_thresholds[,i2]);
+		offset_feature = 1;
+		if (i2 > 1) {
+			offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)]));
+		}
+
+		ti2 = t (thresholds[1:count_threshold, i2]);
+		X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<");
+	}
+}
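
[Editorial note: the outer (Xi2, ti2, "<") call expands each scale feature into one 0/1 column per candidate threshold: entry (i,k) is 1 iff sample i lies below threshold k, i.e. would go left under that split. A small standalone illustration with invented values:]

    Xi = matrix ("0.5 0.95", rows = 2, cols = 1);     # two sample values
    ti = matrix ("0.3 0.6 0.9", rows = 1, cols = 3);  # three candidate thresholds
    B = outer (Xi, ti, "<");
    # row for 0.5 is (0 1 1); row for 0.95 is (0 0 0)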
+
+num_features_total = num_scale_features + num_cat_features;
+num_feature_samples = as.integer (floor (num_features_total ^ fpow));
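
[Editorial note: with the default feature_subset = 0.5 this is the usual square-root rule; for example, a hypothetical dataset with 100 total features gives:]

    # floor (100 ^ 0.5) = 10 candidate features sampled at each node
    num_feature_samples_toy = as.integer (floor (100 ^ 0.5));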
+
+##### INITIALIZATION
+L = matrix (1, rows = num_records, cols = num_trees); # last visited node id for each training sample
+
+# create matrix of counts (generated by Poisson distribution) storing how many times each sample appears in each tree
+print ("COMPUTING COUNTS...");
+C = rand (rows = num_records, cols = num_trees, pdf = "poisson", lambda = rate);
+Ix_nonzero = ppred (C, 0, "!=");
+L = L * Ix_nonzero;
+total_counts = sum (C);
+
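
[Editorial note: since lambda = subsamp_rate defaults to 1.0, each tree draws on average num_records weighted samples, and rows where the count is zero are effectively that tree's out-of-bag set. A toy sketch of inspecting those counts:]

    C_toy = rand (rows = 8, cols = 2, pdf = "poisson", lambda = 1.0); # counts for 8 samples, 2 trees
    oob_per_tree = colSums (ppred (C_toy, 0, "=="));                  # samples unused by each tree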
+
+# model
+# LARGE leaf nodes
+# NC_large[,1]: node id
+# NC_large[,2]: tree id
+# NC_large[,3]: class label
+# NC_large[,4]: no. of misclassified samples 
+# NC_large[,5]: 1 if special leaf (impure and no. of samples at that leaf > threshold) or 0 otherwise
+NC_large = matrix (0, rows = 5, cols = 1); 
+
+# SMALL leaf nodes 
+# same schema as for LARGE leaf nodes (to be initialized)
+NC_small = matrix (0, rows = 5, cols = 1); 
+
+# LARGE internal nodes
+# Q_large[,1]: node id
+# Q_large[,2]: tree id
+Q_large = matrix (0, rows = 2, cols = num_trees); 
+Q_large[1,] = matrix (1, rows = 1, cols = num_trees);
+Q_large[2,] = t (seq (1, num_trees));
+
+# SMALL internal nodes
+# same schema as for LARGE internal nodes (to be initialized)
+Q_small = matrix (0, rows = 2, cols = 1); 
+
+# F_large[,1]: feature
+# F_large[,2]: type
+# F_large[,3]: offset 
+F_large = matrix (0, rows = 3, cols = 1);
+
+# same schema as for LARGE nodes
+F_small = matrix (0, rows = 3, cols = 1); 
+
+# split points for LARGE internal nodes
+S_large = matrix (0, rows = 1, cols = 1);
+
+# split points for SMALL internal nodes 
+S_small = matrix (0, rows = 1, cols = 1); 
+
+# initialize queue
+cur_nodes_large = matrix (1, rows = 2, cols = num_trees);
+cur_nodes_large[2,] = t (seq (1, num_trees));
+
+num_cur_nodes_large = num_trees;
+num_cur_nodes_small = 0;
+level = 0;
+
+while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) {
+       
+       level = level + 1;
+       print (" --- start level " + level + " --- ");
+       
+       ##### PREPARE MODEL
+       if (num_cur_nodes_large > 0) { # LARGE nodes to process
+		cur_Q_large = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large);
+		cur_NC_large = matrix (0, rows = 5, cols = 2 * num_cur_nodes_large);
+		cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large);
+		cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max);
+		cur_nodes_small = matrix (0, rows = 3, cols = 2 * num_cur_nodes_large);
+	}
+
+       ##### LOOP OVER LARGE NODES...
+       parfor (i6 in 1:num_cur_nodes_large, check = 0) { 
+       
+               cur_node = as.scalar (cur_nodes_large[1,i6]);
+               cur_tree = as.scalar (cur_nodes_large[2,i6]);
+                       
+               # select sample features WOR
+		feature_samples = sample (num_features_total, num_feature_samples);
+		feature_samples = order (target = feature_samples, by = 1);
+		num_scale_feature_samples = sum (ppred (feature_samples, num_scale_features, "<="));
+		num_cat_feature_samples = num_feature_samples - num_scale_feature_samples;
+
+		# --- find best split ---
+		# samples that reach cur_node
+		Ix = ppred (L[,cur_tree], cur_node, "==");
+
+		cur_Y_bin = Y_bin * (Ix * C[,cur_tree]);
+		label_counts_overall = colSums (cur_Y_bin);
+		label_sum_overall = sum (label_counts_overall);
+		label_dist_overall = label_counts_overall / label_sum_overall;
+
+		if (imp == "entropy") {
+			label_dist_zero = ppred (label_dist_overall, 0, "==");
+			cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); # impurity before
+		} else { # imp == "Gini"
+			cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before
+		}
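
[Editorial note: as a quick sanity check of the two impurity formulas above, the uniform two-class distribution p = (0.5, 0.5) gives Gini 0.5 and entropy log(2), natural log, matching the commented-out / log (2) normalization:]

    p = matrix ("0.5 0.5", rows = 1, cols = 2);
    gini = sum (p * (1 - p));    # = 0.5
    entr = - sum (p * log (p));  # = log (2), about 0.693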
+               best_scale_gain = 0;
+               best_cat_gain = 0;
+               if (num_scale_features > 0 & num_scale_feature_samples > 0) {
+
+			scale_feature_samples = feature_samples[1:num_scale_feature_samples,];
+
+			# main operation
+			label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext);
+
+			# compute left and right label distribution
+			label_sum_left = rowSums (label_counts_left_scale);
+			label_dist_left = label_counts_left_scale / label_sum_left;
+			if (imp == "entropy") {
+				label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
+				log_label_dist_left = log (label_dist_left); # / log (2)
+				impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left);
+			} else { # imp == "Gini"
+				impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left));
+			}
+			#
+			label_counts_right_scale = - label_counts_left_scale + label_counts_overall;
+			label_sum_right = rowSums (label_counts_right_scale);
+			label_dist_right = label_counts_right_scale / label_sum_right;
+			if (imp == "entropy") {
+				label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
+				log_label_dist_right = log (label_dist_right); # / log (2)
+				impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right);
+			} else { # imp == "Gini"
+				impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right));
+			}
+
+			I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale);
+
+			I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0);
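
[Editorial note: the gain computed here is the parent impurity minus the size-weighted child impurities. A worked toy case: parent Gini 0.5, half the weighted samples to each side, both children pure:]

    # I_gain = 0.5 - (0.5 * 0 + 0.5 * 0) = 0.5
    gain_toy = 0.5 - (0.5 * 0.0 + 0.5 * 0.0);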
+
+			# determine best feature to split on and the split value
+			feature_start_ind = matrix (0, rows = 1, cols = num_scale_features);
+			feature_start_ind[1,1] = 1;
+			if (num_scale_features > 1) {
+				feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1;
+			}
+			max_I_gain_found = 0;
+			max_I_gain_found_ind = 0;
+			best_i = 0;
+
+			for (i in 1:num_scale_feature_samples) { # assuming feature_samples is 5x1
+				cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]);
+				cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]);
+				cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]);
+				I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,];
+				cur_max_I_gain = max (I_gain_portion);
+				cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion)));
+				if (cur_max_I_gain > max_I_gain_found) {
+					max_I_gain_found = cur_max_I_gain;
+					max_I_gain_found_ind = cur_max_I_gain_ind;
+					best_i = i;
+				}
+			}
+
+			best_scale_gain = max_I_gain_found;
+			max_I_gain_ind_scale = max_I_gain_found_ind;
+			best_scale_feature = 0;
+			if (best_i > 0) {
+				best_scale_feature = as.scalar (scale_feature_samples[best_i,]);
+			}
+			best_scale_split = max_I_gain_ind_scale;
+			if (best_scale_feature > 1) {
+				best_scale_split = best_scale_split + as.scalar (cum_count_thresholds[,(best_scale_feature - 1)]);
+			}
+		}
+       
+               if (num_cat_features > 0 & num_cat_feature_samples > 0){
+                       
+                       cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features;
+                       
+                       # initialization
+                       split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall);
+                       split_values = split_values_bin;
+                       split_values_offset = matrix (0, rows = 1, cols = num_cat_features);
+                       I_gains = split_values_offset; 
+                       impurities_left = split_values_offset;
+                       impurities_right = split_values_offset;
+                       best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes);
+                       best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes);
+                       
+                       # main operation
+                       label_counts = t (t (cur_Y_bin) %*% X_cat);
+                       
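+                       # label_counts holds per (categorical value, class) counts for the current
+                       # node, obtained via a single matrix multiply; each sampled feature is then
+                       # processed independently (parfor with dependency checking disabled)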
+                       parfor (i9 in 1:num_cat_feature_samples, check = 0){
+                       
+                               cur_cat_feature = as.scalar (cat_feature_samples[i9,1]);
+                               start_ind = 1;
+                               if (cur_cat_feature > 1) {
+                                       start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]);
+                               }
+                               offset = as.scalar (distinct_values[1,cur_cat_feature]);
+                               
+                               cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),];
+                               label_sum = rowSums (cur_label_counts);
+                               label_dist = cur_label_counts / label_sum;
+                               if (imp == "entropy") {
+                                       label_dist = replace (target = label_dist, pattern = 0, replacement = 1);
+                                       log_label_dist = log (label_dist); # / log(2)
+                                       impurity = - rowSums (label_dist * log_label_dist);
+                                       impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0);
+                               } else { # imp == "Gini"
+                                       impurity = rowSums (label_dist * (1 - label_dist));
+                               }
+                       
+                               # sort cur feature by impurity
+                               cur_distinct_values = seq (1, nrow (cur_label_counts));
+                               cur_distinct_values_impurity = append (cur_distinct_values, impurity);
+                               cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE);
+                               P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix
+                               label_counts_sorted = P %*% cur_label_counts;
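+                               # with values sorted by impurity, every prefix of the sorted order is
+                               # a candidate left-branch value subset; cumsum enumerates them all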
+
+                               # compute left and right label distribution
+                               label_counts_left = cumsum (label_counts_sorted);
+                       
+                               label_sum_left = rowSums (label_counts_left);
+                               label_dist_left = label_counts_left / label_sum_left;
+                               label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1);
+                               if (imp == "entropy") {
+                                       label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1);
+                                       log_label_dist_left = log (label_dist_left); # / log(2)
+                                       impurity_left = - rowSums (label_dist_left * log_label_dist_left);
+                               } else { # imp == "Gini"
+                                       impurity_left = rowSums (label_dist_left * (1 - label_dist_left));
+                               }
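+                               # right-branch counts are the node totals minus the left prefix counts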
+                               label_counts_right = - label_counts_left + label_counts_overall;
+                               label_sum_right = rowSums (label_counts_right);
+                               label_dist_right = label_counts_right / label_sum_right;
+                               label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1);
+                               if (imp == "entropy") {
+                                       label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1);
+                                       log_label_dist_right = log (label_dist_right); # / log (2)
+                                       impurity_right = - rowSums (label_dist_right * log_label_dist_right);
+                               } else { # imp == "Gini"
+                                       impurity_right = rowSums (label_dist_right * (1 - label_dist_right));
+                               }
+                               I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right );
+                       
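+                               # zero out the gain where neither side receives any labels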
+                               Ix_label_sum_left_zero = ppred (label_sum_left, 0, "==");
+                               Ix_label_sum_right_zero = ppred (label_sum_right, 0, "==");
+                               Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero;
+                               I_gain = I_gain * (1 - Ix_label_sum_zero);
+                               
+                               I_gain[nrow (I_gain),] = 0; # last entry invalid
+                       
+                               max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain)));
+
+                               split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]);
+                               for (i10 in 1:max_I_gain_ind) {
+                                       ind = as.scalar (cur_feature_sorted[i10,1]);
+                                       if (ind == 1) {
+                                               split_values_bin[1,start_ind] = 1.0;
+                                       } else {
+                                               split_values_bin[1,(start_ind + ind - 1)] = 1.0;
+                                       }
+                               }
+                               split_values_offset[1,cur_cat_feature] = max_I_gain_ind;
+                       
+                               I_gains[1,cur_cat_feature] = max (I_gain);
+                       
+                               impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]);
+                               impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]);
+                               best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,];
+                               best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,];
+                       }
+                       
+                       # determine best feature to split on and the split values
+                       best_cat_feature = as.scalar (rowIndexMax (I_gains));
+                       best_cat_gain = max (I_gains);
+                       start_ind = 1;
+                       if (best_cat_feature > 1) {
+                               start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]);
+                       }
+                       offset = as.scalar (distinct_values[1,best_cat_feature]);
+                       best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)];
+               }               
+       
+               # compare best scale feature to best cat. feature and pick the best one
+               if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) {
+                       
+                       # --- update model ---
+                       cur_F_large[1,i6] = best_scale_feature;
+                       cur_F_large[2,i6] = 1;
+                       cur_F_large[3,i6] = 1;                  
+                       cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature];
+                               
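+                       # node ids follow 1-based heap numbering: children of node n are 2n and 2n+1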
+                       left_child = 2 * (cur_node - 1) + 1 + 1;
+                       right_child = 2 * (cur_node - 1) + 2 + 1;
+                                       
+                       # samples going to the left subtree
+                       Ix_left = X_scale_ext[,best_scale_split];
+
+                       Ix_left = Ix * Ix_left;         
+                       Ix_right = Ix * (1 - Ix_left);
+                               
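+                       # reassign the node's samples in L to the left/right child node ids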
+                       L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child);
+                       L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child);
+                       
+                       left_child_size = sum (Ix_left * C[,cur_tree]);
+                       right_child_size = sum (Ix_right * C[,cur_tree]);
+                       
+                       # check if left or right child is a leaf
+                       left_pure = FALSE;
+                       right_pure = FALSE;
+                       cur_impurity_left = as.scalar (impurity_left_scale[best_scale_split,]); # max_I_gain_ind_scale
+                       cur_impurity_right = as.scalar (impurity_right_scale[best_scale_split,]); # max_I_gain_ind_scale
+                       if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) & 
+                          (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) | 
+                          (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf
+                               
+                               cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale
+                               cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
+                               cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
+                               cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
+                               left_pure = TRUE;       
+                               # compute number of misclassified points 
+                               cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left);
+                               
+                               cur_label_counts_right = label_counts_overall - cur_label_counts_left;
+                               cur_NC_large[1,(2 * i6)] = right_child; 
+                               cur_NC_large[2,(2 * i6)] = cur_tree;
+                               cur_NC_large[3,(2 * i6)] = as.scalar (rowIndexMax (cur_label_counts_right)); # leaf class label
+                               right_pure = TRUE;      
+                               # compute number of misclassified points
+                               cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right);
+                               
+                       } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) | 
+                                         (left_child_size <= threshold & (level == depth))) {
+                               
+                               cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale
+                               cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child;
+                               cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree;
+                               cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar (rowIndexMax (cur_label_counts_left)); # leaf class label
+                               left_pure = TRUE;
+                               # compute number of misclassified points 
+                               cur_NC_large[4,(2 * (i

<TRUNCATED>
