http://git-wip-us.apache.org/repos/asf/incubator-systemml/blob/05d2c0a8/scripts/algorithms/random-forest.dml
----------------------------------------------------------------------
diff --git a/scripts/algorithms/random-forest.dml b/scripts/algorithms/random-forest.dml
index 7bdc1fb..b68d711 100644
--- a/scripts/algorithms/random-forest.dml
+++ b/scripts/algorithms/random-forest.dml
@@ -1,1375 +1,1375 @@
-#-------------------------------------------------------------
-#
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements. See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership. The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License. You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied. See the License for the
-# specific language governing permissions and limitations
-# under the License.
-#
-#-------------------------------------------------------------
-
-#
-# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES
-#
-# INPUT PARAMETERS:
-# ---------------------------------------------------------------------------------------------
-# NAME            TYPE    DEFAULT  MEANING
-# ---------------------------------------------------------------------------------------------
-# X               String  ---      Location to read feature matrix X; note that X needs to be both recoded and dummy coded
-# Y               String  ---      Location to read label matrix Y; note that Y needs to be both recoded and dummy coded
-# R               String  " "      Location to read the matrix R which for each feature in X contains the following information
-#                                  - R[,1]: column ids
-#                                  - R[,2]: start indices
-#                                  - R[,3]: end indices
-#                                  If R is not provided by default all variables are assumed to be scale
-# bins            Int     20       Number of equiheight bins per scale feature to choose thresholds
-# depth           Int     25       Maximum depth of the learned tree
-# num_leaf        Int     10       Number of samples when splitting stops and a leaf node is added
-# num_samples     Int     3000     Number of samples at which point we switch to in-memory subtree building
-# num_trees       Int     10       Number of trees to be learned in the random forest model
-# subsamp_rate    Double  1.0      Parameter controlling the size of each tree in the forest; samples are selected from a
-#                                  Poisson distribution with parameter subsamp_rate (the default value is 1.0)
-# feature_subset  Double  0.5      Parameter that controls the number of feature used as candidates for splitting at each tree node
-#                                  as a power of number of features in the dataset;
-#                                  by default square root of features (i.e., feature_subset = 0.5) are used at each tree node
-# impurity        String  "Gini"   Impurity measure: entropy or Gini (the default)
-# M               String  ---      Location to write matrix M containing the learned tree
-# C               String  " "      Location to write matrix C containing the number of times samples are chosen in each tree of the random forest
-# S_map           String  " "      Location to write the mappings from scale feature ids to global feature ids
-# C_map           String  " "      Location to write the mappings from categorical feature ids to global feature ids
-# fmt             String  "text"   The output format of the model (matrix M), such as "text" or "csv"
-# ---------------------------------------------------------------------------------------------
-# OUTPUT:
-# Matrix M where each column corresponds to a node in the learned tree and each row contains the following information:
-# M[1,j]: id of node j (in a complete binary tree)
-# M[2,j]: tree id to which node j belongs
-# M[3,j]: Offset (no. of columns) to left child of j
-# M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0
-# M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features,
-#         otherwise the label that leaf node j is supposed to predict
-# M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the size of the subset of values
-#         stored in rows 7,8,... if j is categorical
-# M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is compared to is stored at M[7,j] if the feature chosen for j is scale;
-#          If the feature chosen for j is categorical rows 7,8,... depict the value subset chosen for j
-# -------------------------------------------------------------------------------------------
-# HOW TO INVOKE THIS SCRIPT - EXAMPLE:
-# hadoop jar SystemML.jar -f random-forest.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model
-#                         bins=20 depth=25 num_leaf=10 num_samples=3000 num_trees=10 impurity=Gini fmt=csv
-
-
-# External function for binning
-binning = externalFunction(Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef)
-    implemented in (classname="org.apache.sysml.udf.lib.BinningWrapper",exectype="mem")
-
-
-# Default values of some parameters
-fileR = ifdef ($R, " ");
-fileC = ifdef ($C, " ");
-fileS_map = ifdef ($S_map, " ");
-fileC_map = ifdef ($C_map, " ");
-fileM = $M;
-num_bins = ifdef($bins, 20);
-depth = ifdef($depth, 25);
-num_leaf = ifdef($num_leaf, 10);
-num_trees = ifdef($num_trees, 1);
-threshold = ifdef ($num_samples, 3000);
-imp = ifdef($impurity, "Gini");
-rate = ifdef ($subsamp_rate, 1);
-fpow = ifdef ($feature_subset, 0.5);
-fmtO = ifdef($fmt, "text");
-
-X = read($X);
-Y_bin = read($Y);
-num_records = nrow (X);
-num_classes = ncol (Y_bin);
-
-# check if there is only one class label
-Y_bin_sum = sum (ppred (colSums (Y_bin), num_records, "=="));
-if (Y_bin_sum == 1) {
-  stop ("Y contains only one class label. No model will be learned!");
-} else if (Y_bin_sum > 1) {
-  stop ("Y is not properly dummy coded. Multiple columns of Y contain only ones!")
-}
-
-# split data into X_scale and X_cat
-if (fileR != " ") {
-  R = read (fileR);
-  R = order (target = R, by = 2); # sort by start indices
-  dummy_coded = ppred (R[,2], R[,3], "!=");
-  R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows");
-  R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows");
-  if (fileS_map != " ") {
-    scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows");
-    write (scale_feature_mapping, fileS_map, format = fmtO);
-  }
-  if (fileC_map != " ") {
-    cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows");
-    write (cat_feature_mapping, fileC_map, format = fmtO);
-  }
-  sum_dummy = sum (dummy_coded);
-  if (sum_dummy == nrow (R)) { # all features categorical
-    print ("All features categorical");
-    num_cat_features = nrow (R_cat);
-    num_scale_features = 0;
-    X_cat = X;
-    distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
-    distinct_values_max = max (distinct_values);
-    distinct_values_offset = cumsum (t (distinct_values));
-    distinct_values_overall = sum (distinct_values);
-  } else if (sum_dummy == 0) { # all features scale
-    print ("All features scale");
-    num_scale_features = ncol (X);
-    num_cat_features = 0;
-    X_scale = X;
-    distinct_values_max = 1;
-  } else { # some features scale some features categorical
-    num_cat_features = nrow (R_cat);
-    num_scale_features = nrow (R_scale);
-    distinct_values = t (R_cat[,2] - R_cat[,1] + 1);
-    distinct_values_max = max (distinct_values);
-    distinct_values_offset = cumsum (t (distinct_values));
-    distinct_values_overall = sum (distinct_values);
-
-    W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2);
-    W = matrix (W, rows = 2 * num_cat_features, cols = 1);
-    if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) {
-      W[2 * num_cat_features,] = 0;
-    }
-
-    last = ppred (R_cat[,2], ncol (X), "!=");
-    R_cat1 = (R_cat[,2] + 1) * last;
-    R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1;
-    R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1);
-
-    col_tab = table (R_cat_vec, 1, W, ncol (X), 1);
-    col_ind = cumsum (col_tab);
-
-    col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows");
-    col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows");
-    X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat));
-    X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale));
-  }
-} else { # only scale features exist
-  print ("All features scale");
-  num_scale_features = ncol (X);
-  num_cat_features = 0;
-  X_scale = X;
-  distinct_values_max = 1;
-}
-
-if (num_scale_features > 0) {
-
-  print ("COMPUTING BINNING...");
-  bin_size = max (as.integer (num_records / num_bins), 1);
-  count_thresholds = matrix (0, rows = 1, cols = num_scale_features)
-  thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features)
-  parfor(i1 in 1:num_scale_features) {
-    col = order (target = X_scale[,i1], by = 1, decreasing = FALSE);
-    [col_bins, num_bins_defined] = binning (col, bin_size, num_bins);
-    count_thresholds[,i1] = num_bins_defined;
-    thresholds[,i1] = col_bins;
-  }
-
-  print ("PREPROCESSING SCALE FEATURE MATRIX...");
-  min_num_bins = min (count_thresholds);
-  max_num_bins = max (count_thresholds);
-  total_num_bins = sum (count_thresholds);
-  cum_count_thresholds = t (cumsum (t (count_thresholds)));
-  X_scale_ext = matrix (0, rows = num_records, cols = total_num_bins);
-  parfor (i2 in 1:num_scale_features, check = 0) {
-    Xi2 = X_scale[,i2];
-    count_threshold = as.scalar (count_thresholds[,i2]);
-    offset_feature = 1;
-    if (i2 > 1) {
-      offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)]));
-    }
-
-    ti2 = t(thresholds[1:count_threshold, i2]);
-    X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<");
-  }
-}
-
-num_features_total = num_scale_features + num_cat_features;
-num_feature_samples = as.integer (floor (num_features_total ^ fpow));
-
-##### INITIALIZATION
-L = matrix (1, rows = num_records, cols = num_trees); # last visited node id for each training sample
-
-# create matrix of counts (generated by Poisson distribution) storing how many times each sample appears in each tree
-print ("COMPUTING COUNTS...");
-C = rand (rows = num_records, cols = num_trees, pdf = "poisson", lambda = rate);
-Ix_nonzero = ppred (C, 0, "!=");
-L = L * Ix_nonzero;
-total_counts = sum (C);
-
-
-# model
-# LARGE leaf nodes
-# NC_large[,1]: node id
-# NC_large[,2]: tree id
-# NC_large[,3]: class label
-# NC_large[,4]: no. of misclassified samples
-# NC_large[,5]: 1 if special leaf (impure and 3 samples at that leaf > threshold) or 0 otherwise
-NC_large = matrix (0, rows = 5, cols = 1);
-
-# SMALL leaf nodes
-# same schema as for LARGE leaf nodes (to be initialized)
-NC_small = matrix (0, rows = 5, cols = 1);
-
-# LARGE internal nodes
-# Q_large[,1]: node id
-# Q_large[,2]: tree id
-Q_large = matrix (0, rows = 2, cols = num_trees);
-Q_large[1,] = matrix (1, rows = 1, cols = num_trees);
-Q_large[2,] = t (seq (1, num_trees));
-
-# SMALL internal nodes
-# same schema as for LARGE internal nodes (to be initialized)
-Q_small = matrix (0, rows = 2, cols = 1);
-
-# F_large[,1]: feature
-# F_large[,2]: type
-# F_large[,3]: offset
-F_large = matrix (0, rows = 3, cols = 1);
-
-# same schema as for LARGE nodes
-F_small = matrix (0, rows = 3, cols = 1);
-
-# split points for LARGE internal nodes
-S_large = matrix (0, rows = 1, cols = 1);
-
-# split points for SMALL internal nodes
-S_small = matrix (0, rows = 1, cols = 1);
-
-# initialize queue
-cur_nodes_large = matrix (1, rows = 2, cols = num_trees);
-cur_nodes_large[2,] = t (seq (1, num_trees));
-
-num_cur_nodes_large = num_trees;
-num_cur_nodes_small = 0;
-level = 0;
-
-while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) {
-
-  level = level + 1;
-  print (" --- start level " + level + " --- ");
-
-  ##### PREPARE MODEL
-  if (num_cur_nodes_large > 0) { # LARGE nodes to process
-    cur_Q_large = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large);
-    cur_NC_large = matrix (0, rows = 5, cols = 2 * num_cur_nodes_large);
-    cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large);
-    cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max);
-    cur_nodes_small = matrix (0, rows = 3, cols = 2 * num_cur_nodes_large);
-  }
-
-  ##### LOOP OVER LARGE NODES...
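Aside (not part of the original script): inside the loop below, every candidate split is scored by the impurity gain cur_impurity - (nL/n) * impurity_left - (nR/n) * impurity_right. A minimal stand-alone DML sketch of that computation for the default Gini measure, using hypothetical label counts for one two-class node (the numbers are illustrative only):

# hypothetical counts: 40 weighted samples at the node, 22 of which the candidate split sends left
label_counts_overall = matrix ("30 10", rows = 1, cols = 2);
label_counts_left    = matrix ("20 2",  rows = 1, cols = 2);
label_counts_right   = label_counts_overall - label_counts_left;
n  = sum (label_counts_overall);
nL = sum (label_counts_left);
nR = sum (label_counts_right);
dist  = label_counts_overall / n;
distL = label_counts_left / nL;
distR = label_counts_right / nR;
imp  = sum (dist  * (1 - dist));    # Gini impurity at the node before splitting
impL = sum (distL * (1 - distL));   # Gini impurity of the left child
impR = sum (distR * (1 - distR));   # Gini impurity of the right child
gain = imp - (nL / n) * impL - (nR / n) * impR;
print ("Gini gain of the candidate split: " + gain);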
- parfor (i6 in 1:num_cur_nodes_large, check = 0) { - - cur_node = as.scalar (cur_nodes_large[1,i6]); - cur_tree = as.scalar (cur_nodes_large[2,i6]); - - # select sample features WOR - feature_samples = sample (num_features_total, num_feature_samples); - feature_samples = order (target = feature_samples, by = 1); - num_scale_feature_samples = sum (ppred (feature_samples, num_scale_features, "<=")); - num_cat_feature_samples = num_feature_samples - num_scale_feature_samples; - - # --- find best split --- - # samples that reach cur_node - Ix = ppred (L[,cur_tree], cur_node, "=="); - - cur_Y_bin = Y_bin * (Ix * C[,cur_tree]); - label_counts_overall = colSums (cur_Y_bin); - label_sum_overall = sum (label_counts_overall); - label_dist_overall = label_counts_overall / label_sum_overall; - - if (imp == "entropy") { - label_dist_zero = ppred (label_dist_overall, 0, "=="); - cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); # impurity before - } else { # imp == "Gini" - cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before - } - best_scale_gain = 0; - best_cat_gain = 0; - if (num_scale_features > 0 & num_scale_feature_samples > 0) { - - scale_feature_samples = feature_samples[1:num_scale_feature_samples,]; - - # main operation - label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext); - - # compute left and right label distribution - label_sum_left = rowSums (label_counts_left_scale); - label_dist_left = label_counts_left_scale / label_sum_left; - if (imp == "entropy") { - label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); - log_label_dist_left = log (label_dist_left); # / log (2) - impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left); - } else { # imp == "Gini" - impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left)); - } - # - label_counts_right_scale = - label_counts_left_scale + label_counts_overall; - label_sum_right = rowSums (label_counts_right_scale); - label_dist_right = label_counts_right_scale / label_sum_right; - if (imp == "entropy") { - label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); - log_label_dist_right = log (label_dist_right); # / log (2) - impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right); - } else { # imp == "Gini" - impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right)); - } - - I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale); - - I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0); - - # determine best feature to split on and the split value - feature_start_ind = matrix (0, rows = 1, cols = num_scale_features); - feature_start_ind[1,1] = 1; - if (num_scale_features > 1) { - feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1; - } - max_I_gain_found = 0; - max_I_gain_found_ind = 0; - best_i = 0; - - for (i in 1:num_scale_feature_samples) { # assuming feature_samples is 5x1 - cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]); - cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]); - cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]); - I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,]; - cur_max_I_gain = max (I_gain_portion); - cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion))); - if 
(cur_max_I_gain > max_I_gain_found) { - max_I_gain_found = cur_max_I_gain; - max_I_gain_found_ind = cur_max_I_gain_ind; - best_i = i; - } - } - - best_scale_gain = max_I_gain_found; - max_I_gain_ind_scale = max_I_gain_found_ind; - best_scale_feature = 0; - if (best_i > 0) { - best_scale_feature = as.scalar (scale_feature_samples[best_i,]); - } - best_scale_split = max_I_gain_ind_scale; - if (best_scale_feature > 1) { - best_scale_split = best_scale_split + as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]); - } - } - - if (num_cat_features > 0 & num_cat_feature_samples > 0){ - - cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features; - - # initialization - split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall); - split_values = split_values_bin; - split_values_offset = matrix (0, rows = 1, cols = num_cat_features); - I_gains = split_values_offset; - impurities_left = split_values_offset; - impurities_right = split_values_offset; - best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes); - best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes); - - # main operation - label_counts = t (t (cur_Y_bin) %*% X_cat); - - parfor (i9 in 1:num_cat_feature_samples, check = 0){ - - cur_cat_feature = as.scalar (cat_feature_samples[i9,1]); - start_ind = 1; - if (cur_cat_feature > 1) { - start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]); - } - offset = as.scalar (distinct_values[1,cur_cat_feature]); - - cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),]; - - label_sum = rowSums (cur_label_counts); - label_dist = cur_label_counts / label_sum; - if (imp == "entropy") { - label_dist = replace (target = label_dist, pattern = 0, replacement = 1); - log_label_dist = log (label_dist); # / log(2) - impurity = - rowSums (label_dist * log_label_dist); - impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0); - } else { # imp == "Gini" - impurity = rowSums (label_dist * (1 - label_dist)); - } - - # sort cur feature by impurity - cur_distinct_values = seq (1, nrow (cur_label_counts)); - cur_distinct_values_impurity = append (cur_distinct_values, impurity); - cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE); - P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix - label_counts_sorted = P %*% cur_label_counts; - - # compute left and right label distribution - label_counts_left = cumsum (label_counts_sorted); - - label_sum_left = rowSums (label_counts_left); - label_dist_left = label_counts_left / label_sum_left; - label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1); - if (imp == "entropy") { - label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); - log_label_dist_left = log (label_dist_left); # / log(2) - impurity_left = - rowSums (label_dist_left * log_label_dist_left); - } else { # imp == "Gini" - impurity_left = rowSums (label_dist_left * (1 - label_dist_left)); - } - # - label_counts_right = - label_counts_left + label_counts_overall; - label_sum_right = rowSums (label_counts_right); - label_dist_right = label_counts_right / label_sum_right; - label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1); - if (imp == "entropy") { - label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); - 
log_label_dist_right = log (label_dist_right); # / log (2) - impurity_right = - rowSums (label_dist_right * log_label_dist_right); - } else { # imp == "Gini" - impurity_right = rowSums (label_dist_right * (1 - label_dist_right)); - } - I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right); - - Ix_label_sum_left_zero = ppred (label_sum_left, 0, "=="); - Ix_label_sum_right_zero = ppred (label_sum_right, 0, "=="); - Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero; - I_gain = I_gain * (1 - Ix_label_sum_zero); - - I_gain[nrow (I_gain),] = 0; # last entry invalid - - max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain))); - - split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]); - for (i10 in 1:max_I_gain_ind) { - ind = as.scalar (cur_feature_sorted[i10,1]); - if (ind == 1) { - split_values_bin[1,start_ind] = 1.0; - } else { - split_values_bin[1,(start_ind + ind - 1)] = 1.0; - } - } - split_values_offset[1,cur_cat_feature] = max_I_gain_ind; - - I_gains[1,cur_cat_feature] = max (I_gain); - - impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]); - impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]); - best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,]; - best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,]; - } - - # determine best feature to split on and the split values - best_cat_feature = as.scalar (rowIndexMax (I_gains)); - best_cat_gain = max (I_gains); - start_ind = 1; - if (best_cat_feature > 1) { - start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]); - } - offset = as.scalar (distinct_values[1,best_cat_feature]); - best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)]; - } - - # compare best scale feature to best cat. 
feature and pick the best one - if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) { - - # --- update model --- - cur_F_large[1,i6] = best_scale_feature; - cur_F_large[2,i6] = 1; - cur_F_large[3,i6] = 1; - cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature]; - - left_child = 2 * (cur_node - 1) + 1 + 1; - right_child = 2 * (cur_node - 1) + 2 + 1; - - # samples going to the left subtree - Ix_left = X_scale_ext[,best_scale_split]; - - Ix_left = Ix * Ix_left; - Ix_right = Ix * (1 - Ix_left); - - L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child); - L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child); - - left_child_size = sum (Ix_left * C[,cur_tree]); - right_child_size = sum (Ix_right * C[,cur_tree]); - - # check if left or right child is a leaf - left_pure = FALSE; - right_pure = FALSE; - cur_impurity_left = as.scalar(impurity_left_scale[best_scale_split,]); # max_I_gain_ind_scale - cur_impurity_right = as.scalar(impurity_right_scale[best_scale_split,]); # max_I_gain_ind_scale - if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) & - (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) | - (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf - - cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale - cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; - cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree; - cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label - left_pure = TRUE; - # compute number of misclassified points - cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); - - cur_label_counts_right = label_counts_overall - cur_label_counts_left; - cur_NC_large[1,(2 * i6)] = right_child; - cur_NC_large[2,(2 * i6)] = cur_tree; - cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label - right_pure = TRUE; - # compute number of misclassified pints - cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right); - - } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) | - (left_child_size <= threshold & (level == depth))) { - - cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale - cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; - cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree; - cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label - left_pure = TRUE; - # compute number of misclassified points - cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); - - } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) | - (right_child_size <= threshold & (level == depth))) { - - cur_label_counts_right = label_counts_right_scale[best_scale_split,]; # max_I_gain_ind_scale - cur_NC_large[1,(2 * i6)] = right_child; - cur_NC_large[2,(2 * i6)] = cur_tree; - cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label - right_pure = TRUE; - # compute number of misclassified pints - cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right); - - } - - } else if (num_cat_features > 0 & num_cat_feature_samples > 0 & best_cat_gain > 0) { - - # --- update 
model --- - cur_F_large[1,i6] = best_cat_feature; - cur_F_large[2,i6] = 2; - offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]); - S_start_ind = (i6 - 1) * distinct_values_max + 1; - cur_F_large[3,i6] = offset_nonzero; - cur_S_large[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)]; - - left_child = 2 * (cur_node - 1) + 1 + 1; - right_child = 2 * (cur_node - 1) + 2 + 1; - - # samples going to the left subtree - Ix_left = rowSums (X_cat[,start_ind:(start_ind + offset - 1)] * best_split_values_bin); - Ix_left = ppred (Ix_left, 1, ">="); - - Ix_left = Ix * Ix_left; - Ix_right = Ix * (1 - Ix_left); - - L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child); - L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child); - - left_child_size = sum (Ix_left * C[,cur_tree]); - right_child_size = sum (Ix_right * C[,cur_tree]); - - # check if left or right child is a leaf - left_pure = FALSE; - right_pure = FALSE; - cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]); - cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]); - if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) & - (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) | - (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf - - cur_label_counts_left = best_label_counts_left[best_cat_feature,]; - cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; - cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree; - cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label - left_pure = TRUE; - # compute number of misclassified points - cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); - - cur_label_counts_right = label_counts_overall - cur_label_counts_left; - cur_NC_large[1,(2 * i6)] = right_child; - cur_NC_large[2,(2 * i6)] = cur_tree; - cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label - right_pure = TRUE; - # compute number of misclassified pints - cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right); - - } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) | - (left_child_size <= threshold & (level == depth))) { - - cur_label_counts_left = best_label_counts_left[best_cat_feature,]; - cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; - cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree; - cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label - left_pure = TRUE; - # compute number of misclassified points - cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); - - } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth) | - (right_child_size <= threshold & (level == depth))) { - - cur_label_counts_right = best_label_counts_right[best_cat_feature,]; - cur_NC_large[1,(2 * i6)] = right_child; - cur_NC_large[2,(2 * i6)] = cur_tree; - cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label - right_pure = TRUE; - # compute number of misclassified pints - cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right); - - } - } else { - - print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". 
THIS NODE IS DECLARED AS LEAF!"); - right_pure = TRUE; - left_pure = TRUE; - cur_NC_large[1,(2 * (i6 - 1) + 1)] = cur_node; - cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree; - class_label = as.scalar (rowIndexMax (label_counts_overall)); - cur_NC_large[3,(2 * (i6 - 1) + 1)] = class_label; - cur_NC_large[4,(2 * (i6 - 1) + 1)] = label_sum_overall - max (label_counts_overall); - cur_NC_large[5,(2 * (i6 - 1) + 1)] = 1; # special leaf - - } - - # add nodes to Q - if (!left_pure) { - if (left_child_size > threshold) { - cur_Q_large[1,(2 * (i6 - 1)+ 1)] = left_child; - cur_Q_large[2,(2 * (i6 - 1)+ 1)] = cur_tree; - } else { - cur_nodes_small[1,(2 * (i6 - 1)+ 1)] = left_child; - cur_nodes_small[2,(2 * (i6 - 1)+ 1)] = left_child_size; - cur_nodes_small[3,(2 * (i6 - 1)+ 1)] = cur_tree; - } - } - if (!right_pure) { - if (right_child_size > threshold) { - cur_Q_large[1,(2 * i6)] = right_child; - cur_Q_large[2,(2 * i6)] = cur_tree; - } else{ - cur_nodes_small[1,(2 * i6)] = right_child; - cur_nodes_small[2,(2 * i6)] = right_child_size; - cur_nodes_small[3,(2 * i6)] = cur_tree; - } - } - } - - ##### PREPARE MODEL FOR LARGE NODES - if (num_cur_nodes_large > 0) { - cur_Q_large = removeEmpty (target = cur_Q_large, margin = "cols"); - if (as.scalar (cur_Q_large[1,1]) != 0) Q_large = append (Q_large, cur_Q_large); - cur_NC_large = removeEmpty (target = cur_NC_large, margin = "cols"); - if (as.scalar (cur_NC_large[1,1]) != 0) NC_large = append (NC_large, cur_NC_large); - - cur_F_large = removeEmpty (target = cur_F_large, margin = "cols"); - if (as.scalar (cur_F_large[1,1]) != 0) F_large = append (F_large, cur_F_large); - cur_S_large = removeEmpty (target = cur_S_large, margin = "cols"); - if (as.scalar (cur_S_large[1,1]) != 0) S_large = append (S_large, cur_S_large); - - num_cur_nodes_large_pre = 2 * num_cur_nodes_large; - if (as.scalar (cur_Q_large[1,1]) == 0) { - num_cur_nodes_large = 0; - } else { - cur_nodes_large = cur_Q_large; - num_cur_nodes_large = ncol (cur_Q_large); - } - } - - ##### PREPARE MODEL FOR SMALL NODES - cur_nodes_small_nonzero = removeEmpty (target = cur_nodes_small, margin = "cols"); - if (as.scalar (cur_nodes_small_nonzero[1,1]) != 0) { # if SMALL nodes exist - num_cur_nodes_small = ncol (cur_nodes_small_nonzero); - } - - if (num_cur_nodes_small > 0) { # SMALL nodes to process - reserve_len = sum (2 ^ (ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))) + num_cur_nodes_small; - cur_Q_small = matrix (0, rows = 2, cols = reserve_len); - cur_F_small = matrix (0, rows = 3, cols = reserve_len); - cur_NC_small = matrix (0, rows = 5, cols = reserve_len); - cur_S_small = matrix (0, rows = 1, cols = reserve_len * distinct_values_max); # split values of the best feature - } - - ##### LOOP OVER SMALL NODES... 
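Aside (not part of the original script): once a node's weighted sample count drops to the num_samples threshold, its whole subtree is grown in memory over a reduced copy of the data. The reduction idiom used below, shown on a hypothetical matrix A and node-assignment column L_col:

# hypothetical names; keeps only the rows of A that currently sit at node node_id
Ix = ppred (L_col, node_id, "==");                                 # 0/1 row indicator
A_small = removeEmpty (target = A, margin = "rows", select = Ix);  # drop all other rows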
- parfor (i7 in 1:num_cur_nodes_small, check = 0) { - - cur_node_small = as.scalar (cur_nodes_small_nonzero[1,i7]); - cur_tree_small = as.scalar (cur_nodes_small_nonzero[3,i7]); - - # build dataset for SMALL node - Ix = ppred (L[,cur_tree_small], cur_node_small, "=="); - if (num_scale_features > 0) { - X_scale_ext_small = removeEmpty (target = X_scale_ext, margin = "rows", select = Ix); - } - if (num_cat_features > 0) { - X_cat_small = removeEmpty (target = X_cat, margin = "rows", select = Ix); - } - - L_small = removeEmpty (target = L * Ix, margin = "rows"); - C_small = removeEmpty (target = C * Ix, margin = "rows"); - Y_bin_small = removeEmpty (target = Y_bin * Ix, margin = "rows"); - - # compute offset - offsets = cumsum (t (2 ^ ceil (log (cur_nodes_small_nonzero[2,]) / log (2)))); - start_ind_global = 1; - if (i7 > 1) { - start_ind_global = start_ind_global + as.scalar (offsets[(i7 - 1),]); - } - start_ind_S_global = 1; - if (i7 > 1) { - start_ind_S_global = start_ind_S_global + (as.scalar (offsets[(i7 - 1),]) * distinct_values_max); - } - - Q = matrix (0, rows = 2, cols = 1); - Q[1,1] = cur_node_small; - Q[2,1] = cur_tree_small; - F = matrix (0, rows = 3, cols = 1); - NC = matrix (0, rows = 5, cols = 1); - S = matrix (0, rows = 1, cols = 1); - - cur_nodes_ = matrix (cur_node_small, rows = 2, cols = 1); - cur_nodes_[1,1] = cur_node_small; - cur_nodes_[2,1] = cur_tree_small; - - num_cur_nodes = 1; - level_ = level; - while (num_cur_nodes > 0 & level_ < depth) { - - level_ = level_ + 1; - - cur_Q = matrix (0, rows = 2, cols = 2 * num_cur_nodes); - cur_F = matrix (0, rows = 3, cols = num_cur_nodes); - cur_NC = matrix (0, rows = 5, cols = 2 * num_cur_nodes); - cur_S = matrix (0, rows = 1, cols = num_cur_nodes * distinct_values_max); - - parfor (i8 in 1:num_cur_nodes, check = 0) { - - cur_node = as.scalar (cur_nodes_[1,i8]); - cur_tree = as.scalar (cur_nodes_[2,i8]); - - # select sample features WOR - feature_samples = sample (num_features_total, num_feature_samples); - feature_samples = order (target = feature_samples, by = 1); - num_scale_feature_samples = sum (ppred (feature_samples, num_scale_features, "<=")); - num_cat_feature_samples = num_feature_samples - num_scale_feature_samples; - - # --- find best split --- - # samples that reach cur_node - Ix = ppred (L_small[,cur_tree], cur_node, "=="); - cur_Y_bin = Y_bin_small * (Ix * C_small[,cur_tree]); - label_counts_overall = colSums (cur_Y_bin); - - label_sum_overall = sum (label_counts_overall); - label_dist_overall = label_counts_overall / label_sum_overall; - if (imp == "entropy") { - label_dist_zero = ppred (label_dist_overall, 0, "=="); - cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); - } else { # imp == "Gini" - cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); - } - best_scale_gain = 0; - best_cat_gain = 0; - if (num_scale_features > 0 & num_scale_feature_samples > 0) { - - scale_feature_samples = feature_samples[1:num_scale_feature_samples,]; - - # main operation - label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext_small); - - # compute left and right label distribution - label_sum_left = rowSums (label_counts_left_scale); - label_dist_left = label_counts_left_scale / label_sum_left; - if (imp == "entropy") { - label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); - log_label_dist_left = log (label_dist_left); # / log (2) - impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left); - } else { # imp == 
"Gini" - impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left)); - } - # - label_counts_right_scale = - label_counts_left_scale + label_counts_overall; - label_sum_right = rowSums (label_counts_right_scale); - label_dist_right = label_counts_right_scale / label_sum_right; - if (imp == "entropy") { - label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); - log_label_dist_right = log (label_dist_right); # log (2) - impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right); - } else { # imp == "Gini" - impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right)); - } - I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale); - - I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0); - - # determine best feature to split on and the split value - feature_start_ind = matrix (0, rows = 1, cols = num_scale_features); - feature_start_ind[1,1] = 1; - if (num_scale_features > 1) { - feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1; - } - max_I_gain_found = 0; - max_I_gain_found_ind = 0; - best_i = 0; - - for (i in 1:num_scale_feature_samples) { # assuming feature_samples is 5x1 - cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]); - cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]); - cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]); - I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,]; - cur_max_I_gain = max (I_gain_portion); - cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion))); - if (cur_max_I_gain > max_I_gain_found) { - max_I_gain_found = cur_max_I_gain; - max_I_gain_found_ind = cur_max_I_gain_ind; - best_i = i; - } - } - - best_scale_gain = max_I_gain_found; - max_I_gain_ind_scale = max_I_gain_found_ind; - best_scale_feature = 0; - if (best_i > 0) { - best_scale_feature = as.scalar (scale_feature_samples[best_i,]); - } - best_scale_split = max_I_gain_ind_scale; - if (best_scale_feature > 1) { - best_scale_split = best_scale_split + as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]); - } - } - - if (num_cat_features > 0 & num_cat_feature_samples > 0){ - - cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features; - - # initialization - split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall); - split_values = split_values_bin; - split_values_offset = matrix (0, rows = 1, cols = num_cat_features); - I_gains = split_values_offset; - impurities_left = split_values_offset; - impurities_right = split_values_offset; - best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes); - best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes); - - # main operation - label_counts = t (t (cur_Y_bin) %*% X_cat_small); - - parfor (i9 in 1:num_cat_feature_samples, check = 0){ - - cur_cat_feature = as.scalar (cat_feature_samples[i9,1]); - start_ind = 1; - if (cur_cat_feature > 1) { - start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]); - } - offset = as.scalar (distinct_values[1,cur_cat_feature]); - - cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),]; - - label_sum = rowSums (cur_label_counts); - label_dist = cur_label_counts / label_sum; - if (imp == "entropy") { - label_dist 
= replace (target = label_dist, pattern = 0, replacement = 1); - log_label_dist = log (label_dist); # / log(2) - impurity = - rowSums (label_dist * log_label_dist); - impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0); - } else { # imp == "Gini" - impurity = rowSums (label_dist * (1 - label_dist)); - } - - # sort cur feature by impurity - cur_distinct_values = seq (1, nrow (cur_label_counts)); - cur_distinct_values_impurity = append (cur_distinct_values, impurity); - cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE); - P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix - label_counts_sorted = P %*% cur_label_counts; - - # compute left and right label distribution - label_counts_left = cumsum (label_counts_sorted); - - label_sum_left = rowSums (label_counts_left); - label_dist_left = label_counts_left / label_sum_left; - label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1); - if (imp == "entropy") { - label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); - log_label_dist_left = log (label_dist_left); # / log(2) - impurity_left = - rowSums (label_dist_left * log_label_dist_left); - } else { # imp == "Gini" - impurity_left = rowSums (label_dist_left * (1 - label_dist_left)); - } - # - label_counts_right = - label_counts_left + label_counts_overall; - label_sum_right = rowSums (label_counts_right); - label_dist_right = label_counts_right / label_sum_right; - label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1); - if (imp == "entropy") { - label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); - log_label_dist_right = log (label_dist_right); # / log (2) - impurity_right = - rowSums (label_dist_right * log_label_dist_right); - } else { # imp == "Gini" - impurity_right = rowSums (label_dist_right * (1 - label_dist_right)); - } - I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right); - - Ix_label_sum_left_zero = ppred (label_sum_left, 0, "=="); - Ix_label_sum_right_zero = ppred (label_sum_right, 0, "=="); - Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero; - I_gain = I_gain * (1 - Ix_label_sum_zero); - - I_gain[nrow (I_gain),] = 0; # last entry invalid - - max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain))); - - split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]); - for (i10 in 1:max_I_gain_ind) { - ind = as.scalar (cur_feature_sorted[i10,1]); - if (ind == 1) { - split_values_bin[1,start_ind] = 1.0; - } else { - split_values_bin[1,(start_ind + ind - 1)] = 1.0; - } - } - split_values_offset[1,cur_cat_feature] = max_I_gain_ind; - - I_gains[1,cur_cat_feature] = max (I_gain); - - impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]); - impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]); - best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,]; - best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,]; - } - - # determine best feature to split on and the split values - best_cat_feature = as.scalar (rowIndexMax (I_gains)); - best_cat_gain = max (I_gains); - start_ind = 1; - if (best_cat_feature > 1) { - start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]); - } - offset = as.scalar 
(distinct_values[1,best_cat_feature]); - best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)]; - } - - # compare best scale feature to best cat. feature and pick the best one - if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) { - - # --- update model --- - cur_F[1,i8] = best_scale_feature; - cur_F[2,i8] = 1; - cur_F[3,i8] = 1; - cur_S[1,(i8 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature]; - - left_child = 2 * (cur_node - 1) + 1 + 1; - right_child = 2 * (cur_node - 1) + 2 + 1; - - # samples going to the left subtree - Ix_left = X_scale_ext_small[, best_scale_split]; - - Ix_left = Ix * Ix_left; - Ix_right = Ix * (1 - Ix_left); - - L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child); - L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child); - - left_child_size = sum (Ix_left * C_small[,cur_tree]); - right_child_size = sum (Ix_right * C_small[,cur_tree]); - - # check if left or right child is a leaf - left_pure = FALSE; - right_pure = FALSE; - cur_impurity_left = as.scalar(impurity_left_scale[best_scale_split,]); - cur_impurity_right = as.scalar(impurity_right_scale[best_scale_split,]); - if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) & - (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) ) { # both left and right nodes are leaf - - cur_label_counts_left = label_counts_left_scale[best_scale_split,]; - cur_NC[1,(2 * (i8 - 1) + 1)] = left_child; - cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree; - cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label - left_pure = TRUE; - # compute number of misclassified points - cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); - - cur_label_counts_right = label_counts_overall - cur_label_counts_left; - cur_NC[1,(2 * i8)] = right_child; - cur_NC[2,(2 * i8)] = cur_tree; - cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label - right_pure = TRUE; - # compute number of misclassified points - cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right); - - } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) { - - cur_label_counts_left = label_counts_left_scale[best_scale_split,]; - cur_NC[1,(2 * (i8 - 1) + 1)] = left_child; - cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree; - cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label - left_pure = TRUE; - # compute number of misclassified points - cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); - - } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) { - - cur_label_counts_right = label_counts_right_scale[best_scale_split,]; - cur_NC[1,(2 * i8)] = right_child; - cur_NC[2,(2 * i8)] = cur_tree; - cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label - right_pure = TRUE; - # compute number of misclassified points - cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right); - - } - - } else if (num_cat_features > 0 & num_cat_feature_samples > 0 & best_cat_gain > 0) { - - # --- update model --- - cur_F[1,i8] = best_cat_feature; - cur_F[2,i8] = 2; - offset_nonzero = as.scalar (split_values_offset[1,best_cat_feature]); - S_start_ind = (i8 - 1) * distinct_values_max + 1; - cur_F[3,i8] = offset_nonzero; 
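# note (descriptive comment, not in the archived script): for a categorical split, offset_nonzero is the
# size of the value subset routed to the left child; cur_F[3,i8] records that size, the next statement
# copies the values themselves into cur_S, and they surface in the final model as M[6,j] (subset size)
# and M[7:,j] (the chosen values), as documented in the header.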
- cur_S[1,S_start_ind:(S_start_ind + offset_nonzero - 1)] = split_values[1,start_ind:(start_ind + offset_nonzero - 1)]; - - left_child = 2 * (cur_node - 1) + 1 + 1; - right_child = 2 * (cur_node - 1) + 2 + 1; - - # samples going to the left subtree - Ix_left = rowSums (X_cat_small[,start_ind:(start_ind + offset - 1)] * best_split_values_bin); - Ix_left = ppred (Ix_left, 1, ">="); - - Ix_left = Ix * Ix_left; - Ix_right = Ix * (1 - Ix_left); - - L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child); - L_small[,cur_tree] = L_small[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child); - - left_child_size = sum (Ix_left * C_small[,cur_tree]); - right_child_size = sum (Ix_right * C_small[,cur_tree]); - - # check if left or right child is a leaf - left_pure = FALSE; - right_pure = FALSE; - cur_impurity_left = as.scalar(impurities_left[,best_cat_feature]); - cur_impurity_right = as.scalar(impurities_right[,best_cat_feature]); - if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) & - (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) ) { # both left and right nodes are leaf - - cur_label_counts_left = best_label_counts_left[best_cat_feature,]; - cur_NC[1,(2 * (i8 - 1) + 1)] = left_child; - cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree; - cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label - left_pure = TRUE; - # compute number of misclassified points - cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); - - cur_label_counts_right = label_counts_overall - cur_label_counts_left; - cur_NC[1,(2 * i8)] = right_child; - cur_NC[2,(2 * i8)] = cur_tree; - cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label - right_pure = TRUE; - # compute number of misclassified points - cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right); - - } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | level_ == depth) { - - cur_label_counts_left = best_label_counts_left[best_cat_feature,]; - cur_NC[1,(2 * (i8 - 1) + 1)] = left_child; - cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree; - cur_NC[3,(2 * (i8 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label - left_pure = TRUE; - # compute number of misclassified points - cur_NC[4,(2 * (i8 - 1) + 1)] = left_child_size - max (cur_label_counts_left); - - } else if (right_child_size <= num_leaf | cur_impurity_right == 0 | level_ == depth) { - cur_label_counts_right = best_label_counts_right[best_cat_feature,]; - cur_NC[1,(2 * i8)] = right_child; - cur_NC[2,(2 * i8)] = cur_tree; - cur_NC[3,(2 * i8)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label - right_pure = TRUE; - # compute number of misclassified points - cur_NC[4,(2 * i8)] = right_child_size - max (cur_label_counts_right); - - } - } else { - - print ("NUMBER OF SAMPLES AT NODE " + cur_node + " in tree " + cur_tree + " CANNOT BE REDUCED TO MATCH " + num_leaf + ". 
THIS NODE IS DECLARED AS LEAF!"); - right_pure = TRUE; - left_pure = TRUE; - cur_NC[1,(2 * (i8 - 1) + 1)] = cur_node; - cur_NC[2,(2 * (i8 - 1) + 1)] = cur_tree; - class_label = as.scalar (rowIndexMax (label_counts_overall)); - cur_NC[3,(2 * (i8 - 1) + 1)] = class_label; - cur_NC[4,(2 * (i8 - 1) + 1)] = label_sum_overall - max (label_counts_overall); - cur_NC[5,(2 * (i8 - 1) + 1)] = 1; # special leaf - - } - - # add nodes to Q - if (!left_pure) { - cur_Q[1,(2 * (i8 - 1)+ 1)] = left_child; - cur_Q[2,(2 * (i8 - 1)+ 1)] = cur_tree; - } - if (!right_pure) { - cur_Q[1,(2 * i8)] = right_child; - cur_Q[2,(2 * i8)] = cur_tree; - } - } - - cur_Q = removeEmpty (target = cur_Q, margin = "cols"); - Q = append (Q, cur_Q); - NC = append (NC, cur_NC); - F = append (F, cur_F); - S = append (S, cur_S); - - num_cur_nodes_pre = 2 * num_cur_nodes; - if (as.scalar (cur_Q[1,1]) == 0) { - num_cur_nodes = 0; - } else { - cur_nodes_ = cur_Q; - num_cur_nodes = ncol (cur_Q); - } - } - - cur_Q_small[,start_ind_global:(start_ind_global + ncol (Q) - 1)] = Q; - cur_NC_small[,start_ind_global:(start_ind_global + ncol (NC) - 1)] = NC; - cur_F_small[,start_ind_global:(start_ind_global + ncol (F) - 1)] = F; - cur_S_small[,start_ind_S_global:(start_ind_S_global + ncol (S) - 1)] = S; - } - - ##### PREPARE MODEL FOR SMALL NODES - if (num_cur_nodes_small > 0) { # small nodes already processed - cur_Q_small = removeEmpty (target = cur_Q_small, margin = "cols"); - if (as.scalar (cur_Q_small[1,1]) != 0) Q_small = append (Q_small, cur_Q_small); - cur_NC_small = removeEmpty (target = cur_NC_small, margin = "cols"); - if (as.scalar (cur_NC_small[1,1]) != 0) NC_small = append (NC_small, cur_NC_small); - - cur_F_small = removeEmpty (target = cur_F_small, margin = "cols"); # - if (as.scalar (cur_F_small[1,1]) != 0) F_small = append (F_small, cur_F_small); - cur_S_small = removeEmpty (target = cur_S_small, margin = "cols"); # - if (as.scalar (cur_S_small[1,1]) != 0) S_small = append (S_small, cur_S_small); - - num_cur_nodes_small = 0; # reset - } - - print (" --- end level " + level + ", remaining no. 
of LARGE nodes to expand " + num_cur_nodes_large + " --- "); -} - -#### prepare model -print ("PREPARING MODEL...") -### large nodes -if (as.scalar (Q_large[1,1]) == 0 & ncol (Q_large) > 1) { - Q_large = Q_large[,2:ncol (Q_large)]; -} -if (as.scalar (NC_large[1,1]) == 0 & ncol (NC_large) > 1) { - NC_large = NC_large[,2:ncol (NC_large)]; -} -if (as.scalar (S_large[1,1]) == 0 & ncol (S_large) > 1) { - S_large = S_large[,2:ncol (S_large)]; -} -if (as.scalar (F_large[1,1]) == 0 & ncol (F_large) > 1) { - F_large = F_large[,2:ncol (F_large)]; -} -### small nodes -if (as.scalar (Q_small[1,1]) == 0 & ncol (Q_small) > 1) { - Q_small = Q_small[,2:ncol (Q_small)]; -} -if (as.scalar (NC_small[1,1]) == 0 & ncol (NC_small) > 1) { - NC_small = NC_small[,2:ncol (NC_small)]; -} -if (as.scalar (S_small[1,1]) == 0 & ncol (S_small) > 1) { - S_small = S_small[,2:ncol (S_small)]; -} -if (as.scalar (F_small[1,1]) == 0 & ncol (F_small) > 1) { - F_small = F_small[,2:ncol (F_small)]; -} - -# check for special leaves and if there are any remove them from Q_large and Q_small -special_large_leaves_ind = NC_large[5,]; -num_special_large_leaf = sum (special_large_leaves_ind); -if (num_special_large_leaf > 0) { - print ("PROCESSING " + num_special_large_leaf + " SPECIAL LARGE LEAVES..."); - special_large_leaves = removeEmpty (target = NC_large[1:2,] * special_large_leaves_ind, margin = "cols"); - large_internal_ind = 1 - colSums (outer (t (special_large_leaves[1,]), Q_large[1,], "==") * outer (t (special_large_leaves[2,]), Q_large[2,], "==")); - Q_large = removeEmpty (target = Q_large * large_internal_ind, margin = "cols"); - F_large = removeEmpty (target = F_large, margin = "cols"); # remove special leaves from F -} - -special_small_leaves_ind = NC_small[5,]; -num_special_small_leaf = sum (special_small_leaves_ind); -if (num_special_small_leaf > 0) { - print ("PROCESSING " + num_special_small_leaf + " SPECIAL SMALL LEAVES..."); - special_small_leaves = removeEmpty (target = NC_small[1:2,] * special_small_leaves_ind, margin = "cols"); - small_internal_ind = 1 - colSums (outer (t (special_small_leaves[1,]), Q_small[1,], "==") * outer (t (special_small_leaves[2,]), Q_small[2,], "==")); - Q_small = removeEmpty (target = Q_small * small_internal_ind, margin = "cols"); - F_small = removeEmpty (target = F_small, margin = "cols"); # remove special leaves from F -} - -# model corresponding to large internal nodes -no_large_internal_node = FALSE; -if (as.scalar (Q_large[1,1]) != 0) { - print ("PROCESSING LARGE INTERNAL NODES..."); - num_large_internal = ncol (Q_large); - max_offset = max (max (F_large[3,]), max (F_small[3,])); - M1_large = matrix (0, rows = 6 + max_offset, cols = num_large_internal); - M1_large[1:2,] = Q_large; - M1_large[4:6,] = F_large; - # process S_large - cum_offsets_large = cumsum (t (F_large[3,])); - parfor (it in 1:num_large_internal, check = 0) { - start_ind = 1; - if (it > 1) { - start_ind = start_ind + as.scalar (cum_offsets_large[(it - 1),]); - } - offset = as.scalar (F_large[3,it]); - M1_large[7:(7 + offset - 1),it] = t (S_large[1,start_ind:(start_ind + offset - 1)]); - } -} else { - print ("No LARGE internal nodes available"); - no_large_internal_node = TRUE; -} - -# model corresponding to small internal nodes -no_small_internal_node = FALSE; -if (as.scalar (Q_small[1,1]) != 0) { - print ("PROCESSING SMALL INTERNAL NODES..."); - num_small_internal = ncol (Q_small); - M1_small = matrix (0, rows = 6 + max_offset, cols = num_small_internal); - M1_small[1:2,] = Q_small; - M1_small[4:6,] = F_small; - # 
process S_small - cum_offsets_small = cumsum (t (F_small[3,])); - parfor (it in 1:num_small_internal, check = 0) { - start_ind = 1; - if (it > 1) { - start_ind = start_ind + as.scalar (cum_offsets_small[(it - 1),]); - } - offset = as.scalar (F_small[3,it]); - M1_small[7:(7 + offset - 1),it] = t (S_small[1,start_ind:(start_ind + offset - 1)]); - } -} else { - print ("No SMALL internal nodes available"); - no_small_internal_node = TRUE; -} - -# model corresponding to large leaf nodes -no_large_leaf_node = FALSE; -if (as.scalar (NC_large[1,1]) != 0) { - print ("PROCESSING LARGE LEAF NODES..."); - num_large_leaf = ncol (NC_large); - M2_large = matrix (0, rows = 6 + max_offset, cols = num_large_leaf); - M2_large[1:2,] = NC_large[1:2,]; - M2_large[5:7,] = NC_large[3:5,]; -} else { - print ("No LARGE leaf nodes available"); - no_large_leaf_node = TRUE; -} - -# model corresponding to small leaf nodes -no_small_leaf_node = FALSE; -if (as.scalar (NC_small[1,1]) != 0) { - print ("PROCESSING SMALL LEAF NODES..."); - num_small_leaf = ncol (NC_small); - M2_small = matrix (0, rows = 6 + max_offset, cols = num_small_leaf); - M2_small[1:2,] = NC_small[1:2,]; - M2_small[5:7,] = NC_small[3:5,]; -} else { - print ("No SMALL leaf nodes available"); - no_small_leaf_node = TRUE; -} - -if (no_large_internal_node) { - M1 = M1_small; -} else if (no_small_internal_node) { - M1 = M1_large; -} else { - M1 = append (M1_large, M1_small); -} - -if (no_large_leaf_node) { - M2 = M2_small; -} else if (no_small_leaf_node) { - M2 = M2_large; -} else { - M2 = append (M2_large, M2_small); -} - -M = append (M1, M2); -M = t (order (target = t (M), by = 1)); # sort by node id -M = t (order (target = t (M), by = 2)); # sort by tree id - - -# removing redundant subtrees -if (ncol (M) > 1) { - print ("CHECKING FOR REDUNDANT SUBTREES..."); - red_leaf = TRUE; - process_red_subtree = FALSE; - invalid_node_ind = matrix (0, rows = 1, cols = ncol (M)); - while (red_leaf & ncol (M) > 1) { - leaf_ind = ppred (M[4,], 0, "=="); - labels = M[5,] * leaf_ind; - tree_ids = M[2,]; - parent_ids = floor (M[1,] /2); - cond1 = ppred (labels[,1:(ncol (M) - 1)], labels[,2:ncol (M)], "=="); # siebling leaves with same label - cond2 = ppred (parent_ids[,1:(ncol (M) - 1)], parent_ids[,2:ncol (M)], "=="); # same parents - cond3 = ppred (tree_ids[,1:(ncol (M) - 1)], tree_ids[,2:ncol (M)], "=="); # same tree - red_leaf_ind = cond1 * cond2 * cond3 * leaf_ind[,2:ncol (M)]; - - if (sum (red_leaf_ind) > 0) { # if redundant subtrees exist - red_leaf_ids = M[1:2,2:ncol (M)] * red_leaf_ind; - red_leaf_ids_nonzero = removeEmpty (target = red_leaf_ids, margin = "cols"); - parfor (it in 1:ncol (red_leaf_ids_nonzero), check = 0){ - cur_right_leaf_id = as.scalar (red_leaf_ids_nonzero[1,it]); - cur_parent_id = floor (cur_right_leaf_id / 2); - cur_tree_id = as.scalar (red_leaf_ids_nonzero[2,it]); - cur_right_leaf_pos = as.scalar (rowIndexMax (ppred (M[1,], cur_right_leaf_id, "==") * ppred (M[2,], cur_tree_id, "=="))); - cur_parent_pos = as.scalar(rowIndexMax (ppred (M[1,], cur_parent_id, "==") * ppred (M[2,], cur_tree_id, "=="))); - M[3:nrow (M), cur_parent_pos] = M[3:nrow (M), cur_right_leaf_pos]; - M[4,cur_right_leaf_pos] = -1; - M[4,cur_right_leaf_pos - 1] = -1; - invalid_node_ind[1,cur_right_leaf_pos] = 1; - invalid_node_ind[1,cur_right_leaf_pos - 1] = 1; - } - process_red_subtree = TRUE; - } else { - red_leaf = FALSE; - } - } - - if (process_red_subtree) { - print ("REMOVING REDUNDANT SUBTREES..."); - valid_node_ind = ppred (invalid_node_ind, 0, "=="); - M = 
removeEmpty (target = M * valid_node_ind, margin = "cols"); - } -} - -internal_ind = ppred (M[4,], 0, ">"); -internal_ids = M[1:2,] * internal_ind; -internal_ids_nonzero = removeEmpty (target = internal_ids, margin = "cols"); -if (as.scalar (internal_ids_nonzero[1,1]) > 0) { # if internal nodes exist - a1 = internal_ids_nonzero[1,]; - a2 = internal_ids_nonzero[1,] * 2; - vcur_tree_id = internal_ids_nonzero[2,]; - pos_a1 = rowIndexMax( outer(t(a1), M[1,], "==") * outer(t(vcur_tree_id), M[2,], "==") ); - pos_a2 = rowIndexMax( outer(t(a2), M[1,], "==") * outer(t(vcur_tree_id), M[2,], "==") ); - M[3,] = t(table(pos_a1, 1, pos_a2 - pos_a1, ncol(M), 1)); -} -else { - print ("All trees in the random forest contain only one leaf!"); -} - -if (fileC != " ") { - write (C, fileC, format = fmtO); -} -write (M, fileM, format = fmtO); +#------------------------------------------------------------- +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------- + +# +# THIS SCRIPT IMPLEMENTS CLASSIFICATION RANDOM FOREST WITH BOTH SCALE AND CATEGORICAL FEATURES +# +# INPUT PARAMETERS: +# --------------------------------------------------------------------------------------------- +# NAME TYPE DEFAULT MEANING +# --------------------------------------------------------------------------------------------- +# X String --- Location to read feature matrix X; note that X needs to be both recoded and dummy coded +# Y String --- Location to read label matrix Y; note that Y needs to be both recoded and dummy coded +# R String " " Location to read the matrix R which for each feature in X contains the following information +# - R[,1]: column ids +# - R[,2]: start indices +# - R[,3]: end indices +# If R is not provided by default all variables are assumed to be scale +# bins Int 20 Number of equiheight bins per scale feature to choose thresholds +# depth Int 25 Maximum depth of the learned tree +# num_leaf Int 10 Number of samples when splitting stops and a leaf node is added +# num_samples Int 3000 Number of samples at which point we switch to in-memory subtree building +# num_trees Int 10 Number of trees to be learned in the random forest model +# subsamp_rate Double 1.0 Parameter controlling the size of each tree in the forest; samples are selected from a +# Poisson distribution with parameter subsamp_rate (the default value is 1.0) +# feature_subset Double 0.5 Parameter that controls the number of feature used as candidates for splitting at each tree node +# as a power of number of features in the dataset; +# by default square root of features (i.e., feature_subset = 0.5) are used at each tree node +# impurity String "Gini" Impurity measure: entropy or Gini (the default) +# M String --- Location to write matrix M containing the 
learned tree +# C String " " Location to write matrix C containing the number of times samples are chosen in each tree of the random forest +# S_map String " " Location to write the mappings from scale feature ids to global feature ids +# C_map String " " Location to write the mappings from categorical feature ids to global feature ids +# fmt String "text" The output format of the model (matrix M), such as "text" or "csv" +# --------------------------------------------------------------------------------------------- +# OUTPUT: +# Matrix M where each column corresponds to a node in the learned tree and each row contains the following information: +# M[1,j]: id of node j (in a complete binary tree) +# M[2,j]: tree id to which node j belongs +# M[3,j]: Offset (no. of columns) to left child of j +# M[4,j]: Feature index of the feature that node j looks at if j is an internal node, otherwise 0 +# M[5,j]: Type of the feature that node j looks at if j is an internal node: 1 for scale and 2 for categorical features, +# otherwise the label that leaf node j is supposed to predict +# M[6,j]: 1 if j is an internal node and the feature chosen for j is scale, otherwise the size of the subset of values +# stored in rows 7,8,... if j is categorical +# M[7:,j]: Only applicable for internal nodes. Threshold the example's feature value is compared to is stored at M[7,j] if the feature chosen for j is scale; +# If the feature chosen for j is categorical rows 7,8,... depict the value subset chosen for j +# ------------------------------------------------------------------------------------------- +# HOW TO INVOKE THIS SCRIPT - EXAMPLE: +# hadoop jar SystemML.jar -f random-forest.dml -nvargs X=INPUT_DIR/X Y=INPUT_DIR/Y R=INPUT_DIR/R M=OUTPUT_DIR/model +# bins=20 depth=25 num_leaf=10 num_samples=3000 num_trees=10 impurity=Gini fmt=csv + + +# External function for binning +binning = externalFunction(Matrix[Double] A, Integer binsize, Integer numbins) return (Matrix[Double] B, Integer numbinsdef) + implemented in (classname="org.apache.sysml.udf.lib.BinningWrapper",exectype="mem") + + +# Default values of some parameters +fileR = ifdef ($R, " "); +fileC = ifdef ($C, " "); +fileS_map = ifdef ($S_map, " "); +fileC_map = ifdef ($C_map, " "); +fileM = $M; +num_bins = ifdef($bins, 20); +depth = ifdef($depth, 25); +num_leaf = ifdef($num_leaf, 10); +num_trees = ifdef($num_trees, 1); +threshold = ifdef ($num_samples, 3000); +imp = ifdef($impurity, "Gini"); +rate = ifdef ($subsamp_rate, 1); +fpow = ifdef ($feature_subset, 0.5); +fmtO = ifdef($fmt, "text"); + +X = read($X); +Y_bin = read($Y); +num_records = nrow (X); +num_classes = ncol (Y_bin); + +# check if there is only one class label +Y_bin_sum = sum (ppred (colSums (Y_bin), num_records, "==")); +if (Y_bin_sum == 1) { + stop ("Y contains only one class label. No model will be learned!"); +} else if (Y_bin_sum > 1) { + stop ("Y is not properly dummy coded. 
Multiple columns of Y contain only ones!") +} + +# split data into X_scale and X_cat +if (fileR != " ") { + R = read (fileR); + R = order (target = R, by = 2); # sort by start indices + dummy_coded = ppred (R[,2], R[,3], "!="); + R_scale = removeEmpty (target = R[,2:3] * (1 - dummy_coded), margin = "rows"); + R_cat = removeEmpty (target = R[,2:3] * dummy_coded, margin = "rows"); + if (fileS_map != " ") { + scale_feature_mapping = removeEmpty (target = (1 - dummy_coded) * seq (1, nrow (R)), margin = "rows"); + write (scale_feature_mapping, fileS_map, format = fmtO); + } + if (fileC_map != " ") { + cat_feature_mapping = removeEmpty (target = dummy_coded * seq (1, nrow (R)), margin = "rows"); + write (cat_feature_mapping, fileC_map, format = fmtO); + } + sum_dummy = sum (dummy_coded); + if (sum_dummy == nrow (R)) { # all features categorical + print ("All features categorical"); + num_cat_features = nrow (R_cat); + num_scale_features = 0; + X_cat = X; + distinct_values = t (R_cat[,2] - R_cat[,1] + 1); + distinct_values_max = max (distinct_values); + distinct_values_offset = cumsum (t (distinct_values)); + distinct_values_overall = sum (distinct_values); + } else if (sum_dummy == 0) { # all features scale + print ("All features scale"); + num_scale_features = ncol (X); + num_cat_features = 0; + X_scale = X; + distinct_values_max = 1; + } else { # some features scale some features categorical + num_cat_features = nrow (R_cat); + num_scale_features = nrow (R_scale); + distinct_values = t (R_cat[,2] - R_cat[,1] + 1); + distinct_values_max = max (distinct_values); + distinct_values_offset = cumsum (t (distinct_values)); + distinct_values_overall = sum (distinct_values); + + W = matrix (1, rows = num_cat_features, cols = 1) %*% matrix ("1 -1", rows = 1, cols = 2); + W = matrix (W, rows = 2 * num_cat_features, cols = 1); + if (as.scalar (R_cat[num_cat_features, 2]) == ncol (X)) { + W[2 * num_cat_features,] = 0; + } + + last = ppred (R_cat[,2], ncol (X), "!="); + R_cat1 = (R_cat[,2] + 1) * last; + R_cat[,2] = (R_cat[,2] * (1 - last)) + R_cat1; + R_cat_vec = matrix (R_cat, rows = 2 * num_cat_features, cols = 1); + + col_tab = table (R_cat_vec, 1, W, ncol (X), 1); + col_ind = cumsum (col_tab); + + col_ind_cat = removeEmpty (target = col_ind * seq (1, ncol (X)), margin = "rows"); + col_ind_scale = removeEmpty (target = (1 - col_ind) * seq (1, ncol (X)), margin = "rows"); + X_cat = X %*% table (col_ind_cat, seq (1, nrow (col_ind_cat)), ncol (X), nrow (col_ind_cat)); + X_scale = X %*% table (col_ind_scale, seq (1, nrow (col_ind_scale)), ncol (X), nrow (col_ind_scale)); + } +} else { # only scale features exist + print ("All features scale"); + num_scale_features = ncol (X); + num_cat_features = 0; + X_scale = X; + distinct_values_max = 1; +} + +if (num_scale_features > 0) { + + print ("COMPUTING BINNING..."); + bin_size = max (as.integer (num_records / num_bins), 1); + count_thresholds = matrix (0, rows = 1, cols = num_scale_features) + thresholds = matrix (0, rows = num_bins + 1, cols = num_scale_features) + parfor(i1 in 1:num_scale_features) { + col = order (target = X_scale[,i1], by = 1, decreasing = FALSE); + [col_bins, num_bins_defined] = binning (col, bin_size, num_bins); + count_thresholds[,i1] = num_bins_defined; + thresholds[,i1] = col_bins; + } + + print ("PREPROCESSING SCALE FEATURE MATRIX..."); + min_num_bins = min (count_thresholds); + max_num_bins = max (count_thresholds); + total_num_bins = sum (count_thresholds); + cum_count_thresholds = t (cumsum (t (count_thresholds))); + X_scale_ext = 
matrix (0, rows = num_records, cols = total_num_bins); + parfor (i2 in 1:num_scale_features, check = 0) { + Xi2 = X_scale[,i2]; + count_threshold = as.scalar (count_thresholds[,i2]); + offset_feature = 1; + if (i2 > 1) { + offset_feature = offset_feature + as.integer (as.scalar (cum_count_thresholds[, (i2 - 1)])); + } + + ti2 = t(thresholds[1:count_threshold, i2]); + X_scale_ext[,offset_feature:(offset_feature + count_threshold - 1)] = outer (Xi2, ti2, "<"); + } +} + +num_features_total = num_scale_features + num_cat_features; +num_feature_samples = as.integer (floor (num_features_total ^ fpow)); + +##### INITIALIZATION +L = matrix (1, rows = num_records, cols = num_trees); # last visited node id for each training sample + +# create matrix of counts (generated by Poisson distribution) storing how many times each sample appears in each tree +print ("COMPUTING COUNTS..."); +C = rand (rows = num_records, cols = num_trees, pdf = "poisson", lambda = rate); +Ix_nonzero = ppred (C, 0, "!="); +L = L * Ix_nonzero; +total_counts = sum (C); + + +# model +# LARGE leaf nodes +# NC_large[,1]: node id +# NC_large[,2]: tree id +# NC_large[,3]: class label +# NC_large[,4]: no. of misclassified samples +# NC_large[,5]: 1 if special leaf (impure and no. of samples at that leaf > threshold) or 0 otherwise +NC_large = matrix (0, rows = 5, cols = 1); + +# SMALL leaf nodes +# same schema as for LARGE leaf nodes (to be initialized) +NC_small = matrix (0, rows = 5, cols = 1); + +# LARGE internal nodes +# Q_large[,1]: node id +# Q_large[,2]: tree id +Q_large = matrix (0, rows = 2, cols = num_trees); +Q_large[1,] = matrix (1, rows = 1, cols = num_trees); +Q_large[2,] = t (seq (1, num_trees)); + +# SMALL internal nodes +# same schema as for LARGE internal nodes (to be initialized) +Q_small = matrix (0, rows = 2, cols = 1); + +# F_large[,1]: feature +# F_large[,2]: type +# F_large[,3]: offset +F_large = matrix (0, rows = 3, cols = 1); + +# same schema as for LARGE nodes +F_small = matrix (0, rows = 3, cols = 1); + +# split points for LARGE internal nodes +S_large = matrix (0, rows = 1, cols = 1); + +# split points for SMALL internal nodes +S_small = matrix (0, rows = 1, cols = 1); + +# initialize queue +cur_nodes_large = matrix (1, rows = 2, cols = num_trees); +cur_nodes_large[2,] = t (seq (1, num_trees)); + +num_cur_nodes_large = num_trees; +num_cur_nodes_small = 0; +level = 0; + +while ((num_cur_nodes_large + num_cur_nodes_small) > 0 & level < depth) { + + level = level + 1; + print (" --- start level " + level + " --- "); + + ##### PREPARE MODEL + if (num_cur_nodes_large > 0) { # LARGE nodes to process + cur_Q_large = matrix (0, rows = 2, cols = 2 * num_cur_nodes_large); + cur_NC_large = matrix (0, rows = 5, cols = 2 * num_cur_nodes_large); + cur_F_large = matrix (0, rows = 3, cols = num_cur_nodes_large); + cur_S_large = matrix (0, rows = 1, cols = num_cur_nodes_large * distinct_values_max); + cur_nodes_small = matrix (0, rows = 3, cols = 2 * num_cur_nodes_large); + } + + ##### LOOP OVER LARGE NODES... 
+ parfor (i6 in 1:num_cur_nodes_large, check = 0) { + + cur_node = as.scalar (cur_nodes_large[1,i6]); + cur_tree = as.scalar (cur_nodes_large[2,i6]); + + # select sample features WOR + feature_samples = sample (num_features_total, num_feature_samples); + feature_samples = order (target = feature_samples, by = 1); + num_scale_feature_samples = sum (ppred (feature_samples, num_scale_features, "<=")); + num_cat_feature_samples = num_feature_samples - num_scale_feature_samples; + + # --- find best split --- + # samples that reach cur_node + Ix = ppred (L[,cur_tree], cur_node, "=="); + + cur_Y_bin = Y_bin * (Ix * C[,cur_tree]); + label_counts_overall = colSums (cur_Y_bin); + label_sum_overall = sum (label_counts_overall); + label_dist_overall = label_counts_overall / label_sum_overall; + + if (imp == "entropy") { + label_dist_zero = ppred (label_dist_overall, 0, "=="); + cur_impurity = - sum (label_dist_overall * log (label_dist_overall + label_dist_zero)); # / log (2); # impurity before + } else { # imp == "Gini" + cur_impurity = sum (label_dist_overall * (1 - label_dist_overall)); # impurity before + } + best_scale_gain = 0; + best_cat_gain = 0; + if (num_scale_features > 0 & num_scale_feature_samples > 0) { + + scale_feature_samples = feature_samples[1:num_scale_feature_samples,]; + + # main operation + label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext); + + # compute left and right label distribution + label_sum_left = rowSums (label_counts_left_scale); + label_dist_left = label_counts_left_scale / label_sum_left; + if (imp == "entropy") { + label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); + log_label_dist_left = log (label_dist_left); # / log (2) + impurity_left_scale = - rowSums (label_dist_left * log_label_dist_left); + } else { # imp == "Gini" + impurity_left_scale = rowSums (label_dist_left * (1 - label_dist_left)); + } + # + label_counts_right_scale = - label_counts_left_scale + label_counts_overall; + label_sum_right = rowSums (label_counts_right_scale); + label_dist_right = label_counts_right_scale / label_sum_right; + if (imp == "entropy") { + label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); + log_label_dist_right = log (label_dist_right); # / log (2) + impurity_right_scale = - rowSums (label_dist_right * log_label_dist_right); + } else { # imp == "Gini" + impurity_right_scale = rowSums (label_dist_right * (1 - label_dist_right)); + } + + I_gain_scale = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left_scale + ( label_sum_right / label_sum_overall ) * impurity_right_scale); + + I_gain_scale = replace (target = I_gain_scale, pattern = "NaN", replacement = 0); + + # determine best feature to split on and the split value + feature_start_ind = matrix (0, rows = 1, cols = num_scale_features); + feature_start_ind[1,1] = 1; + if (num_scale_features > 1) { + feature_start_ind[1,2:num_scale_features] = cum_count_thresholds[1,1:(num_scale_features - 1)] + 1; + } + max_I_gain_found = 0; + max_I_gain_found_ind = 0; + best_i = 0; + + for (i in 1:num_scale_feature_samples) { # assuming feature_samples is 5x1 + cur_feature_samples_bin = as.scalar (scale_feature_samples[i,]); + cur_start_ind = as.scalar (feature_start_ind[,cur_feature_samples_bin]); + cur_end_ind = as.scalar (cum_count_thresholds[,cur_feature_samples_bin]); + I_gain_portion = I_gain_scale[cur_start_ind:cur_end_ind,]; + cur_max_I_gain = max (I_gain_portion); + cur_max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain_portion))); + if 
(cur_max_I_gain > max_I_gain_found) { + max_I_gain_found = cur_max_I_gain; + max_I_gain_found_ind = cur_max_I_gain_ind; + best_i = i; + } + } + + best_scale_gain = max_I_gain_found; + max_I_gain_ind_scale = max_I_gain_found_ind; + best_scale_feature = 0; + if (best_i > 0) { + best_scale_feature = as.scalar (scale_feature_samples[best_i,]); + } + best_scale_split = max_I_gain_ind_scale; + if (best_scale_feature > 1) { + best_scale_split = best_scale_split + as.scalar(cum_count_thresholds[,(best_scale_feature - 1)]); + } + } + + if (num_cat_features > 0 & num_cat_feature_samples > 0){ + + cat_feature_samples = feature_samples[(num_scale_feature_samples + 1):(num_scale_feature_samples + num_cat_feature_samples),] - num_scale_features; + + # initialization + split_values_bin = matrix (0, rows = 1, cols = distinct_values_overall); + split_values = split_values_bin; + split_values_offset = matrix (0, rows = 1, cols = num_cat_features); + I_gains = split_values_offset; + impurities_left = split_values_offset; + impurities_right = split_values_offset; + best_label_counts_left = matrix (0, rows = num_cat_features, cols = num_classes); + best_label_counts_right = matrix (0, rows = num_cat_features, cols = num_classes); + + # main operation + label_counts = t (t (cur_Y_bin) %*% X_cat); + + parfor (i9 in 1:num_cat_feature_samples, check = 0){ + + cur_cat_feature = as.scalar (cat_feature_samples[i9,1]); + start_ind = 1; + if (cur_cat_feature > 1) { + start_ind = start_ind + as.scalar (distinct_values_offset[(cur_cat_feature - 1),]); + } + offset = as.scalar (distinct_values[1,cur_cat_feature]); + + cur_label_counts = label_counts[start_ind:(start_ind + offset - 1),]; + + label_sum = rowSums (cur_label_counts); + label_dist = cur_label_counts / label_sum; + if (imp == "entropy") { + label_dist = replace (target = label_dist, pattern = 0, replacement = 1); + log_label_dist = log (label_dist); # / log(2) + impurity = - rowSums (label_dist * log_label_dist); + impurity = replace (target = impurity, pattern = "NaN", replacement = 1/0); + } else { # imp == "Gini" + impurity = rowSums (label_dist * (1 - label_dist)); + } + + # sort cur feature by impurity + cur_distinct_values = seq (1, nrow (cur_label_counts)); + cur_distinct_values_impurity = append (cur_distinct_values, impurity); + cur_feature_sorted = order (target = cur_distinct_values_impurity, by = 2, decreasing = FALSE); + P = table (cur_distinct_values, cur_feature_sorted); # permutation matrix + label_counts_sorted = P %*% cur_label_counts; + + # compute left and right label distribution + label_counts_left = cumsum (label_counts_sorted); + + label_sum_left = rowSums (label_counts_left); + label_dist_left = label_counts_left / label_sum_left; + label_dist_left = replace (target = label_dist_left, pattern = "NaN", replacement = 1); + if (imp == "entropy") { + label_dist_left = replace (target = label_dist_left, pattern = 0, replacement = 1); + log_label_dist_left = log (label_dist_left); # / log(2) + impurity_left = - rowSums (label_dist_left * log_label_dist_left); + } else { # imp == "Gini" + impurity_left = rowSums (label_dist_left * (1 - label_dist_left)); + } + # + label_counts_right = - label_counts_left + label_counts_overall; + label_sum_right = rowSums (label_counts_right); + label_dist_right = label_counts_right / label_sum_right; + label_dist_right = replace (target = label_dist_right, pattern = "NaN", replacement = 1); + if (imp == "entropy") { + label_dist_right = replace (target = label_dist_right, pattern = 0, replacement = 1); + 
log_label_dist_right = log (label_dist_right); # / log (2) + impurity_right = - rowSums (label_dist_right * log_label_dist_right); + } else { # imp == "Gini" + impurity_right = rowSums (label_dist_right * (1 - label_dist_right)); + } + I_gain = cur_impurity - ( ( label_sum_left / label_sum_overall ) * impurity_left + ( label_sum_right / label_sum_overall ) * impurity_right); + + Ix_label_sum_left_zero = ppred (label_sum_left, 0, "=="); + Ix_label_sum_right_zero = ppred (label_sum_right, 0, "=="); + Ix_label_sum_zero = Ix_label_sum_left_zero * Ix_label_sum_right_zero; + I_gain = I_gain * (1 - Ix_label_sum_zero); + + I_gain[nrow (I_gain),] = 0; # last entry invalid + + max_I_gain_ind = as.scalar (rowIndexMax (t (I_gain))); + + split_values[1, start_ind:(start_ind + max_I_gain_ind - 1)] = t (cur_feature_sorted[1:max_I_gain_ind,1]); + for (i10 in 1:max_I_gain_ind) { + ind = as.scalar (cur_feature_sorted[i10,1]); + if (ind == 1) { + split_values_bin[1,start_ind] = 1.0; + } else { + split_values_bin[1,(start_ind + ind - 1)] = 1.0; + } + } + split_values_offset[1,cur_cat_feature] = max_I_gain_ind; + + I_gains[1,cur_cat_feature] = max (I_gain); + + impurities_left[1,cur_cat_feature] = as.scalar (impurity_left[max_I_gain_ind,]); + impurities_right[1,cur_cat_feature] = as.scalar (impurity_right[max_I_gain_ind,]); + best_label_counts_left[cur_cat_feature,] = label_counts_left[max_I_gain_ind,]; + best_label_counts_right[cur_cat_feature,] = label_counts_right[max_I_gain_ind,]; + } + + # determine best feature to split on and the split values + best_cat_feature = as.scalar (rowIndexMax (I_gains)); + best_cat_gain = max (I_gains); + start_ind = 1; + if (best_cat_feature > 1) { + start_ind = start_ind + as.scalar (distinct_values_offset[(best_cat_feature - 1),]); + } + offset = as.scalar (distinct_values[1,best_cat_feature]); + best_split_values_bin = split_values_bin[1, start_ind:(start_ind + offset - 1)]; + } + + # compare best scale feature to best cat. 
feature and pick the best one + if (num_scale_features > 0 & num_scale_feature_samples > 0 & best_scale_gain >= best_cat_gain & best_scale_gain > 0) { + + # --- update model --- + cur_F_large[1,i6] = best_scale_feature; + cur_F_large[2,i6] = 1; + cur_F_large[3,i6] = 1; + cur_S_large[1,(i6 - 1) * distinct_values_max + 1] = thresholds[max_I_gain_ind_scale, best_scale_feature]; + + left_child = 2 * (cur_node - 1) + 1 + 1; + right_child = 2 * (cur_node - 1) + 2 + 1; + + # samples going to the left subtree + Ix_left = X_scale_ext[,best_scale_split]; + + Ix_left = Ix * Ix_left; + Ix_right = Ix * (1 - Ix_left); + + L[,cur_tree] = L[,cur_tree] * (1 - Ix_left) + (Ix_left * left_child); + L[,cur_tree] = L[,cur_tree] * (1 - Ix_right) + (Ix_right * right_child); + + left_child_size = sum (Ix_left * C[,cur_tree]); + right_child_size = sum (Ix_right * C[,cur_tree]); + + # check if left or right child is a leaf + left_pure = FALSE; + right_pure = FALSE; + cur_impurity_left = as.scalar(impurity_left_scale[best_scale_split,]); # max_I_gain_ind_scale + cur_impurity_right = as.scalar(impurity_right_scale[best_scale_split,]); # max_I_gain_ind_scale + if ( (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth)) & + (right_child_size <= num_leaf | cur_impurity_right == 0 | (level == depth)) | + (left_child_size <= threshold & right_child_size <= threshold & (level == depth)) ) { # both left and right nodes are leaf + + cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale + cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; + cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree; + cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label + left_pure = TRUE; + # compute number of misclassified points + cur_NC_large[4,(2 * (i6 - 1) + 1)] = left_child_size - max (cur_label_counts_left); + + cur_label_counts_right = label_counts_overall - cur_label_counts_left; + cur_NC_large[1,(2 * i6)] = right_child; + cur_NC_large[2,(2 * i6)] = cur_tree; + cur_NC_large[3,(2 * i6)] = as.scalar( rowIndexMax (cur_label_counts_right)); # leaf class label + right_pure = TRUE; + # compute number of misclassified points + cur_NC_large[4,(2 * i6)] = right_child_size - max (cur_label_counts_right); + + } else if (left_child_size <= num_leaf | cur_impurity_left == 0 | (level == depth) | + (left_child_size <= threshold & (level == depth))) { + + cur_label_counts_left = label_counts_left_scale[best_scale_split,]; # max_I_gain_ind_scale + cur_NC_large[1,(2 * (i6 - 1) + 1)] = left_child; + cur_NC_large[2,(2 * (i6 - 1) + 1)] = cur_tree; + cur_NC_large[3,(2 * (i6 - 1) + 1)] = as.scalar( rowIndexMax (cur_label_counts_left)); # leaf class label + left_pure = TRUE; + # compute number of misclassified points + cur_NC_large[4,(2 * (i
<TRUNCATED>
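
The committed script leans on a few compact DML idioms that are easy to miss in the diff. The three short sketches below are illustrative only, use toy data and hypothetical variable names, and are not part of the commit. The first shows the indicator expansion used for scale features: outer (x, th, "<") turns one feature column into per-threshold 0/1 columns, so the class counts to the left of every candidate threshold fall out of a single matrix product, the same pattern as label_counts_left_scale = t (t (cur_Y_bin) %*% X_scale_ext) above (x, th, Y, X_ext, counts_left are made-up names).

x  = matrix ("0.2 0.7 0.4 0.9", rows = 4, cols = 1);   # one scale feature, 4 samples (toy data)
th = matrix ("0.3 0.5 0.8", rows = 1, cols = 3);       # 3 candidate thresholds from binning
Y  = matrix ("1 0 0 1 1 0 0 1", rows = 4, cols = 2);   # dummy-coded labels, 2 classes
X_ext = outer (x, th, "<");                            # X_ext[i,k] = 1 iff sample i lies left of threshold k
counts_left = t (t (Y) %*% X_ext);                     # counts_left[k,c] = no. of class-c samples left of threshold k
print ("class-1 samples left of threshold 2: " + as.scalar (counts_left[2,1]));   # 2 samples fall below 0.5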
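The split search scores candidates by impurity computed from the count-weighted label distribution. A minimal sketch of the two impurity measures exactly as they are written in the script, on a hypothetical three-class distribution:

label_counts = matrix ("30 10 60", rows = 1, cols = 3);              # hypothetical class counts at one node
label_dist = label_counts / sum (label_counts);
gini = sum (label_dist * (1 - label_dist));                          # Gini impurity
label_dist_zero = ppred (label_dist, 0, "==");
entropy = - sum (label_dist * log (label_dist + label_dist_zero));   # entropy (natural log), zero entries masked
print ("Gini = " + gini + ", entropy = " + entropy);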
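The OUTPUT section above fixes the layout of the model matrix M (node id, tree id, column offset to the left child, feature, feature type, threshold or value subset). The sketch below walks one example down tree 1 using that layout. The names x_scale and x_cat are hypothetical, and the conventions that an example goes left when its scale value is below the threshold (or its category id is in the stored subset) and that the right child sits in the column after the left child are inferred from how the script builds and sorts M, not from a documented prediction API.

# x_scale: 1 x num_scale_features row of scale values; x_cat: 1 x num_cat_features row of recoded category ids
j = 1;                                      # first column of M is node 1 of tree 1
while (as.scalar (M[4,j]) != 0) {           # M[4,j] == 0 marks a leaf
  f = as.scalar (M[4,j]);                   # feature tested at this node
  go_left = 0;
  if (as.scalar (M[5,j]) == 1) {            # scale feature: threshold stored in M[7,j]
    go_left = as.scalar (ppred (x_scale[1,f], M[7,j], "<"));
  } else {                                  # categorical: M[6,j] subset values stored from row 7 on
    subset_size = as.scalar (M[6,j]);
    go_left = sum (ppred (M[7:(6 + subset_size),j], as.scalar (x_cat[1,f]), "=="));
  }
  if (go_left >= 1) {
    j = j + as.scalar (M[3,j]);             # M[3,j]: column offset to the left child
  } else {
    j = j + as.scalar (M[3,j]) + 1;         # right child sits in the next column
  }
}
print ("predicted label: " + as.scalar (M[5,j]));   # at a leaf, M[5,j] holds the class label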
