DT/RF: Allow array input for features

JIRA: MADLIB-965

Currently, array columns are not allowed as features in the decision tree
and random forest train functions. This commit adds support for a mixed
list of features: array columns and individual columns of multiple types
can be combined in a single list. Each array is expanded so that each of
its elements is treated as a separate feature.
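
For illustration, here is a minimal, self-contained sketch of the expansion
step. The real logic lives in _get_features_to_use() in decision_tree.py_in
and reads the array type and bounds from the table via plpy.execute(); the
helper name, the col_types dict, and the hard-coded bounds below are
assumptions made only to keep the example runnable on its own.

    def expand_array_features(feature_list, col_types, array_bounds):
        """Expand each 1-D array feature into one indexed feature per element."""
        expanded = []
        for feat in feature_list:
            if col_types[feat].endswith('[]'):
                lower, upper = array_bounds[feat]   # from array_lower()/array_upper()
                expanded += ["{0}[{1}]".format(feat, i)
                             for i in range(lower, upper + 1)]
            else:
                expanded.append(feat)
        return expanded

    features = ['"OUTLOOK"', 'temperature', '"Cont_features"']
    col_types = {'"OUTLOOK"': 'text',
                 'temperature': 'double precision',
                 '"Cont_features"': 'double precision[]'}
    array_bounds = {'"Cont_features"': (1, 2)}   # PostgreSQL arrays are 1-based by default

    # -> ['"OUTLOOK"', 'temperature', '"Cont_features"[1]', '"Cont_features"[2]']
    print(expand_array_features(features, col_types, array_bounds))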


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/de71bd7e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/de71bd7e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/de71bd7e

Branch: refs/heads/master
Commit: de71bd7e8c14c4d5ef326500773aab2ff70edaa9
Parents: 9b45eca
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue May 9 14:50:52 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed May 10 17:48:49 2017 -0700

----------------------------------------------------------------------
 .../recursive_partitioning/decision_tree.py_in  |  42 +-
 .../recursive_partitioning/decision_tree.sql_in | 487 ++++++++++---------
 .../test/decision_tree.sql_in                   |  50 +-
 .../test/random_forest.sql_in                   |  45 +-
 .../postgres/modules/utilities/utilities.py_in  |  18 +-
 5 files changed, 354 insertions(+), 288 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/de71bd7e/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index d3ca9b2..fac74df 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -30,9 +30,9 @@ from utilities.validate_args import unquote_ident
 from utilities.utilities import _assert
 from utilities.utilities import extract_keyvalue_params
 from utilities.utilities import unique_string
+from utilities.utilities import _array_to_string
 from utilities.utilities import add_postfix
 from utilities.utilities import split_quoted_delimited_str
-from utilities.utilities import _array_to_string
 from utilities.utilities import py_list_to_sql_string
 # ------------------------------------------------------------
 
@@ -126,14 +126,38 @@ def _get_features_to_use(schema_madlib, training_table_name,
         all_col_set = set(get_cols(training_table_name, schema_madlib))
         exclude_set = set(split_quoted_delimited_str(list_of_features_to_exclude))
         feature_set = all_col_set - exclude_set
-        return list(feature_set - group_set - other_col_set)
+        filtered_feature_list = list(feature_set - group_set - other_col_set)
     else:
         feature_list = split_quoted_delimited_str(list_of_features)
         feature_exclude = split_quoted_delimited_str(list_of_features_to_exclude)
         return_set = set(feature_list) - set(feature_exclude) - group_set - other_col_set
         # instead of returning list(return_set) we create a list that has
         # elements in same order as original feature_list
-        return [feat for feat in feature_list if feat in return_set]
+        filtered_feature_list = [feat for feat in feature_list if feat in return_set]
+
+    # check if any of the features is an array and expand the array
+    final_feature_list = []
+    for feat in filtered_feature_list:
+        feat_type = get_expr_type(feat, training_table_name)
+        if '[]' in feat_type:
+            # expand array by indexing into it
+            n_dims = plpy.execute("SELECT array_ndims({feat}) as n "
+                                  "FROM {tbl} LIMIT 1".
+                                  format(feat=feat, tbl=training_table_name))[0]['n']
+            _assert(n_dims == 1,
+                    "Only single dimensional arrays allowed for features. "
+                    "Invalid dimensions in {0}".format(feat))
+            feat_dims = plpy.execute("""
+                                        SELECT array_lower({f}, 1) as l,
+                                               array_upper({f}, 1) as u
+                                        FROM {tbl}
+                                        LIMIT 1
+                                     """.format(f=feat, tbl=training_table_name))[0]
+            final_feature_list += ["{f}[{i}]".format(f=feat, i=i)
+                                   for i in range(feat_dims['l'], feat_dims['u'] + 1)]
+        else:
+            final_feature_list.append(feat)
+    return final_feature_list
 # ------------------------------------------------------------
 
 
@@ -154,12 +178,9 @@ def _classify_features(feature_to_type, features):
     ordered_cat_features = [c for c in features
                             if feature_to_type[c] in ordered_cat_types]
 
-    cat_features_set = set(cat_features)
     # continuous types - 'real' is cast to 'double precision' for uniformity
     con_types = ['real', 'float8', 'double precision', 'numeric']
-    con_features = [c for c in features
-                    if (c not in cat_features_set and
-                        feature_to_type[c] in con_types)]
+    con_features = [c for c in features if feature_to_type[c] in con_types]
 
     # In order to be able to form an array, all categorical variables
     # will be cast into TEXT type, but GPDB cannot cast a boolean
@@ -601,6 +622,7 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
                                for f in features])
         cat_features, ordered_cat_features, con_features, boolean_cats = \
             _classify_features(all_cols_types, features)
+
         # get all rows
         n_all_rows = plpy.execute("SELECT count(*) FROM {source_table}".
                                   format(source_table=training_table_name)
@@ -1657,6 +1679,10 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
 
     cat_features = split_quoted_delimited_str(summary_elements["cat_features"])
     con_features = split_quoted_delimited_str(summary_elements["con_features"])
+    _assert(
+        is_var_valid(source, ','.join(cat_features + con_features)),
+        "Decision tree error: Missing columns in predict data table ({0}) "
+        "that were used during training".format(source))
     id_col_name = summary_elements["id_col_name"]
     grouping_cols_str = summary_elements["grouping_cols"]
     dep_varname = summary_elements["dependent_varname"]
@@ -2067,10 +2093,12 @@ def _xvalidate(schema_madlib, tree_states, training_table_name, output_table_nam
     metric_function = "_tree_misclassified" if is_classification else "_tree_rmse"
     pred_name = '"estimated_{0}"'.format(dependent_variable.strip(' "'))
     grouping_str = 'NULL' if not grouping_cols else '"' + grouping_cols + '"'
+
     cat_feature_str = _array_to_string(cat_features)
     ordered_cat_feature_str = _array_to_string(ordered_cat_features)
     boolean_cat_str = _array_to_string(boolean_cats)
     con_feature_str = _array_to_string(con_features)
+
     modeling_params = [str(i) for i in
                        (is_classification,
                         split_criterion, "%data%", "%model%", id_col_name,
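
A simplified stand-in for the predict-time check added to tree_predict()
above: the real code calls MADlib's is_var_valid(source, <comma-separated
feature list>) against the prediction table, whereas this sketch mocks the
table as a plain list of column names and uses a hypothetical function name,
purely to show the intent of the new _assert.

    def check_predict_columns(table_columns, cat_features, con_features):
        missing = [f for f in cat_features + con_features if f not in table_columns]
        if missing:
            raise ValueError("Decision tree error: Missing columns in predict "
                             "data table that were used during training: " +
                             ", ".join(missing))

    try:
        # 'windy' was used during training but is absent from this mocked table
        check_predict_columns(['id', 'humidity', 'temperature'],
                              cat_features=['windy'], con_features=['humidity'])
    except ValueError as exc:
        print(exc)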

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/de71bd7e/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index 7251a9c..ac08466 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -64,9 +64,149 @@ tree_train(
   <dd>TEXT. The name of the table containing the training data.</dd>
 
   <dt>output_table_name</dt>
-  <dd>TEXT. The name of the generated table containing the model. If a table
-  with the same name already exists, then the function will return an 
error.</dd>
+  <dd>TEXT. The name of the generated table containing the model.
+  If a table with the same name already exists, then the function will return 
an
+  error. A summary table named <em>\<output_table_name\>_summary</em> is also
+  created at the same time.
+  </DD>
+
+  <DT>id_col_name</DT>
+  <DD>TEXT. Name of the column containing id information in the training data.
+  This is a mandatory argument and is used for prediction and cross-validation.
+  The values are expected to be unique for each row.
+  </DD>
+
+  <DT>dependent_variable</DT>
+  <DD>TEXT. Name of the column that contains the output (response) for
+  training. Boolean, integer and text types are considered classification
+  outputs, while double precision values are considered regression outputs.
+  The response variable for a classification tree can be multinomial, but the
+  time and space complexity of the training function increases linearly as the
+  number of response classes increases.</DD>
+
+  <DT>list_of_features</DT>
+  <DD>TEXT. Comma-separated string of column names or expressions to use as 
predictors.
+  Can also be a '*' implying all columns are to be used as predictors (except 
the
+  ones included in the next argument). The types of the features can be mixed
+  where boolean, integer, and text columns are considered categorical and
+  double precision columns are considered continuous. The categorical variables
+  are not encoded and are used as is for the training. Array columns can also
+  be included in the list, in which case each element of the array is treated
+  as a separate feature.
+
+  It is important to note that not every combination of the levels of a
+  categorical variable is checked when evaluating a split. The levels of the
+  non-integer categorical variable are ordered by the entropy of the variable 
in
+  predicting the response. The split at each node is evaluated between these
+  ordered levels. Integer categorical variables, however, are simply ordered
+  by their value.
+  </DD>
+
+  <DT>list_of_features_to_exclude</DT>
+  <DD>TEXT. Comma-separated string of column names to exclude from the 
predictors
+      list. If the <em>dependent_variable</em> is an expression (including 
cast of a column name),
+      then this list should include the columns present in the
+      <em>dependent_variable</em> expression,
+      otherwise those columns will be included in the features.
+      The names in this parameter should be identical to the names used in the 
table and
+      quoted appropriately. </DD>
 
+  <DT>split_criterion</DT>
+  <DD>TEXT, default = 'gini' for classification, 'mse' for regression.
+  Impurity function to compute the feature to use for the split.
+  Supported criteria are 'gini', 'entropy', 'misclassification' for
+  classification trees. For regression trees, split_criterion of 'mse'
+  is always used (irrespective of the input for this argument). </DD>
+
+  <DT>grouping_cols (optional)</DT>
+  <DD>TEXT, default: NULL. Comma-separated list of column names to group the
+      data by. This will result in multiple decision trees, one for
+      each group. </DD>
+
+  <DT>weights (optional)</DT>
+  <DD>TEXT. Column name containing weights for each observation.</DD>
+
+  <DT>max_depth (optional)</DT>
+  <DD>INTEGER, default: 7. Maximum depth of any node of the final tree,
+      with the root node counted as depth 0. A deeper tree can
+      lead to better prediction but will also result in
+      longer processing time and higher memory usage.</DD>
+
+  <DT>min_split (optional)</DT>
+  <DD>INTEGER, default: 20. Minimum number of observations that must exist
+      in a node for a split to be attempted. The best value for this parameter
+      depends on the number of tuples in the dataset.</DD>
+
+  <DT>min_bucket (optional)</DT>
+  <DD>INTEGER, default: min_split/3. Minimum number of observations in any 
terminal
+      node. If only one of min_bucket or min_split is specified, min_split is
+      set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD>
+
+  <DT>num_splits (optional)</DT>
+  <DD>INTEGER, default: 20. Continuous-valued features are binned into
+      discrete quantiles to compute split boundaries. This global parameter
+      is used to compute the resolution of splits for continuous features.
+      Higher number of bins will lead to better prediction,
+      but will also result in longer processing time and higher memory 
usage.</DD>
+
+  <DT>pruning_params (optional)</DT>
+  <DD>TEXT. Comma-separated string of key-value pairs giving
+  the parameters for pruning the tree. The parameters currently accepted are:
+    <table class='output'>
+      <tr>
+      <th>cp</th>
+      <td>
+        Default: 0. A split on a node is attempted only if it
+        decreases the overall lack of fit by a factor of 'cp', else the split 
is
+        pruned away. This value is used to create an initial tree before 
running
+        cross-validation (see below).
+
+      </td>
+      </tr>
+      <tr>
+      <th>n_folds</th>
+      <td>
+        Default: 0 (i.e. no cross-validation).
+        Number of cross-validation folds to use to compute the best value of
+        <em>cp</em>. To perform cross-validation, a positive value of
+        <em>n_folds</em> (greater than 2) should be given. An additional output
+        table <em>\<model_table\>_cv</em> is created containing the values of
+        evaluated <em>cp</em> and the cross-validation error. The tree returned
+        in the output table corresponds to the <em>cp</em> with the lowest
+        cross-validation error (we pick the maximum <em>cp</em> if multiple
+        values have same error).
+
+        The list of <em>cp</em> values is automatically computed by parsing
+        through the tree initially trained on the complete dataset. The tree
+        output is a subset of this initial tree corresponding to the best
+        computed <em>cp</em>.
+
+      </td>
+      </tr>
+    </table>
+  </DD>
+
+  <DT>surrogate_params</DT>
+  <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior
+  of surrogate splits for each node. A surrogate variable is another predictor
+  variable that is associated (correlated) with the primary predictor variable
+  for a split. The surrogate variable comes into use when the primary predictor
+  value is NULL. This parameter currently accepts one argument:
+    <table class='output'>
+    <tr>
+      <th>max_surrogates</th>
+      <td>Default: 0. Number of surrogates to store for each node.</td>
+    </tr>
+    </table>
+  </DD>
+
+  <DT>verbosity (optional)</DT>
+  <DD>BOOLEAN, default: FALSE. Provides verbose output of the training 
result.</DD>
+</DL>
+
+\b Output
+<dl class="arglist">
+<DD>
     The model table produced by the training function contains the following 
columns:
 
     <table class="output">
@@ -103,7 +243,7 @@ tree_train(
 
     </table>
 
-    A summary table named <em>\<model_table\>_summary</em> is also created at
+    A summary table named <em>\<output_table_name\>_summary</em> is also 
created at
     the same time, which has the following columns:
      <table class="output">
 
@@ -199,150 +339,19 @@ tree_train(
     <th>independent_var_types</th>
     <td>TEXT. A comma separated string for the types of independent 
variables.</td>
     </tr>
-
    </table>
   </DD>
-
-  <DT>id_col_name</DT>
-  <DD>TEXT. Name of the column containing id information in the training data.
-  This is a mandatory argument and is used for prediction and cross-validation.
-  The values are expected to be unique for each row.
-  </DD>
-
-  <DT>dependent_variable</DT>
-  <DD>TEXT. Name of the column that contains the output (response) for
-  training. Boolean, integer and text types are considered classification
-  outputs, while double precision values are considered regression outputs.
-  The response variable for a classification tree can be multinomial, but the
-  time and space complexity of the training function increases linearly as the
-  number of response classes increases.</DD>
-
-  <DT>list_of_features</DT>
-  <DD>TEXT. Comma-separated string of column names to use as predictors. Can
-  also be a '*' implying all columns are to be used as predictors (except the
-  ones included in the next argument). The types of the features can be mixed
-  where boolean, integer, and text columns are considered categorical and
-  double precision columns are considered continuous. The categorical variables
-  are not encoded and used as is for the training.
-
-  It is important to note that we don't test for every combination of
-  levels of a categorical variable when evaluating a split. We order the levels
-  of the non-integer categorical variable by the entropy of the variable in
-  predicting the response. The split at each node is evaluated between these
-  ordered levels. Integer categorical variables are ordered by their value.
-  </DD>
-
-  <DT>list_of_features_to_exclude</DT>
-  <DD>TEXT. Comma-separated string of column names to exclude from the 
predictors
-      list. If the <em>dependent_variable</em> is an expression (including 
cast of a column name),
-      then this list should include all columns present in the
-      <em>dependent_variable</em> expression,
-      otherwise those columns will be included in the features.
-      The names in this parameter should be identical to the names used in the 
table and
-      quoted appropriately.</DD>
-
-  <DT>split_criterion</DT>
-  <DD>TEXT, default = 'gini' for classification, 'mse' for regression.
-  Impurity function to compute the feature to use for the split.
-  Supported criteria are 'gini', 'entropy', 'misclassification' for
-  classification trees. For regression trees, split_criterion of 'mse'
-  is always used (irrespective of the input for this argument).
-  </DD>
-
-  <DT>grouping_cols (optional)</DT>
-  <DD>TEXT, default: NULL. Comma-separated list of column names to group the
-      data by. This will result in multiple decision trees, one for
-      each group.</DD>
-
-  <DT>weights (optional)</DT>
-  <DD>TEXT. Column name containing weights for each observation.</DD>
-
-  <DT>max_depth (optional)</DT>
-  <DD>INTEGER, default: 7. Maximum depth of any node of the final tree,
-      with the root node counted as depth 0. A deeper tree can
-      lead to better prediction but will also result in
-      longer processing time and higher memory usage.</DD>
-
-  <DT>min_split (optional)</DT>
-  <DD>INTEGER, default: 20. Minimum number of observations that must exist
-      in a node for a split to be attempted. The best value for this parameter
-      depends on the number of tuples in the dataset.</DD>
-
-  <DT>min_bucket (optional)</DT>
-  <DD>INTEGER, default: min_split/3. Minimum number of observations in any 
terminal
-      node. If only one of min_bucket or min_split is specified, min_split is
-      set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD>
-
-  <DT>num_splits (optional)</DT>
-  <DD>INTEGER, default: 20. Continuous-valued features are binned into
-      discrete quantiles to compute split boundaries. This global parameter
-      is used to compute the resolution of splits for continuous features.
-      Higher number of bins will lead to better prediction,
-      but will also result in longer processing time and higher memory 
usage.</DD>
-
-  <DT>pruning_params (optional)</DT>
-  <DD>TEXT. Comma-separated string of key-value pairs giving
-  the parameters for pruning the tree. The parameters currently accepted are:
-    <table class='output'>
-      <tr>
-      <th>cp</th>
-      <td>
-        Default: 0. A split on a node is attempted only if it
-        decreases the overall lack of fit by a factor of 'cp', else the split 
is
-        pruned away. This value is used to create an initial tree before 
running
-        cross-validation (see below).
-
-      </td>
-      </tr>
-      <tr>
-      <th>n_folds</th>
-      <td>
-        Default: 0 (i.e. no cross-validation).
-        Number of cross-validation folds to use to compute the best value of
-        <em>cp</em>. To perform cross-validation, a positive value of
-        <em>n_folds</em> (greater than 2) should be given. An additional output
-        table <em>\<model_table\>_cv</em> is created containing the values of
-        evaluated <em>cp</em> and the cross-validation error. The tree returned
-        in the output table corresponds to the <em>cp</em> with the lowest
-        cross-validation error (we pick the maximum <em>cp</em> if multiple
-        values have same error).
-
-        The list of <em>cp</em> values is automatically computed by parsing
-        through the tree initially trained on the complete dataset. The tree
-        output is a subset of this initial tree corresponding to the best
-        computed <em>cp</em>.
-
-      </td>
-      </tr>
-    </table>
-  </DD>
-
-  <DT>surrogate_params</DT>
-  <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior
-  of surrogate splits for each node. A surrogate variable is another predictor
-  variable that is associated (correlated) with the primary predictor variable
-  for a split. The surrogate variable comes into use when the primary 
predictior
-  value is NULL. This parameter currently accepts one argument:
-    <table class='output'>
-    <tr>
-      <th>max_surrogates</th>
-      <td>Default: 0. Number of surrogates to store for each node.</td>
-    </tr>
-    </table>
-  </DD>
-
-  <DT>verbosity (optional)</DT>
-  <DD>BOOLEAN, default: FALSE. Provides verbose output of the training 
result.</DD>
 </DL>
 
+
 @note
 - Many of the parameters are designed to be similar to the popular R package 
'rpart'.
 An important distinction between rpart and the MADlib function is that
 for both response and feature variables, MADlib considers integer values as
 categorical values, while rpart considers them as continuous. To use integers 
as
-continuous, please cast them to double precision.
-- Integer values are ordered by value for computing the split boundaries. 
Please
-cast to TEXT if the entropy-based ordering method is desired.
+continuous, cast them to double precision.
+- Integer values are ordered by value for computing the split boundaries. Cast
+to TEXT if the entropy-based ordering method is desired.
 - When using no surrogates (<em>max_surrogates</em>=0), all rows containing 
NULL values
 for any of the features used for training will be ignored from training and 
prediction.
 - When cross-validation is not used (<em>n_folds</em>=0), each tree output
@@ -388,14 +397,14 @@ tree_predict(tree_model,
   The table contains the <em>id_col_name</em> column giving
   the 'id' for each prediction and the prediction columns for the dependent 
variable.
 
-  If <em>type</em> = 'response', then the table has a single additional
-  column with the prediction value of the response. The type of this column 
depends on the type
-  of the response variable used during training.
+  If <em>type</em> = 'response', then the table has a single additional column
+  with the prediction value of the response. The type of this column depends on
+  the type of the response variable used during training.
 
-  If <em>type</em> = 'prob', then the table has multiple additional columns, 
one for each
-  possible value of the response variable. The columns are labeled as
-  'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents 
each value
-  of the response variable.</DD>
+  If <em>type</em> = 'prob', then the table has multiple additional columns, 
one
+  for each possible value of the response variable. The columns are labeled as
+  'estimated_prob_<em>dep_value</em>', where <em>dep_value</em> represents each
+  value of the response variable.</DD>
 
   <DT>type</DT>
   <DD>TEXT, optional, default: 'response'. For regression trees, the output is
@@ -434,9 +443,10 @@ split (variable and threshold) and also give the number of rows that were common
 between the primary split and the surrogate split. Finally, the number of rows
 present in the majority branch of the primary split is also shown. Only
 surrogates that perform better than this majority branch are included in the
-surrogate list. When the primary variable has a NULL value the surrogate 
variables
-are used in order to compute the split for that node. If all surrogates 
variables
-are NULL, then the majority branch is used to compute the split for a tuple.
+surrogate list. When the primary variable has a NULL value the surrogate
+variables are used in order to compute the split for that node. If all
+surrogate variables are NULL, then the majority branch is used to compute the
+split for a tuple.
 
 \b Arguments
 <DL class="arglist">
@@ -489,32 +499,34 @@ of the tree output formats.
 
 -# Prepare input data:
 <pre class="example">
-DROP TABLE IF EXISTS dt_golf;
+DROP TABLE IF EXISTS dt_golf CASCADE;
 CREATE TABLE dt_golf (
     id integer NOT NULL,
     "OUTLOOK" text,
     temperature double precision,
     humidity double precision,
-    windy text,
+    "Cont_features" double precision[],
+    cat_features text[],
+    windy boolean,
     class text
 );
 </pre>
 <pre class="example">
-COPY dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) FROM stdin WITH DELIMITER '|';
-1|sunny|85|85|'false'|'Don't Play'
-2|sunny|80|90|'true'|'Don't Play'
-3|overcast|83|78|'false'|'Play'
-4|rain|70|96|'false'|'Play'
-5|rain|68|80|'false'|'Play'
-6|rain|65|70|'true'|'Don't Play'
-7|overcast|64|65|'true'|'Play'
-8|sunny|72|95|'false'|'Don't Play'
-9|sunny|69|70|'false'|'Play'
-10|rain|75|80|'false'|'Play'
-11|sunny|75|70|'true'|'Play'
-12|overcast|72|90|'true'|'Play'
-13|overcast|81|75|'false'|'Play'
-14|rain|71|80|'true'|'Don't Play'
+COPY dt_golf (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features,windy,class) FROM stdin WITH DELIMITER '|';
+1|sunny|85|85|{85, 85}|{'a', 'b'}|false|'Don't Play'
+2|sunny|80|90|{80, 90}|{'a', 'b'}|true|'Don't Play'
+3|overcast|83|78|{83, 78}|{'a', 'b'}|false|'Play'
+4|rain|70|96|{70, 96}|{'c', 'd'}|false|'Play'
+5|rain|68|80|{68, 80}|{'a', 'b'}|false|'Play'
+6|rain|65|70|{65, 70}|{'a', 'b'}|true|'Don't Play'
+7|overcast|64|65|{64, 65}|{'a', 'b'}|true|'Play'
+8|sunny|72|95|{72, 95}|{'c', 'd'}|false|'Don't Play'
+9|sunny|69|70|{69, 70}|{'a', 'b'}|false|'Play'
+10|rain|75|80|{75, 80}|{'a', 'b'}|false|'Play'
+11|sunny|75|70|{75, 70}|{'c', 'd'}|true|'Play'
+12|overcast|72|90|{72, 90}|{'a', 'b'}|true|'Play'
+13|overcast|81|75|{81, 75}|{'a', 'b'}|false|'Play'
+14|rain|71|80|{71, 80}|{'a', 'b'}|true|'Don't Play'
 \\.
 </pre>
 
@@ -525,7 +537,7 @@ SELECT madlib.tree_train('dt_golf',         -- source table
                          'train_output',    -- output model table
                          'id',              -- id column
                          'class',           -- response
-                         '"OUTLOOK", temperature, humidity, windy',   -- 
features
+                         '"OUTLOOK", "Cont_features", windy',   -- features
                          NULL::text,        -- exclude columns
                          'gini',            -- split criterion
                          NULL::text,        -- no grouping
@@ -544,26 +556,26 @@ SELECT madlib.tree_predict('train_output',          -- tree model
                            'dt_golf',               -- new data table
                            'prediction_results',    -- output table
                            'response');             -- show prediction
-SELECT * FROM prediction_results ORDER BY id;
+SELECT g.id, class, estimated_class FROM prediction_results p, dt_golf g where p.id = g.id ORDER BY g.id;
 </pre>
 Result:
 <pre class="result">
- id | estimated_class
-----+-----------------
-  1 | 'Don't Play'
-  2 | 'Don't Play'
-  3 | 'Play'
-  4 | 'Play'
-  5 | 'Play'
-  6 | 'Don't Play'
-  7 | 'Play'
-  8 | 'Don't Play'
-  9 | 'Play'
- 10 | 'Play'
- 11 | 'Play'
- 12 | 'Play'
- 13 | 'Play'
- 14 | 'Don't Play'
+ id |    class     | estimated_class
+----+--------------+-----------------
+  1 | 'Don't Play' | 'Don't Play'
+  2 | 'Don't Play' | 'Don't Play'
+  3 | 'Play'       | 'Play'
+  4 | 'Play'       | 'Play'
+  5 | 'Play'       | 'Play'
+  6 | 'Don't Play' | 'Don't Play'
+  7 | 'Play'       | 'Play'
+  8 | 'Don't Play' | 'Don't Play'
+  9 | 'Play'       | 'Play'
+ 10 | 'Play'       | 'Play'
+ 11 | 'Play'       | 'Play'
+ 12 | 'Play'       | 'Play'
+ 13 | 'Play'       | 'Play'
+ 14 | 'Don't Play' | 'Don't Play'
 (14 rows)
 </pre>
 
@@ -575,41 +587,41 @@ Result:
 <pre class="result">
 &nbsp;-------------------------------------
 &nbsp;- Each node represented by 'id' inside ().
-&nbsp;- Leaf nodes have a * while internal nodes have the split condition at 
the end.
-&nbsp;- For each internal node (i), it's children will be at (2i+1) and (2i+2).
-&nbsp;- For each split the first indented child (2i+1) is the 'True' node and
-second indented child (2i+2) is the 'False' node.
-&nbsp;- Number of (weighted) rows for each response variable inside [].
-&nbsp;- Order of values = ['"Don\'t Play"', '"Play"']
+&nbsp;- Each internal nodes has the split condition at the end, while each
+        leaf node has a * at the end.
+&nbsp;- For each internal node (i), its child nodes are indented by 1 level
+        with ids (2i+1) for True node and (2i+2) for False node.
+&nbsp;- Number of (weighted) rows for each response variable inside [].'
+        The response label order is given as ['"\'Don\'t Play\'"', 
'"\'Play\'"'].
+        For each leaf, the prediction is given after the '-->'
 &nbsp;-------------------------------------
-(0)[ 5 9]  "OUTLOOK"<={overcast}
-  (1)[ 0 4]  *
-  (2)[ 5 5]  temperature<=75
-    (5)[ 3 5]  temperature<=65
-      (11)[ 1 0]  *
-      (12)[ 2 5]  temperature<=70
-        (25)[ 0 3]  *
-        (26)[ 2 2]  temperature<=72
-          (53)[ 2 0]  *
-          (54)[ 0 2]  *
-    (6)[ 2 0]  *
+(0)[5 9]  "OUTLOOK" in {overcast}
+   (1)[0 4]  * --> "'Play'"
+   (2)[5 5]  "Cont_features"[1] <= 75
+      (5)[3 5]  "Cont_features"[1] <= 65
+         (11)[1 0]  * --> "'Don't Play'"
+         (12)[2 5]  "Cont_features"[1] <= 70
+            (25)[0 3]  * --> "'Play'"
+            (26)[2 2]  "Cont_features"[1] <= 72
+               (53)[2 0]  * --> "'Don't Play'"
+               (54)[0 2]  * --> "'Play'"
+      (6)[2 0]  * --> "'Don't Play'"
 &nbsp;-------------------------------------
 </pre>
 Here are some more details on how to interpret the tree display above...
-Node numbering starts at 0 for the root node and would be
+  - Node numbering starts at 0 for the root node and would be
 contiguous 1,2...n if the tree was completely full (no pruning).
-Since the tree has been pruned, the node numbering is not
+  - Since the tree has been pruned, the node numbering is not
 contiguous.
-The order of values [x y] indicates the number of weighted
+  - The order of values [x y] indicates the number of weighted
 rows that correspond to ["Don't play" "Play"] <em>before</em> the node test.
-For example, at the root node 0, there are 5 rows that "Don't play"
-and 9 rows that "Play" in the raw data.
-If we apply the test
-of "OUTLOOK" being overcast, then the True result is
+For example, at (root) node 0, there are 5 rows that are "Don't play"
+and 9 rows that are "Play" in the raw data.
+  - If we apply the test of "OUTLOOK" being overcast, then the True result is
 leaf node 1 which is "Play".  There are 0 "Don't play" rows
 and 4 "Play" rows that correspond to this case (overcast).
 The remaining 5 "Don't play" rows and 5 "Play rows" are then
-tested at node 2 on temperature<=75.  And so on down the tree.
+tested at node 2 on "Cont_features"[1]<=75.  And so on down the tree.
 
 -# Create a dot format display of the tree:
 <pre class="example">
@@ -620,27 +632,27 @@ Result:
 digraph "Classification tree for dt_golf" {
          subgraph "cluster0"{
          label=""
-"g0_0" [label="\"OUTLOOK\"<={overcast}", shape=ellipse];
+"g0_0" [label="\"OUTLOOK\" <= overcast", shape=ellipse];
 "g0_0" -> "g0_1"[label="yes"];
-"g0_1" [label="\"Play\"",shape=box];
+"g0_1" [label="\"'Play'\"",shape=box];
 "g0_0" -> "g0_2"[label="no"];
-"g0_2" [label="temperature<=75", shape=ellipse];
+"g0_2" [label="\"Cont_features\"[1] <= 75", shape=ellipse];
 "g0_2" -> "g0_5"[label="yes"];
 "g0_2" -> "g0_6"[label="no"];
-"g0_6" [label="\"Don't Play\"",shape=box];
-"g0_5" [label="temperature<=65", shape=ellipse];
+"g0_6" [label="\"'Don't Play'\"",shape=box];
+"g0_5" [label="\"Cont_features\"[1] <= 65", shape=ellipse];
 "g0_5" -> "g0_11"[label="yes"];
-"g0_11" [label="\"Don't Play\"",shape=box];
+"g0_11" [label="\"'Don't Play'\"",shape=box];
 "g0_5" -> "g0_12"[label="no"];
-"g0_12" [label="temperature<=70", shape=ellipse];
+"g0_12" [label="\"Cont_features\"[1] <= 70", shape=ellipse];
 "g0_12" -> "g0_25"[label="yes"];
-"g0_25" [label="\"Play\"",shape=box];
+"g0_25" [label="\"'Play'\"",shape=box];
 "g0_12" -> "g0_26"[label="no"];
-"g0_26" [label="temperature<=72", shape=ellipse];
+"g0_26" [label="\"Cont_features\"[1] <= 72", shape=ellipse];
 "g0_26" -> "g0_53"[label="yes"];
-"g0_53" [label="\"Don't Play\"",shape=box];
+"g0_53" [label="\"'Don't Play'\"",shape=box];
 "g0_26" -> "g0_54"[label="no"];
-"g0_54" [label="\"Play\"",shape=box];
+"g0_54" [label="\"'Play'\"",shape=box];
 &nbsp;&nbsp;&nbsp;} //--- end of subgraph------------
 &nbsp;} //---end of digraph---------
 </pre>
@@ -654,31 +666,44 @@ Result:
 digraph "Classification tree for dt_golf" {
          subgraph "cluster0"{
          label=""
-"g0_0" [label="\"OUTLOOK\" in {overcast}\\n impurity = 0.459184\\n samples = 
14\\n value = [5 9]\\n class = \"'Play'\"", shape=ellipse];
+"g0_0" [label="\"OUTLOOK\" <= overcast\\n impurity = 0.459184\\n samples = 
14\\n value = [5 9
+]\\n class = \"'Play'\"", shape=ellipse];
 "g0_0" -> "g0_1"[label="yes"];
-"g0_1" [label="\"'Play'\"\\n samples = 4\\n value = [0 4]",shape=box];
+"g0_1" [label="\"'Play'\"\\n impurity = 0\\n samples = 4\\n value = [0 4
+]",shape=box];
 "g0_0" -> "g0_2"[label="no"];
-"g0_2" [label="temperature <= 75\\n impurity = 0.5\\n samples = 10\\n value = 
[5 5]\\n class = \"'Don't Play'\"", shape=ellipse];
+"g0_2" [label="\"Cont_features\"[1] <= 75\\n impurity = 0.5\\n samples = 10\\n 
value = [5 5
+]\\n class = \"'Don't Play'\"", shape=ellipse];
 "g0_2" -> "g0_5"[label="yes"];
 "g0_2" -> "g0_6"[label="no"];
-"g0_6" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
-"g0_5" [label="temperature <= 65\\n impurity = 0.46875\\n samples = 8\\n value 
= [3 5]\\n class = \"'Play'\"", shape=ellipse];
+"g0_6" [label="\"'Don't Play'\"\\n impurity = 0\\n samples = 2\\n value = [2 0
+]",shape=box];
+"g0_5" [label="\"Cont_features\"[1] <= 65\\n impurity = 0.46875\\n samples = 
8\\n value = [3 5
+]\\n class = \"'Play'\"", shape=ellipse];
 "g0_5" -> "g0_11"[label="yes"];
-"g0_11" [label="\"'Don't Play'\"\\n samples = 1\\n value = [1 0]",shape=box];
+"g0_11" [label="\"'Don't Play'\"\\n impurity = 0\\n samples = 1\\n value = [1 0
+]",shape=box];
 "g0_5" -> "g0_12"[label="no"];
-"g0_12" [label="temperature <= 70\\n impurity = 0.408163\\n samples = 7\\n 
value = [2 5]\\n class = \"'Play'\"", shape=ellipse];
+"g0_12" [label="\"Cont_features\"[1] <= 70\\n impurity = 0.408163\\n samples = 
7\\n value = [2 5
+]\\n class = \"'Play'\"", shape=ellipse];
 "g0_12" -> "g0_25"[label="yes"];
-"g0_25" [label="\"'Play'\"\\n samples = 3\\n value = [0 3]",shape=box];
+"g0_25" [label="\"'Play'\"\\n impurity = 0\\n samples = 3\\n value = [0 3
+]",shape=box];
 "g0_12" -> "g0_26"[label="no"];
-"g0_26" [label="temperature <= 72\\n impurity = 0.5\\n samples = 4\\n value = 
[2 2]\\n class = \"'Don't Play'\"", shape=ellipse];
+"g0_26" [label="\"Cont_features\"[1] <= 72\\n impurity = 0.5\\n samples = 4\\n 
value = [2 2
+]\\n class = \"'Don't Play'\"", shape=ellipse];
 "g0_26" -> "g0_53"[label="yes"];
-"g0_53" [label="\"'Don't Play'\"\\n samples = 2\\n value = [2 0]",shape=box];
+"g0_53" [label="\"'Don't Play'\"\\n impurity = 0\\n samples = 2\\n value = [2 0
+]",shape=box];
 "g0_26" -> "g0_54"[label="no"];
-"g0_54" [label="\"'Play'\"\\n samples = 2\\n value = [0 2]",shape=box];
+"g0_54" [label="\"'Play'\"\\n impurity = 0\\n samples = 2\\n value = [0 2
+]",shape=box];
 &nbsp;&nbsp;&nbsp;} //--- end of subgraph------------
 &nbsp;} //---end of digraph---------
 </pre>
-The additional information in each node is: impurity, sample size, number of 
weighted rows for each response variable, and classification if the tree was 
pruned at this level.
+The additional information in each node is: impurity, sample size, number of
+weighted rows for each response variable, and classification if the tree was
+pruned at this level.
 
 <h4>Decision Tree Regression Example</h4>
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/de71bd7e/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index dd861a0..b135108 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -239,25 +239,29 @@ CREATE TABLE dt_golf (
     "OUTLOOK" text,
     temperature double precision,
     humidity double precision,
+    "Cont_features" double precision[],
+    cat_features text[],
     windy boolean,
     class text
 ) ;
 
-INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,windy,class) VALUES
-(1, 'sunny', 85, 85, false, 'Don''t Play'),
-(2, 'sunny', 80, 90, true, 'Don''t Play'),
-(3, 'overcast', 83, 78, false, 'Play'),
-(4, 'rain', 70, 96, false, 'Play'),
-(5, 'rain', 68, 80, false, 'Play'),
-(6, 'rain', 65, 70, true, 'Don''t Play'),
-(7, 'overcast', 64, 65, true, 'Play'),
-(8, 'sunny', 72, 95, false, 'Don''t Play'),
-(9, 'sunny', 69, 70, false, 'Play'),
-(10, 'rain', 75, 80, false, 'Play'),
-(11, 'sunny', 75, 70, true, 'Play'),
-(12, 'overcast', 72, 90, true, 'Play'),
-(13, 'overcast', 81, 75, false, 'Play'),
-(14, 'rain', 71, 80, true, 'Don''t Play');
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy,class) VALUES
+(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
+(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
+(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
 
 -- no grouping
 DROP TABLE IF EXISTS train_output, train_output_summary;
@@ -265,7 +269,7 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          'train_output'::text,    -- output model table
                          'id'::text,              -- id column
                          'temperature::double precision'::text,           -- 
response
-                         'humidity, windy'::text,   -- features
+                         'humidity, windy, "Cont_features"'::text,   -- features
                          NULL::text,        -- exclude columns
                          'gini'::text,      -- split criterion
                          NULL::text,        -- no grouping
@@ -287,7 +291,7 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          'train_output'::text,    -- output model table
                          'id'::text,              -- id column
                          'temperature::double precision'::text,           -- 
response
-                         '"OUTLOOK", humidity, windy'::text,   -- features
+                         '"OUTLOOK", humidity, windy, cat_features'::text,   -- features
                          NULL::text,        -- exclude columns
                          'gini'::text,      -- split criterion
                          'class'::text,     -- grouping
@@ -302,13 +306,17 @@ SELECT tree_train('dt_golf'::text,         -- source table
 SELECT _print_decision_tree(tree) from train_output;
 SELECT tree_display('train_output', False);
 
+
+-- testing tree_predict with a category not present in training table
 CREATE TABLE dt_golf2 as
 SELECT * FROM dt_golf
 UNION
 SELECT 15 as id, 'humid' as "OUTLOOK", 71 as temperature, 80 as humidity,
-        true as windy, 'Don''t Play' as class;
-SELECT tree_predict('train_output', 'dt_golf2', 'predict_output');
+       ARRAY[90, 90] as "Cont_features", ARRAY['b', 'c'] as cat_features,
+       true as windy, 'Don''t Play' as class;
 \x off
+SELECT * FROM dt_golf2;
+SELECT tree_predict('train_output', 'dt_golf2', 'predict_output');
 SELECT *
 FROM
     predict_output
@@ -326,7 +334,7 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          'train_output'::text,    -- output model table
                          'id'::text,              -- id column
                          'temperature::double precision'::text,           -- 
response
-                         'humidity, windy'::text,   -- features
+                         '"OUTLOOK", cat_features, "Cont_features"'::text,   -- features
                          NULL::text,        -- exclude columns
                          'mse'::text,      -- split criterion
                          NULL::text,        -- no grouping
@@ -383,7 +391,7 @@ select __build_tree(
     4,
     2,
     1,
-    5,
+    3,
     'group_cp',
     0::smallint,
     'notice',

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/de71bd7e/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
index f3ad93c..8aec1f0 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/random_forest.sql_in
@@ -4,28 +4,29 @@ CREATE TABLE dt_golf (
     "OUTLOOK" text,
     temperature double precision,
     humidity double precision,
-    cont_features double precision[],
+    "Cont_features" double precision[],
+    cat_features text[],
     windy boolean,
     class text
 ) ;
 
-INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,cont_features,windy,class) VALUES
-(1, 'sunny', 85, 85,ARRAY[85, 85], false, 'Don''t Play'),
-(2, 'sunny', 80, 90,ARRAY[80, 90], true, 'Don''t Play'),
-(3, 'overcast', 83, 78,ARRAY[83, 78], false, 'Play'),
-(4, 'rain', 70, NULL,ARRAY[70, 96], false, 'Play'),
-(5, 'rain', 68, 80,ARRAY[68, 80], false, 'Play'),
-(6, 'rain', NULL, 70,ARRAY[65, 70], true, 'Don''t Play'),
-(7, 'overcast', 64, 65,ARRAY[64, 65],NULL, 'Play'),
-(8, 'sunny', 72, 95,ARRAY[72, 95], false, 'Don''t Play'),
-(9, 'sunny', 69, 70,ARRAY[69, 70], false, 'Play'),
-(10, 'rain', 75, 80,ARRAY[75, 80], false, 'Play'),
-(11, 'sunny', 75, 70,ARRAY[75, 70], true, 'Play'),
-(12, 'overcast', 72, 90,ARRAY[72, 90], NULL, 'Play'),
-(13, 'overcast', 81, 75,ARRAY[81, 75], false, 'Play'),
-(15, NULL, 81, 75,ARRAY[81, 75], false, 'Play'),
-(16, 'overcast', NULL, 75,ARRAY[81, 75], false, 'Play'),
-(14, 'rain', 71, 80,ARRAY[71, 80], true, 'Don''t Play');
+INSERT INTO dt_golf (id,"OUTLOOK",temperature,humidity,"Cont_features",cat_features, windy,class) VALUES
+(1, 'sunny', 85, 85,ARRAY[85, 85], ARRAY['a', 'b'], false, 'Don''t Play'),
+(2, 'sunny', 80, 90, ARRAY[80, 90], ARRAY['a', 'b'], true, 'Don''t Play'),
+(3, 'overcast', 83, 78, ARRAY[83, 78], ARRAY['a', 'b'], false, 'Play'),
+(4, 'rain', 70, NULL, ARRAY[70, 96], ARRAY['a', 'b'], false, 'Play'),
+(5, 'rain', 68, 80, ARRAY[68, 80], ARRAY['a', 'b'], false, 'Play'),
+(6, 'rain', NULL, 70, ARRAY[65, 70], ARRAY['a', 'b'], true, 'Don''t Play'),
+(7, 'overcast', 64, 65, ARRAY[64, 65], ARRAY['c', 'b'], NULL , 'Play'),
+(8, 'sunny', 72, 95, ARRAY[72, 95], ARRAY['a', 'b'], false, 'Don''t Play'),
+(9, 'sunny', 69, 70, ARRAY[69, 70], ARRAY['a', 'b'], false, 'Play'),
+(10, 'rain', 75, 80, ARRAY[75, 80], ARRAY['a', 'b'], false, 'Play'),
+(11, 'sunny', 75, 70, ARRAY[75, 70], ARRAY['a', 'd'], true, 'Play'),
+(12, 'overcast', 72, 90, ARRAY[72, 90], ARRAY['c', 'b'], NULL, 'Play'),
+(13, 'overcast', 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(15, NULL, 81, 75, ARRAY[81, 75], ARRAY['a', 'b'], false, 'Play'),
+(16, 'overcast', NULL, 75, ARRAY[81, 75], ARRAY['a', 'd'], false, 'Play'),
+(14, 'rain', 71, 80, ARRAY[71, 80], ARRAY['c', 'b'], true, 'Don''t Play');
 
 -------------------------------------------------------------------------
 DROP TABLE IF EXISTS train_output, train_output_summary, train_output_group;
@@ -34,7 +35,7 @@ SELECT forest_train(
                   'train_output'::TEXT,    -- output model table
                   'id'::TEXT,              -- id column
                   'class'::TEXT,           -- response
-                  'windy, cont_features[1]'::TEXT,   -- features
+                  'windy, "Cont_features"[1]'::TEXT,   -- features
                   NULL::TEXT,        -- exclude columns
                   NULL::TEXT,        -- no grouping
                   5,                -- num of trees
@@ -60,7 +61,7 @@ SELECT forest_train(
                   'train_output'::TEXT,    -- output model table
                   'id'::TEXT,              -- id column
                   'temperature::double precision'::TEXT,           -- response
-                  'humidity, windy'::TEXT,   -- features
+                  'humidity, cat_features, windy, "Cont_features"'::TEXT,   -- features
                   NULL::TEXT,        -- exclude columns
                   'class',          -- grouping
                   5,                -- num of trees
@@ -225,7 +226,7 @@ SELECT forest_train(
                   'train_output'::TEXT,    -- output model table
                   'id'::TEXT,              -- id column
                   'class'::TEXT,           -- response
-                  'humidity, temperature'::TEXT,   -- features
+                  '"Cont_features", humidity, temperature'::TEXT,   -- features
                   NULL::TEXT,        -- exclude columns
                   NULL::TEXT,        -- no grouping
                   5,                -- num of trees
@@ -237,7 +238,7 @@ SELECT forest_train(
                   1::INTEGER,        -- min bucket
                   3::INTEGER,        -- number of bins per continuous variable
                   'max_surrogates=0',
-                  TRUE,
+                  FALSE,
                   .5
                   );
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/de71bd7e/src/ports/postgres/modules/utilities/utilities.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in
index 126f4e6..6a5e8f9 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -202,13 +202,15 @@ def _string_to_array_with_quotes(s):
 # ------------------------------------------------------------------------
 
 
-def py_list_to_sql_string(array, array_type=None):
+def py_list_to_sql_string(array, array_type=None, long_format=None):
     """Convert a list to SQL array string """
-    long_format = True
-    if (array_type and
-            (any(i in array_type
-                 for i in ["text", "varchar", "character varying"]))):
-        long_format = False
+    if long_format is None:
+        if (array_type is not None and
+                (any(array_type.startswith(i)
+                     for i in ["text", "varchar", "character varying"]))):
+            long_format = False
+        else:
+            long_format = True
     if not array_type:
         array_type = "double precision[]"
     else:
@@ -228,7 +230,9 @@ def _array_to_string(origin):
     """
     Convert an array to string
     """
-    return "{" + ",".join(map(str, origin)) + "}"
+    def _escape(s):
+        return re.sub(r'"', r'\"', str(s))
+    return "{" + ",".join(map(_escape, origin)) + "}"
 # ------------------------------------------------------------------------
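
To make the utilities change concrete, the updated _array_to_string (copied
from the hunk above, with the re import it depends on) now escapes embedded
double quotes, so quoted identifiers such as the expanded array features can
be written into a PostgreSQL array literal; py_list_to_sql_string additionally
gains an optional long_format argument that lets callers override the
text-type heuristic. The sample input below is illustrative only.

    import re

    def _array_to_string(origin):
        """Convert a Python list to a PostgreSQL array literal string."""
        def _escape(s):
            return re.sub(r'"', r'\"', str(s))
        return "{" + ",".join(map(_escape, origin)) + "}"

    # Each embedded " becomes \" in the literal:
    # {\"Cont_features\"[1],\"Cont_features\"[2],windy}
    print(_array_to_string(['"Cont_features"[1]', '"Cont_features"[2]', 'windy']))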
 
 
