Repository: incubator-madlib
Updated Branches:
  refs/heads/master a1f980336 -> a2f474007


DT: Add option to treat NULL as category

This commit adds an option to treat NULL as a level in categorical
features. The level is added as a proxy string (instead of a NULL value)
to ensure MADlib arrays don't contain NULLs during the binning
procedure.
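
To illustrate the idea (a minimal Python sketch of the concept, not the
MADlib implementation itself; the proxy string and helper name are
illustrative only): substituting a proxy string for NULL keeps the level
arrays NULL-free while still letting NULL act as its own category.

    # Minimal sketch of the null-proxy idea; names are illustrative only.
    NULL_PROXY = "__NULL__"

    def levels_with_null(values):
        """Collect distinct levels, substituting the proxy string for None.

        The proxy is appended last so that, once levels are ordered, it
        never defines a split boundary.
        """
        levels = sorted({v for v in values if v is not None})
        if any(v is None for v in values):
            levels.append(NULL_PROXY)
        return levels

    print(levels_with_null(['US', None, 'US', 'IN']))  # ['IN', 'US', '__NULL__']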

Closes #156


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/a2f47400
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/a2f47400
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/a2f47400

Branch: refs/heads/master
Commit: a2f47400705da784f9a727d8b926da34c84465a2
Parents: a1f9803
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue Jul 18 14:04:37 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed Aug 2 20:52:27 2017 -0700

----------------------------------------------------------------------
 src/modules/recursive_partitioning/DT_impl.hpp  |   2 +-
 .../recursive_partitioning/decision_tree.py_in  | 187 +++++++++++++------
 .../recursive_partitioning/decision_tree.sql_in | 108 ++++++++++-
 .../recursive_partitioning/random_forest.py_in  |  38 ++--
 .../recursive_partitioning/random_forest.sql_in |  96 ++++++----
 .../test/decision_tree.sql_in                   |   2 +-
 6 files changed, 309 insertions(+), 124 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/modules/recursive_partitioning/DT_impl.hpp
----------------------------------------------------------------------
diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp
index 27bc647..808e48c 100644
--- a/src/modules/recursive_partitioning/DT_impl.hpp
+++ b/src/modules/recursive_partitioning/DT_impl.hpp
@@ -1181,7 +1181,7 @@ DecisionTree<Container>::displayInternalNode(
         label_str << escape_quotes(feature_name) << " <= " << feature_thresholds(id);
     } else {
         feature_name = get_text(cat_features_str, feature_indices(id));
-        label_str << escape_quotes(feature_name) << " = ";
+        label_str << escape_quotes(feature_name) << " <= ";
 
         // Text for all categoricals are stored in a flat array (cat_levels_text);
         // find the appropriate index for this node

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 7b724c1..d82936a 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -246,11 +246,13 @@ SELECT {schema_madlib}.tree_train(
                                     'cp' - complexity parameter with default=0.01,
                                     'n_folds' - number of cross-validation folds
                                         with default value of 0 (= no cross-validation)
-    surrogate_params,       -- A comma-separated text containing
-                                key=value pairs of parameters for computing surrogate splits.
+    null_handling_params,   -- A comma-separated text containing
+                                key=value pairs of parameters for handling NULL values.
                                 Parameters accepted:
                                     'max_surrogates' - Maximum number of surrogates to
                                         compute for each split
+                                    'null_as_category' - Boolean to indicate if
+                                        NULL should be treated as a special category
     verbose                 -- Boolean, whether to print more info, default is False
 );
 
@@ -291,6 +293,9 @@ The output summary table ('output_table_summary') has the following columns:
     input_cp                -- The complexity parameter (cp) used for pruning the
                                  trained tree(s) (before cross-validation is run)
     independent_var_types   -- The types of independent variables, comma-separated
+    k                       -- Number of folds (NULL if not using cross validation)
+    null_proxy              -- String used as replacement for NULL values
+                                (NULL if null_as_category = False)
 
         """
     elif message.lower().strip() in ['example', 'examples']:
@@ -371,7 +376,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
                      con_features, n_bins, boolean_cats,
                      min_split, min_bucket, weights,
                      max_depth, grp_key_to_cp, compute_cp_list=False,
-                     max_n_surr=0, **kwargs):
+                     max_n_surr=0, null_proxy=None, **kwargs):
     """
         Args:
             grp_key_to_cp : Dictionary, mapping from group key to the cp value
@@ -419,7 +424,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
         bins = _get_bins(schema_madlib, training_table_name, cat_features,
                          ordered_cat_features, con_features, n_bins,
                          dep_var_str, boolean_cats, n_rows, is_classification,
-                         dep_n_levels, filter_dep)
+                         dep_n_levels, filter_dep, null_proxy)
         # some features may be dropped if they have only one value
         cat_features = bins['cat_features']
 
@@ -446,7 +451,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion,
                                       boolean_cats, grouping_cols,
                                       grouping_array_str, n_rows,
                                       is_classification, dep_n_levels,
-                                      filter_dep)
+                                      filter_dep, null_proxy)
                 cat_features = bins['cat_features']
 
                 # 3b) Load each group's tree state in memory and set to the initial tree
@@ -507,7 +512,8 @@ def _build_tree(schema_madlib, is_classification, split_criterion,
                 cat_features, ordered_cat_features,
                 boolean_cats, con_features, grouping_cols,
                 weights, max_depth, min_split, min_bucket, n_bins,
-                cp_table, max_n_surr=0, msg_level="warning", k=0, **kwargs):
+                cp_table, max_n_surr=0, null_as_category=False,
+                msg_level="warning", k=0, **kwargs):
 
     compute_cp_list = False
     if grouping_cols:
@@ -545,7 +551,7 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
                list_of_features_to_exclude, split_criterion,
                grouping_cols, weights, max_depth,
                min_split, min_bucket, n_bins, pruning_params,
-               surrogate_params, verbose_mode, **kwargs):
+               null_handling_params, verbose_mode, **kwargs):
     """ Decision tree main training function
 
     Args:
@@ -589,10 +595,18 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
     pruning_param_dict = _extract_pruning_params(pruning_params)
     cp = pruning_param_dict['cp']
     n_folds = pruning_param_dict['n_folds']
-    surrogate_param_dict = extract_keyvalue_params(surrogate_params,
-                                                   dict(max_surrogates=int),
-                                                   dict(max_surrogates=0))
-    max_n_surr = surrogate_param_dict['max_surrogates']
+
+    # null handling parameters: max_n_surr and null_as_category
+    null_handling_dict = extract_keyvalue_params(
+        null_handling_params,
+        dict(max_surrogates=int, null_as_category=bool),
+        dict(max_surrogates=0, null_as_category=False))
+    max_n_surr = null_handling_dict['max_surrogates']
+    null_as_category = null_handling_dict['null_as_category']
+    null_proxy = "__NULL__" if null_as_category else None
+    if null_as_category:
+        # can't have two ways of handling tuples with NULL values
+        max_n_surr = 0
     _assert(max_n_surr >= 0,
             "Maximum number of surrogates ({0}) should be non-negative".
             format(max_n_surr))
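
The extraction above reduces to the following behavior (a simplified
sketch; the real code delegates the parsing to MADlib's
extract_keyvalue_params helper):

    # Simplified sketch of the null_handling_params semantics shown above.
    def parse_null_handling(params):
        parsed = {'max_surrogates': 0, 'null_as_category': False}
        if params:
            for pair in params.split(','):
                key, _, value = pair.partition('=')
                key = key.strip()
                if key == 'max_surrogates':
                    parsed[key] = int(value)
                elif key == 'null_as_category':
                    parsed[key] = value.strip().lower() == 'true'
        max_n_surr = parsed['max_surrogates']
        null_proxy = "__NULL__" if parsed['null_as_category'] else None
        if parsed['null_as_category']:
            # surrogate splits and null-as-category are mutually exclusive
            max_n_surr = 0
        return max_n_surr, null_proxy

    # null_as_category wins over max_surrogates:
    print(parse_null_handling('max_surrogates=2, null_as_category=True'))
    # (0, '__NULL__')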
@@ -650,7 +664,8 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name,
                           id_col_name, dependent_variable,
                           is_classification, n_all_rows, n_rows, dep_list, cp,
                           all_cols_types, grouping_cols=None,
-                          use_existing_tables=False, running_cv=False, k=0, **kwargs):
+                          use_existing_tables=False, running_cv=False, k=0,
+                          null_proxy=None, **kwargs):
     if not grouping_cols:
         _create_result_table(schema_madlib, tree_states[0],
                              bins['cat_origin'], bins['cat_n'], cat_features,
@@ -667,8 +682,8 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name,
         schema_madlib, split_criterion, training_table_name,
         output_table_name, id_col_name, cat_features, con_features,
         dependent_variable, failed_groups, is_classification, n_all_rows,
-        n_rows, dep_list, all_cols_types, cp, grouping_cols, 1, use_existing_tables,
-        running_cv, k)
+        n_rows, dep_list, all_cols_types, cp, grouping_cols, 1,
+        use_existing_tables, running_cv, k, null_proxy)
 # -------------------------------------------------------------------------
 
 
@@ -715,7 +730,8 @@ def _is_dep_categorical(training_table_name, dependent_variable):
 def _get_bins(schema_madlib, training_table_name,
               cat_features, ordered_cat_features,
               con_features, n_bins, dependent_variable, boolean_cats, n_rows,
-              is_classification, dep_n_levels, filter_null):
+              is_classification, dep_n_levels, filter_null_dep,
+              null_proxy=None):
     """ Compute the bins of all features
 
     @param training_table_name Data source table
@@ -760,7 +776,7 @@ def _get_bins(schema_madlib, training_table_name,
                 FROM (
                     SELECT *, random() AS rand
                     FROM {training_table_name}
-                    WHERE {filter_null}
+                    WHERE {filter_null_dep}
                     AND not {schema_madlib}.array_contains_null({con_features_str})
                 ) subq
                 WHERE rand <= {percentage}
@@ -808,10 +824,24 @@ def _get_bins(schema_madlib, training_table_name,
             # For regressions
             order_fun = "AVG({0})".format(dependent_variable)
 
-        # Note that 'sql_cat_levels' goes through two levels of formatting
+        # Note that 'sql_cat_levels' goes through two levels of string formatting
         # Try to obtain all the levels in one scan of the table.
         # () are needed when casting the categorical variables because
         # they can be expressions.
+
+        filter_str = filter_null_dep + " AND {col} IS NOT NULL"
+
+        if null_proxy is None:
+            union_null_proxy = ""
+        else:
+            union_null_proxy = "UNION " + """
+                SELECT '{null_proxy}'::text as levels,
+                       'Infinity'::double precision as dep_avg
+                FROM {training_table_name}
+                WHERE {{col}} IS NULL
+                GROUP BY {{col}}
+                """.format(**locals())
+
         sql_cat_levels = """
             SELECT
                 '{{col_name}}'::text AS colname,
@@ -825,22 +855,30 @@ def _get_bins(schema_madlib, training_table_name,
                         ({{col}})::text AS levels,
                         {{order_fun}} AS dep_avg
                     FROM {training_table_name}
-                    WHERE {filter_null}
-                        AND {{col}} is not NULL
+                    WHERE {filter_str}
                     GROUP BY {{col}}
+                    {union_null_proxy}
                 ) s
             ) s1
             WHERE array_upper(levels, 1) > 1
             """.format(training_table_name=training_table_name,
-                       filter_null=filter_null)
+                       filter_str=filter_str,
+                       union_null_proxy=union_null_proxy)
+
+        all_col_expressions = {}
+        for col in cat_features:
+            if col in boolean_cats:
+                all_col_expressions[col] = ("(CASE WHEN " + col +
+                                            " THEN 'True' ELSE 'False' END)")
+            else:
+                all_col_expressions[col] = col
 
         sql_all_cats = ' UNION '.join(
             sql_cat_levels.format(
-                col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
-                    if col in boolean_cats else col,
-                col_name=col,
-                order_fun=col if col in ordered_cat_features else order_fun)
-            for col in cat_features)
+                col=expr,
+                col_name=col_name,
+                order_fun=expr if col_name in ordered_cat_features else order_fun)
+            for col_name, expr in all_col_expressions.items())
         all_levels = plpy.execute(sql_all_cats)
 
         if len(all_levels) != len(cat_features):
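
For one categorical column the assembled statement has roughly the
following shape (an illustration only: 'training_table', 'country', and
'response' are hypothetical names, and the regression case is shown so
order_fun is an AVG):

    # Rough shape of the per-column levels query built above, with
    # null_proxy = '__NULL__'. The UNION branch contributes the proxy
    # level with an 'Infinity' sort key so that it always orders last.
    sql_cat_levels = """
        SELECT 'country'::text AS colname,
               array_agg(levels ORDER BY dep_avg) AS levels
        FROM (
            SELECT (country)::text AS levels,
                   AVG(response) AS dep_avg
            FROM training_table
            WHERE response IS NOT NULL AND country IS NOT NULL
            GROUP BY country
            UNION
            SELECT '__NULL__'::text AS levels,
                   'Infinity'::double precision AS dep_avg
            FROM training_table
            WHERE country IS NULL
            GROUP BY country
        ) s
    """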
@@ -925,7 +963,7 @@ def _get_bins_grps(
         schema_madlib, training_table_name, cat_features, ordered_cat_features,
         con_features, n_bins, dependent_variable, boolean_cats,
         grouping_cols, grouping_array_str, n_rows, is_classification,
-        dep_n_levels, filter_null):
+        dep_n_levels, filter_null_dep, null_proxy=None):
     """ Compute the bins for all features in each group
 
     @brief Similar to _get_bins except that this is for multiple groups.
@@ -974,14 +1012,14 @@ def _get_bins_grps(
                     (
                         SELECT *
                         FROM {training_table_name}
-                        WHERE {filter_null}
+                        WHERE {filter_null_dep}
                         AND not {schema_madlib}.array_contains_null({con_features_str})
                     ) src
                     JOIN
                     (
                         SELECT {grouping_cols}, count(*) AS {grp_size_str}
                         FROM {training_table_name}
-                        WHERE {filter_null}
+                        WHERE {filter_null_dep}
                             AND not {schema_madlib}.array_contains_null({con_features_str})
                         GROUP BY {grouping_cols}
                     ) grp_info
@@ -1024,6 +1062,18 @@ def _get_bins_grps(
         else:
             order_fun = "avg({0})".format(dependent_variable)
 
+        filter_str = filter_null_dep + " AND {col} IS NOT NULL"
+        if null_proxy is None:
+            union_null_proxy = ""
+        else:
+            union_null_proxy = "UNION " + """
+                SELECT '{null_proxy}'::text as levels,
+                       'Infinity'::double precision as dep_avg
+                FROM {training_table_name}
+                WHERE {{col}} IS NULL
+                GROUP BY {{col}}
+                """.format(**locals())
+
         sql_cat_levels = """
                 SELECT
                     colname::text,
@@ -1040,8 +1090,7 @@ def _get_bins_grps(
                             ({{col}})::text as levels,
                             {{order_fun}} as dep_avg
                         FROM {training_table_name}
-                        WHERE {filter_null}
-                            AND {{col}} is not NULL
+                        WHERE {filter_str}
                         GROUP BY {{col}}, {grouping_cols}
                     ) s
                     GROUP BY grp_key
@@ -1049,13 +1098,24 @@ def _get_bins_grps(
                 where array_upper(levels, 1) > 1
                 """.format(**locals())
 
+        all_col_expressions = {}
+        for col in cat_features:
+            if col in boolean_cats:
+                all_col_expressions[col] = ("(CASE WHEN " + col +
+                                            " THEN 'True' ELSE 'False' END)")
+            else:
+                if null_proxy is not None:
+                    all_col_expressions[col] = ("COALESCE({0}, {1})".
+                                                format(col, null_proxy))
+                else:
+                    all_col_expressions[col] = col
+
         sql_all_cats = ' UNION ALL '.join(
             sql_cat_levels.format(
-                col=("(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)"
-                     if col in boolean_cats else col),
-                col_name=col,
-                order_fun=col if col in ordered_cat_features else order_fun)
-            for col in cat_features)
+                col=expr,
+                col_name=col_name,
+                order_fun=expr if col_name in ordered_cat_features else order_fun)
+            for col_name, expr in all_col_expressions.items())
 
         all_levels = list(plpy.execute(sql_all_cats))
         all_levels.sort(key=itemgetter('grp_key'))
@@ -1100,9 +1160,16 @@ def _get_bins_grps(
 
 def get_feature_str(schema_madlib, boolean_cats,
                     cat_features, con_features,
-                    levels_str, n_levels_str):
+                    levels_str, n_levels_str,
+                    null_proxy=None):
     if len(cat_features) > 0:
-        unique_val = unique_string()
+        # null_val is the replacement for NULL in a categorical feature. If a
+        # null_proxy is set then the proxy is used to assign NULL as a valid
+        # category. If no proxy is available then NULL is replaced with a unique
+        # value. In a later step, the categorical levels are mapped to integers
+        # (1 to N). The unique value will be mapped to -1, indicating an
+        # unknown/missing value in the underlying layers.
+        null_val = unique_string() if null_proxy is None else null_proxy
         cat_features_cast = []
         for col in cat_features:
             if col in boolean_cats:
@@ -1110,8 +1177,7 @@ def get_feature_str(schema_madlib, boolean_cats,
                     "(CASE WHEN " + col + " THEN 'True' ELSE 'False' 
END)::text")
             else:
                 cat_features_cast.append(
-                    "(coalesce(" + col + "::text,'{0}')".format(unique_val) +
-                    ")::text")
+                    "(coalesce({0}::text, '{1}'))::text".format(col, null_val))
 
         cat_features_str = ("{0}._map_catlevel_to_int(array[" +
                             ", ".join(cat_features_cast) + "], {1}, {2})"
@@ -1132,7 +1198,8 @@ def get_feature_str(schema_madlib, boolean_cats,
 def _one_step(schema_madlib, training_table_name, cat_features,
               con_features, boolean_cats, bins, n_bins, tree_state, weights,
               dep_var, min_split, min_bucket, max_depth, filter_null,
-              dep_n_levels, subsample, n_random_features, max_n_surr=0):
+              dep_n_levels, subsample, n_random_features,
+              max_n_surr=0, null_proxy=None):
     """ One step of tree training
 
     @param tree_state A big double precision array that contains
@@ -1147,7 +1214,8 @@ def _one_step(schema_madlib, training_table_name, cat_features,
                                                          boolean_cats,
                                                          cat_features,
                                                          con_features,
-                                                         "$3", "$2")
+                                                         "$3", "$2",
+                                                         null_proxy)
 
     # The arguments of the aggregate (in the same order):
     # 1. current tree state, madlib.bytea8
@@ -1229,7 +1297,7 @@ def _one_step_for_grps(
         con_features, boolean_cats, bins, n_bins, tree_states, weights,
         grouping_cols, grouping_array_str, dep_var, min_split, min_bucket,
         max_depth, filter_null, dep_n_levels, subsample, n_random_features,
-        max_n_surr=0):
+        max_n_surr=0, null_proxy=None):
     """ One step of trees training with grouping support
     """
     # The function _map_catlevel_to_int maps a categorical variable value to its
@@ -1247,7 +1315,7 @@ def _one_step_for_grps(
 
     cat_features_str, con_features_str = get_feature_str(
         schema_madlib, boolean_cats, cat_features, con_features,
-        cat_levels_in_text, cat_n_levels)
+        cat_levels_in_text, cat_n_levels, null_proxy)
 
     train_apply_func = """
         {schema_madlib}._dt_apply(
@@ -1530,7 +1598,8 @@ def _create_summary_table(
         cat_features, con_features, dependent_variable,
         num_failed_groups, is_classification, n_all_rows, n_rows,
         dep_list, all_cols_types, cp, grouping_cols=None, n_groups=1,
-        use_existing_tables=False, running_cv=False, k=0):
+        use_existing_tables=False, running_cv=False, k=0, null_proxy=None):
+
     # dependent variables
     features = ', '.join(cat_features + con_features)
     if dep_list:
@@ -1553,14 +1622,15 @@ def _create_summary_table(
         cp_str = py_list_to_sql_string(cp, 'double precision')
     else:
         cp_str = str(cp) + "::double precision"
+    fold = k if running_cv else "NULL"
 
-    fold = ", " + str(k) + " as k" if running_cv else ""
     if use_existing_tables:
-        # plpy.execute("truncate " + output_table_name + "_summary")
-        header = "insert into " + add_postfix(output_table_name, "_summary") + 
" "
+        header = "INSERT INTO {0} ".format(
+            add_postfix(output_table_name, "_summary"))
     else:
-        header = "create table " + add_postfix(output_table_name, "_summary") 
+ " as "
-
+        header = "CREATE TABLE {0} AS ".format(
+            add_postfix(output_table_name, "_summary"))
+    null_proxy_str = "NULL" if null_proxy is None else null_proxy
     sql = header + """
             SELECT
                 'tree_train'::text             AS method,
@@ -1580,8 +1650,10 @@ def _create_summary_table(
                 {dep_list_str}::text           AS dependent_var_levels,
                 '{dep_type}'::text             AS dependent_var_type,
                 {cp_str}                       AS input_cp,
-                '{indep_type}'::text           AS independent_var_types
-                {fold}
+                '{indep_type}'::text           AS independent_var_types,
+                {fold}::integer                AS k,
+                '{null_proxy_str}'::text       AS null_proxy
+
         """.format(**locals())
     plpy.execute(sql)
 # ------------------------------------------------------------
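
The two new summary columns can be checked directly after training; a
quick sanity check might look like this (sketch only, assuming a model
table named 'train_output' and a PL/Python context where plpy exists):

    # Sketch: inspect the new summary columns. Note that when no proxy
    # is set, null_proxy is stored as the literal text 'NULL'.
    row = plpy.execute("""
        SELECT k, null_proxy
        FROM train_output_summary
    """)[0]
    plpy.notice("k = {0}, null_proxy = {1}".format(row['k'], row['null_proxy']))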
@@ -1683,6 +1755,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
     dep_levels = summary_elements["dependent_var_levels"]
     is_classification = summary_elements["is_classification"]
     dep_type = summary_elements['dependent_var_type']
+    null_proxy = summary_elements['null_proxy']
 
     # find which columns are of type boolean
     boolean_cats = set([key for key, value in get_cols_and_types(source)
@@ -1690,7 +1763,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response',
 
     cat_features_str, con_features_str = get_feature_str(
         schema_madlib, boolean_cats, cat_features, con_features,
-        "m.cat_levels_in_text", "m.cat_n_levels")
+        "m.cat_levels_in_text", "m.cat_n_levels", null_proxy)
 
     if use_existing_tables and table_exists(output):
         plpy.execute("truncate " + output)
@@ -2258,7 +2331,8 @@ def _tree_train_using_bins(
         cat_features, con_features, boolean_cats, n_bins, weights,
         dep_var_str, min_split, min_bucket, max_depth, filter_dep,
         dep_n_levels, is_classification, split_criterion,
-        subsample=False, n_random_features=1, max_n_surr=0, **kwargs):
+        subsample=False, n_random_features=1, max_n_surr=0, null_proxy=None,
+        **kwargs):
     """Trains a tree without grouping columns"""
     # Iterations for training the tree
     tree_state = plpy.execute(
@@ -2284,7 +2358,7 @@ def _tree_train_using_bins(
             cat_features, con_features, boolean_cats, bins,
             n_bins, tree_state, weights, dep_var_str,
             min_split, min_bucket, max_depth, filter_dep,
-            dep_n_levels, subsample, n_random_features, max_n_surr)
+            dep_n_levels, subsample, n_random_features, max_n_surr, null_proxy)
         plpy.notice("Completed training of level {0}".format(tree_depth))
 
     return tree_state
@@ -2296,7 +2370,8 @@ def _tree_train_grps_using_bins(
         boolean_cats, n_bins, weights, grouping_cols, grouping_array_str, dep_var_str,
         min_split, min_bucket, max_depth, filter_dep, dep_n_levels,
         is_classification, split_criterion, subsample=False,
-        n_random_features=1, tree_terminated=None, max_n_surr=0, **kwargs):
+        n_random_features=1, tree_terminated=None, max_n_surr=0, null_proxy=None,
+        **kwargs):
 
     """Trains a tree with grouping columns included """
     # Iterations for training the tree
@@ -2340,7 +2415,7 @@ def _tree_train_grps_using_bins(
             tree_states, weights, grouping_cols,
             grouping_array_str, dep_var_str, min_split, min_bucket,
             max_depth, filter_dep, dep_n_levels, subsample,
-            n_random_features, max_n_surr)
+            n_random_features, max_n_surr, null_proxy)
         level += 1
         plpy.notice("Finished training for level " + str(level))
 

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index e8f37f8..8290dcf 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -54,7 +54,7 @@ tree_train(
     min_bucket,
     num_splits,
     pruning_params,
-    surrogate_params,
+    null_handling_params,
     verbosity
     )
 </pre>
@@ -186,16 +186,27 @@ tree_train(
     </table>
   </DD>
 
-  <DT>surrogate_params (optional)</DT>
+  <DT>null_handling_params (optional)</DT>
   <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior
-  of surrogate splits for each node. A surrogate variable is another predictor
-  variable that is associated (correlated) with the primary predictor variable
-  for a split. The surrogate variable comes into use when the primary 
predictior
-  value is NULL. This parameter currently accepts one argument:
+  of various features handling missing values.
     <table class='output'>
     <tr>
       <th>max_surrogates</th>
       <td>Default: 0. Number of surrogates to store for each node.</td>
+      One of the approaches of handling NULLs is to use surrogate splits for 
each
+      node. A surrogate variable is another predictor variable that is 
associated
+      (correlated) with the primary split variable. The surrogate
+      variable comes into use when the primary predictior value is NULL.
+    </tr>
+    <tr>
+    <th>null_as_category</th>
+    <td>Default: FALSE. Whether to treat NULL as a special categorical value.
+
+    If this is set to TRUE, NULL values are considered a categorical value and
+    placed at the end of the ordering of categorical levels. Placing it at the
+    end ensures that NULL is never used as a value to split a node on. This
+    parameter is ignored for continuous-valued features.
+    </td>
     </tr>
     </table>
   </DD>
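
Why placing NULL last keeps it out of split predicates: ordered
categorical splits take the form "feature <= level" (see the
DT_impl.hpp change above), and candidate thresholds only range over the
first N-1 levels, since splitting on the last level would send every
row down one branch. A small sketch (level values borrowed from the
golf example; 'feature' is a placeholder name):

    # Candidate splits over ordered categorical levels. The NULL proxy,
    # ordered last, can fall on the right side of a split but is never
    # generated as a threshold itself.
    levels = ['overcast', 'rain', 'sunny', '__NULL__']  # proxy ordered last
    candidate_splits = ["feature <= '{0}'".format(lv) for lv in levels[:-1]]
    print(candidate_splits)
    # ["feature <= 'overcast'", "feature <= 'rain'", "feature <= 'sunny'"]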
@@ -901,6 +912,85 @@ Result:
 (32 rows)
 </pre>
 
+<h4>NULL handling example</h4>
+
+- Create a toy example to illustrate null-as-category handling:
+<pre class='example'>
+DROP TABLE IF EXISTS null_handling_example;
+CREATE TABLE null_handling_example (
+id integer,
+country text,
+city text,
+weather text,
+response text
+);
+&nbsp;
+INSERT INTO null_handling_example VALUES
+(1,null,null,null,'a'),
+(2,'US',null,null,'b'),
+(3,'US','NY',null,'c'),
+(4,'US','NY','rainy','d');
+&nbsp;
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('null_handling_example',  -- source table
+                         'train_output',           -- output model table
+                         'id',                     -- id column
+                         'response',               -- dependent variable
+                         'country, city, weather', -- features
+                         NULL,                     -- features to exclude
+                         'gini',                   -- split criterion
+                         NULL::text,               -- no grouping
+                         NULL::text,               -- no weights
+                         4,                        -- max depth
+                         1,                        -- min split
+                         1,                        -- min bucket
+                         10,                       -- number of splits
+                         NULL,                     -- pruning parameters
+                         'null_as_category=true'   -- null handling
+                         );
+SELECT cat_levels_in_text, cat_n_levels FROM train_output;
+</pre>
+<pre class='result'>
+            cat_levels_in_text            | cat_n_levels
+------------------------------------------+--------------
+ {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2}
+</pre>
+
+- Predict for previously unseen data by assuming NULL as the default value:
+<pre class='example'>
+DROP TABLE IF EXISTS table_test;
+CREATE TABLE table_test (
+id integer,
+country text,
+city text,
+weather text,
+expected_response text
+);
+INSERT INTO table_test VALUES
+(1,'IN','MUM','cloudy','a'),
+(2,'US','HOU','humid','b'),
+(3,'US','NY','sunny','c'),
+(4,'US','NY','rainy','d');
+&nbsp;
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',
+                           'table_test',
+                           'prediction_results',
+                           'response');
+SELECT s.id, expected_response, estimated_response
+FROM prediction_results p, table_test s
+WHERE s.id = p.id ORDER BY id;
+</pre>
+<pre class='result'>
+ id | expected_response | estimated_response
+----+-------------------+--------------------
+  1 | a                 | a
+  2 | b                 | b
+  3 | c                 | c
+  4 | d                 | d
+(4 rows)
+</pre>
+
 @anchor literature
 @literature
 [1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; Stone, C. J. (1984). Classification and regression trees. Monterey, CA: Wadsworth & Brooks/Cole Advanced Books & Software.
@@ -995,7 +1085,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     min_bucket                  INTEGER,
     n_bins                      INTEGER,
     pruning_params              TEXT,
-    surrogate_params            TEXT,
+    null_handling_params        TEXT,
     verbose_mode                BOOLEAN
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(recursive_partitioning, decision_tree)
@@ -1004,7 +1094,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
         id_col_name, dependent_variable, list_of_features,
         list_of_features_to_exclude, split_criterion, grouping_cols,
         weights, max_depth, min_split, min_bucket, n_bins, pruning_params,
-        surrogate_params, verbose_mode)
+        null_handling_params, verbose_mode)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 ------------------------------------------------------------
@@ -1602,7 +1692,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     min_bucket                  INTEGER,
     n_bins                      INTEGER,
     pruning_params              TEXT,
-    surrogate_params            TEXT
+    null_handling_params        TEXT
 ) RETURNS VOID AS $$
     -- verbose = false
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
index 05c029e..bc4bd97 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in
@@ -237,7 +237,7 @@ def forest_train(
         grouping_cols, num_trees, num_random_features,
         importance, num_permutations, max_tree_depth,
         min_split, min_bucket, num_bins,
-        surrogate_params, verbose=False, sample_ratio=None, **kwargs):
+        null_handling_params, verbose=False, sample_ratio=None, **kwargs):
     """ Random forest main training function
 
     Args:
@@ -283,15 +283,19 @@ def forest_train(
                 num_bins = 100 if num_bins is None else num_bins
                 sample_ratio = 1 if sample_ratio is None else sample_ratio
 
-                surrogate_param_dict = extract_keyvalue_params(
-                    surrogate_params,
-                    dict(max_surrogates=int),   # type of variable
-                    dict(max_surrogates=0))     # default value of variable
-                max_n_surr = surrogate_param_dict['max_surrogates']
+                null_handling_dict = extract_keyvalue_params(
+                    null_handling_params,
+                    dict(max_surrogates=int, null_as_category=bool),
+                    dict(max_surrogates=0, null_as_category=False))
+                max_n_surr = null_handling_dict['max_surrogates']
+                null_as_category = null_handling_dict['null_as_category']
+                null_proxy = "__NULL__" if null_as_category else None
+                if null_as_category:
+                    # can't have two ways of handling tuples with NULL values
+                    max_n_surr = 0
                 _assert(max_n_surr >= 0,
                         "Maximum number of surrogates ({0}) should be 
non-negative".
                         format(max_n_surr))
-
                 ##################################################################
                 # validate arguments
                 _forest_validate_args(training_table_name, output_table_name, id_col_name,
@@ -383,7 +387,7 @@ def forest_train(
                                      cat_features, ordered_cat_features,
                                      con_features, num_bins, dep,
                                      boolean_cats, n_rows, is_classification,
-                                     dep_n_levels, filter_null)
+                                     dep_n_levels, filter_null, null_proxy)
                     # some features may be dropped because they have only one value
                     cat_features = bins['cat_features']
                     bins['grp_key_cat'] = ['']
@@ -421,7 +425,7 @@ def forest_train(
                                           con_features, num_bins, dep,
                                           boolean_cats, grouping_cols,
                                           grouping_array_str, n_rows,
-                                          is_classification, dep_n_levels, filter_null)
+                                          is_classification, dep_n_levels, filter_null, null_proxy)
                     cat_features = bins['cat_features']
 
                 # a table for converting cat_features to integers
@@ -564,7 +568,7 @@ def forest_train(
                             boolean_cats, num_bins, 'poisson_count', dep, min_split,
                             min_bucket, max_tree_depth, filter_null, dep_n_levels,
                             is_classification, split_criterion, True,
-                            num_random_features, max_n_surr)
+                            num_random_features, max_n_surr, null_proxy)
                         tree_states = [dict(tree_state=tree_state['tree_state'],
                                             grp_key='')]
 
@@ -578,7 +582,7 @@ def forest_train(
                             max_tree_depth, filter_null, dep_n_levels,
                             is_classification, split_criterion, True,
                             num_random_features, tree_terminated=tree_terminated,
-                            max_n_surr=max_n_surr)
+                            max_n_surr=max_n_surr, null_proxy=null_proxy)
 
                         # If a tree for a group is terminated (not finished properly),
                         # then we do not need to compute other trees, and can just
@@ -603,7 +607,7 @@ def forest_train(
                         sample_id, id_col_name, cat_features, con_features,
                         boolean_cats, grouping_cols, grp_key_to_grp_cols, dep,
                         num_permutations, is_classification, importance,
-                        num_bins, filter_null)
+                        num_bins, filter_null, null_proxy)
 
                 ###################################################################
                 # evaluating and summarizing random forest
@@ -695,6 +699,7 @@ def forest_predict(schema_madlib, model, source, output, pred_type='response',
     dep_levels = summary_elements["dependent_var_levels"]
     is_classification = summary_elements["is_classification"]
     dep_type = summary_elements['dependent_var_type']
+    null_proxy = summary_elements['null_proxy']
 
     # pred_type='prob' is allowed only for classification
     _assert(is_classification or pred_type == 'response',
@@ -706,7 +711,7 @@ def forest_predict(schema_madlib, model, source, output, pred_type='response',
 
     cat_features_str, con_features_str = get_feature_str(
         schema_madlib, boolean_cats, cat_features, con_features,
-        "cat_levels_in_text", "cat_n_levels")
+        "cat_levels_in_text", "cat_n_levels", null_proxy)
 
     pred_name = ('"prob_{0}"' if pred_type == "prob" else
                  '"estimated_{0}"').format(dep_varname.replace('"', 
'').strip())
@@ -929,12 +934,12 @@ def _calculate_oob_prediction(
         schema_madlib, model_table, cat_features_info_table, con_splits_table,
         oob_prediction_table, oob_view, sample_id, id_col_name, cat_features,
         con_features, boolean_cats, grouping_cols, grp_key_to_grp_cols, dep,
-        num_permutations, is_classification, importance, num_bins, filter_null):
+        num_permutations, is_classification, importance, num_bins, filter_null, null_proxy=None):
     """Calculate predication for out-of-bag sample"""
 
     cat_features_str, con_features_str = get_feature_str(
         schema_madlib, boolean_cats, cat_features, con_features,
-        "cat_levels_in_text", "cat_n_levels")
+        "cat_levels_in_text", "cat_n_levels", null_proxy)
 
     join_str = "," if grouping_cols is None else "JOIN"
     using_str = "" if grouping_cols is None else "USING (" + grouping_cols + 
")"
@@ -1247,7 +1252,8 @@ def _create_summary_table(**kwargs):
                 {n_rows_skipped}::integer       AS total_rows_skipped,
                 {dep_list_str}::text            AS dependent_var_levels,
                 '{dep_type}'::text              AS dependent_var_type,
-                '{indep_type}'::text            AS independent_var_types
+                '{indep_type}'::text            AS independent_var_types,
+                '{null_proxy}'::text            AS null_proxy
         """.format(**kwargs)
     plpy.notice("sql_create_summary_table:\n" + sql_create_summary_table)
     plpy.execute(sql_create_summary_table)

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
index f263cf9..d9ae9bf 100644
--- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in
@@ -34,7 +34,7 @@ constructed using bootstrapped samples from the input data. The results of these
 models are then combined to yield a single prediction, which, at the
 expense of some loss in interpretation, has been found to be highly accurate.
 
-Please also refer to the decision tree user documentation for 
+Please also refer to the decision tree user documentation for
 information relevant to the implementation of random forests in MADlib.
 
 @anchor train
@@ -56,7 +56,7 @@ forest_train(training_table_name,
              min_split,
              min_bucket,
              num_splits,
-             surrogate_params,
+             null_handling_params,
              verbose,
              sample_ratio
              )
@@ -314,7 +314,7 @@ forest_train(training_table_name,
   <DT>importance (optional)</DT>
   <DD>boolean, default: true. Whether or not to calculate variable importance.
   If set to true, variable importance for categorical and continuous features
-  will be output in the group table <em>\<model_table\>_group</em> described 
+  will be output in the group table <em>\<model_table\>_group</em> described
   above.  Will increase run time when variable importance is turned on. </DD>
 
   <DT>num_permutations (optional)</DT>
@@ -331,7 +331,7 @@ forest_train(training_table_name,
   <DT>max_tree_depth (optional)</DT>
   <DD>integer, default: 7. Maximum depth of any node of a tree,
       with the root node counted as depth 0. A deeper tree can
-      lead to better prediction but will also result in 
+      lead to better prediction but will also result in
       longer processing time and higher memory usage.</DD>
 
   <DT>min_split (optional)</DT>
@@ -350,13 +350,27 @@ forest_train(training_table_name,
       Higher number of bins will lead to better prediction,
      but will also result in longer processing time and higher memory usage.</DD>
 
-  <DT>surrogate_params (optional)</DT>
-  <DD>text, Comma-separated string of key-value pairs controlling the behavior
-  of surrogate splits for each node in a tree.
+  <DT>null_handling_params</DT>
+  <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior
+  of various features handling missing values.
     <table class='output'>
     <tr>
       <th>max_surrogates</th>
       <td>Default: 0. Number of surrogates to store for each node.</td>
+      One of the approaches of handling NULLs is to use surrogate splits for 
each
+      node. A surrogate variable is another predictor variable that is 
associated
+      (correlated) with the primary split variable. The surrogate
+      variable comes into use when the primary predictior value is NULL.
+    </tr>
+    <tr>
+    <th>null_as_special_cat</th>
+    <td>Default: FALSE. Whether to treat NULL as a special categorical value.
+
+    If this is set to TRUE, NULL values are considered a categorical
+    value and placed at the end of the ordering of categorical levels. Placing 
it
+    at the end ensures that NULL is never used as a value to split a node on.
+    This parameter is ignored for continuous-valued features.
+    </td>
     </tr>
     </table>
   </DD>
@@ -371,10 +385,10 @@ forest_train(training_table_name,
     is close to 0 may result in trees with only the root node.
     This allows users to experiment with the function in a speedy fashion.</DD>
 </DL>
-    @note The main parameters that affect memory usage are: depth of 
-    tree (‘max_tree_depth’), number of features, number of values per 
-    categorical feature, and number of bins for continuous features (‘num_splits’). 
-    If you are hitting memory limits, consider reducing one or 
+    @note The main parameters that affect memory usage are: depth of
+    tree (‘max_tree_depth’), number of features, number of values per
+    categorical feature, and number of bins for continuous features (‘num_splits’).
+    If you are hitting memory limits, consider reducing one or
     more of these parameters.
 
 @anchor predict
@@ -460,8 +474,8 @@ are NULL, then the majority branch is used to compute the 
split for a tuple.
 
     <DT>verbose (optional)</DT>
     <DD>boolean, default = FALSE. If true, the dot format output will contain
-    additional information (impurity, sample size, number of weighted rows for 
-    each response variable, classification or prediction if the tree was 
+    additional information (impurity, sample size, number of weighted rows for
+    each response variable, classification or prediction if the tree was
     pruned at this level)</DD>
 </DL>
 
@@ -539,7 +553,7 @@ dependent_varname     | class
 independent_varnames  | "OUTLOOK",windy,temperature,humidity
 cat_features          | "OUTLOOK",windy
 con_features          | temperature,humidity
-grouping_cols         | 
+grouping_cols         |
 num_trees             | 20
 num_random_features   | 2
 max_tree_depth        | 8
@@ -581,21 +595,21 @@ SELECT madlib.get_tree('train_output',1,2);
 </pre>
 Result:
 <pre class="result">
- digraph "Classification tree for dt_golf" {                 
- "0" [label="humidity <= 75", shape=ellipse];                
- "0" -> "1"[label="yes"];                                    
- "1" [label="\"Play\"",shape=box];                           
- "0" -> "2"[label="no"];                                     
- "2" [label="humidity <= 80", shape=ellipse];                
- "2" -> "5"[label="yes"];                                    
- "5" [label="\"Don't Play\"",shape=box];                     
- "2" -> "6"[label="no"];                                     
+ digraph "Classification tree for dt_golf" {
+ "0" [label="humidity <= 75", shape=ellipse];
+ "0" -> "1"[label="yes"];
+ "1" [label="\"Play\"",shape=box];
+ "0" -> "2"[label="no"];
+ "2" [label="humidity <= 80", shape=ellipse];
+ "2" -> "5"[label="yes"];
+ "5" [label="\"Don't Play\"",shape=box];
+ "2" -> "6"[label="no"];
  "6" [label="\"OUTLOOK\" in {overcast,rain}", shape=ellipse];
- "6" -> "13"[label="yes"];                                   
- "13" [label="\"Play\"",shape=box];                          
- "6" -> "14"[label="no"];                                    
- "14" [label="\"Don't Play\"",shape=box];                                      
                                         
- } //---end of digraph--------- 
+ "6" -> "13"[label="yes"];
+ "13" [label="\"Play\"",shape=box];
+ "6" -> "14"[label="no"];
+ "14" [label="\"Don't Play\"",shape=box];
+ } //---end of digraph---------
 </pre>
 
 -# Obtain a text display of the tree:
@@ -613,13 +627,13 @@ second indented child (2i+2) is the 'False' node.
 &nbsp;- Number of (weighted) rows for each response variable inside [].
 &nbsp;- Order of values = ['"Don\'t Play"', '"Play"']
 &nbsp;-------------------------------------
- (0)[ 4 10]  humidity <= 75                                               
-    (1)[0 7]  * --> "Play"                                                
-    (2)[4 3]  humidity <= 80                                              
-       (5)[3 1]  * --> "Don't Play"                                       
-       (6)[1 2]  "OUTLOOK" in {overcast,rain}                             
-          (13)[0 2]  * --> "Play"                                        
-          (14)[1 0]  * --> "Don't Play"                                   
+ (0)[ 4 10]  humidity <= 75
+    (1)[0 7]  * --> "Play"
+    (2)[4 3]  humidity <= 80
+       (5)[3 1]  * --> "Don't Play"
+       (6)[1 2]  "OUTLOOK" in {overcast,rain}
+          (13)[0 2]  * --> "Play"
+          (14)[1 0]  * --> "Don't Play"
 &nbsp;-------------------------------------
 </pre>
 
@@ -637,7 +651,7 @@ ORDER BY id;
 </pre>
 Result:
 <pre class="result">
-  id | estimated_class |   class    
+  id | estimated_class |   class
 ----+-----------------+------------
   1 | Don't Play      | Don't Play
   2 | Don't Play      | Don't Play
@@ -670,7 +684,7 @@ ORDER BY id;
 </pre>
 Result:
 <pre class="result">
- id | estimated_prob_Play |   class    
+ id | estimated_prob_Play |   class
 ----+---------------------+------------
   1 |                0.05 | Don't Play
   2 |                0.15 | Don't Play
@@ -917,7 +931,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
     min_split                   INTEGER,
     min_bucket                  INTEGER,
     num_splits                  INTEGER,
-    surrogate_params            TEXT,
+    null_handling_params        TEXT,
     verbose                     BOOLEAN,
     sample_ratio                DOUBLE PRECISION
 ) RETURNS VOID AS $$
@@ -939,7 +953,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
         min_split,
         min_bucket,
         num_splits,
-        surrogate_params,
+        null_handling_params,
         verbose,
         sample_ratio
     )
@@ -1179,7 +1193,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
     min_split                   INTEGER,
     min_bucket                  INTEGER,
     num_splits                  INTEGER,
-    surrogate_params            TEXT,
+    null_handling_params        TEXT,
     verbose                     BOOLEAN
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
@@ -1204,7 +1218,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train(
     min_split                   integer,
     min_bucket                  integer,
     num_splits                  integer,
-    surrogate_params            TEXT
+    null_handling_params        TEXT
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8,
             $9, $10, $11, $12, $13, $14, $15, $16, FALSE::BOOLEAN);

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index b135108..93b9a52 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -344,7 +344,7 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          2::integer,        -- min bucket
                          8::integer,        -- number of bins per continuous variable
                          'cp=0.01, n_folds=5',
-                         'max_surrogates=2'
+                         'max_surrogates=2, null_as_category=True'
                          );
 
 SELECT _print_decision_tree(tree) from train_output;
