Repository: madlib
Updated Branches:
  refs/heads/master 7acc591d5 -> 8bbad2df6


DT: Ensure n_folds and null_proxy are set correctly

Changes:

- The summary table in Decision Tree included two entries: k and
null_proxy. The 'k' value is supposed to reflect the 'n_folds' value but
was not set correctly and was always NULL. The null_proxy was supposed
to be NULL when null_as_category=False but was set to the string 'NULL'.
Both these issues have been corrected in this commit.

- The null_proxy parameter was being reset in _build_tree, a
sub-function that is called by cross validation. This commit fixes the
problem by adding an additional parameter to the function.

- Further, a scenario where no features exist (because categorical
columns with a single level are dropped) led to the database crashing.
This is now caught and an error is thrown instead of the crash.

Closes #236


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/8bbad2df
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/8bbad2df
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/8bbad2df

Branch: refs/heads/master
Commit: 8bbad2df68f40442918fb72dc3acaabb8b9add45
Parents: 7acc591
Author: Rahul Iyer <ri...@apache.org>
Authored: Fri Feb 23 17:22:26 2018 -0800
Committer: Rahul Iyer <ri...@apache.org>
Committed: Fri Feb 23 17:22:26 2018 -0800

----------------------------------------------------------------------
 .../recursive_partitioning/decision_tree.py_in  | 50 +++++++++++---------
 .../recursive_partitioning/decision_tree.sql_in | 11 +++--
 .../test/decision_tree.sql_in                   |  1 +
 .../modules/validation/cross_validation.py_in   |  1 -
 4 files changed, 34 insertions(+), 29 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/8bbad2df/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in 
b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 811d5fc..133ce1d 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -45,10 +45,10 @@ def _tree_validate_args(
     """
     if not split_criterion:
         split_criterion = 'gini'
-    if (split_criterion.lower().strip() not in
-            ['mse', 'gini', 'cross-entropy', 'entropy',
-             'misclass', 'misclassification']):
-        plpy.error("Decision tree error: Invalid split_criterion.")
+    _assert(split_criterion.lower().strip() in ['mse', 'gini', 'cross-entropy',
+                                                'entropy', 'misclass',
+                                                'misclassification'],
+            "Decision tree error: Invalid split_criterion.")
 
     _assert(training_table_name and
             training_table_name.strip().lower() not in ('null', ''),
@@ -265,6 +265,8 @@ def _get_tree_states(schema_madlib, is_classification, 
split_criterion,
                          dep_n_levels, filter_dep, null_proxy)
         # some features may be dropped if they have only one value
         cat_features = bins['cat_features']
+        if not cat_features and not con_features:
+            plpy.error("Decision tree: None of the input features are valid")
 
         # 4) Run tree train till the training is finished
         #  finished: 0 = running, 1 = finished training, 2 = terminated 
prematurely
@@ -291,6 +293,9 @@ def _get_tree_states(schema_madlib, is_classification, 
split_criterion,
                                       is_classification, dep_n_levels,
                                       filter_dep, null_proxy)
                 cat_features = bins['cat_features']
+                if not cat_features and not con_features:
+                    plpy.error("Decision tree: None of the input features "
+                               "are valid for some groups")
 
                 # 3b) Load each group's tree state in memory and set to the 
initial tree
                 tree_states = _tree_train_grps_using_bins(**locals())
@@ -350,8 +355,8 @@ def _build_tree(schema_madlib, is_classification, 
split_criterion,
                 cat_features, ordered_cat_features,
                 boolean_cats, con_features, grouping_cols,
                 weights, max_depth, min_split, min_bucket, n_bins,
-                cp_table, max_n_surr=0, null_as_category=False,
-                msg_level="warning", k=0, **kwargs):
+                cp_table, max_n_surr=0, null_proxy=None,
+                msg_level="warning", n_folds=0, **kwargs):
 
     compute_cp_list = False
     if grouping_cols:
@@ -502,18 +507,18 @@ def _create_output_tables(schema_madlib, 
training_table_name, output_table_name,
                           id_col_name, dependent_variable, list_of_features,
                           is_classification, n_all_rows, n_rows, dep_list, cp,
                           all_cols_types, grouping_cols=None,
-                          use_existing_tables=False, running_cv=False, k=0,
-                          null_proxy=None, **kwargs):
+                          use_existing_tables=False, running_cv=False,
+                          n_folds=0, null_proxy=None, **kwargs):
     if not grouping_cols:
         _create_result_table(schema_madlib, tree_states[0],
                              bins['cat_origin'], bins['cat_n'], cat_features,
                              con_features, output_table_name,
-                             use_existing_tables, running_cv, k)
+                             use_existing_tables, running_cv, n_folds)
     else:
         _create_grp_result_table(
             schema_madlib, tree_states, bins, cat_features,
             con_features, output_table_name, grouping_cols, 
training_table_name,
-            use_existing_tables, running_cv, k)
+            use_existing_tables, running_cv, n_folds)
 
     failed_groups = sum(row['finished'] != 1 for row in tree_states)
     _create_summary_table(
@@ -522,7 +527,7 @@ def _create_output_tables(schema_madlib, 
training_table_name, output_table_name,
         dependent_variable, list_of_features,
         failed_groups, is_classification, n_all_rows,
         n_rows, dep_list, all_cols_types, cp, grouping_cols, 1,
-        use_existing_tables, running_cv, k, null_proxy)
+        use_existing_tables, n_folds, null_proxy)
 # -------------------------------------------------------------------------
 
 
@@ -721,11 +726,11 @@ def _get_bins(schema_madlib, training_table_name,
         all_levels = plpy.execute(sql_all_cats)
 
         if len(all_levels) != len(cat_features):
-            plpy.warning("Decision tree warning: Categorical columns with only 
"
-                         "one value are dropped from the tree model.")
             use_cat_features = [row['colname'] for row in all_levels]
             cat_features = [feature for feature in cat_features
                             if feature in use_cat_features]
+            plpy.warning("Decision tree warning: Categorical columns with only 
"
+                         "one value are dropped from the tree model.")
 
         col_to_row = dict((row['colname'], i) for i, row in 
enumerate(all_levels))
 
@@ -1440,7 +1445,7 @@ def _create_summary_table(
         cat_features, con_features, dependent_variable, list_of_features,
         num_failed_groups, is_classification, n_all_rows, n_rows,
         dep_list, all_cols_types, cp, grouping_cols=None, n_groups=1,
-        use_existing_tables=False, running_cv=False, k=0, null_proxy=None):
+        use_existing_tables=False, n_folds=0, null_proxy=None):
 
     # dependent variables
     if dep_list:
@@ -1463,7 +1468,6 @@ def _create_summary_table(
         cp_str = py_list_to_sql_string(cp, 'double precision')
     else:
         cp_str = str(cp) + "::double precision"
-    fold = k if running_cv else "NULL"
 
     if use_existing_tables:
         header = "INSERT INTO {0} ".format(
@@ -1471,7 +1475,7 @@ def _create_summary_table(
     else:
         header = "CREATE TABLE {0} AS ".format(
             add_postfix(output_table_name, "_summary"))
-    null_proxy_str="NULL" if null_proxy is None else null_proxy
+    null_proxy_str="NULL" if null_proxy is None else "'{0}'".format(null_proxy)
     sql = header + """
             SELECT
                 'tree_train'::text             AS method,
@@ -1492,10 +1496,9 @@ def _create_summary_table(
                 '{dep_type}'::text             AS dependent_var_type,
                 {cp_str}                       AS input_cp,
                 '{indep_type}'::text           AS independent_var_types,
-                {fold}::integer                AS k,
-                '{null_proxy_str}'::text       AS null_proxy
-
-        """.format(**locals())
+                {n_folds}::integer             AS n_folds,
+                {null_proxy_str}::text         AS null_proxy
+            """.format(**locals())
     plpy.execute(sql)
 # ------------------------------------------------------------
 
@@ -1864,7 +1867,7 @@ def _xvalidate(schema_madlib, tree_states, 
training_table_name, output_table_nam
                split_criterion, grouping_cols, weights, max_depth,
                min_split, min_bucket, n_bins, is_classification,
                dep_is_bool, dep_n_levels, n_folds, n_rows,
-               max_n_surr, msg_level='warning', **kwargs):
+               max_n_surr, null_proxy, msg_level='warning', **kwargs):
     """
     Run cross validation for decision tree over multiple cp values
 
@@ -2079,11 +2082,12 @@ def _get_xvalidate_params(**kwargs):
                        quoted_args['n_bins'],
                        "%explore%",
                        quoted_args['max_n_surr'],
-                       quoted_args['msg_level']
+                       quoted_args['msg_level'],
+                       quoted_args['null_proxy']
                        ]
     modeling_param_types = (["BOOLEAN"] + ["TEXT"] * 5 + ["BOOLEAN"] + 
["TEXT"] +
                             ["VARCHAR[]"] * 4 + ["TEXT"] * 2 + ["INTEGER"] * 4 
+
-                            ["TEXT", "SMALLINT", "TEXT"])
+                            ["TEXT", "SMALLINT", "TEXT", "TEXT"])
     predict_params = ["%model%", "%data%", "%prediction%", "'response'", 
"True"]
     predict_param_types = (["VARCHAR"] * 4 + ["BOOLEAN"])
     metric_params = ["%data%",

http://git-wip-us.apache.org/repos/asf/madlib/blob/8bbad2df/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index eb0e760..e21f3f2 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -87,13 +87,13 @@ tree_train(
   <DT>list_of_features</DT>
   <DD>TEXT. Comma-separated string of column names or expressions to use as 
predictors.
   Can also be a '*' implying all columns are to be used as predictors (except 
for the
-  ones included in the next argument that lists exclusions). 
-  The types of the features can be mixed - boolean, integer, and text columns 
+  ones included in the next argument that lists exclusions).
+  The types of the features can be mixed - boolean, integer, and text columns
   are considered categorical and
   double precision columns are considered continuous. Categorical variables
-  are not encoded and used as is for the training. 
+  are not encoded and used as is for the training.
 
-  Array columns can also be included in the list, where the array is expanded 
+  Array columns can also be included in the list, where the array is expanded
   to treat each element of the array as a feature.
 
   It is important to note that not every combination of the levels of a
@@ -1141,7 +1141,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.__build_tree(
     cp_table              TEXT,
     max_n_surr            SMALLINT,
     msg_level             TEXT,
-    k                     INTEGER)
+    null_proxy            TEXT,
+    n_folds               INTEGER)
 RETURNS VOID AS $$
 PythonFunction(recursive_partitioning, decision_tree, _build_tree)
 $$ LANGUAGE plpythonu

http://git-wip-us.apache.org/repos/asf/madlib/blob/8bbad2df/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in 
b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index 387d627..e891ed7 100644
--- 
a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ 
b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -396,6 +396,7 @@ select __build_tree(
     'group_cp',
     0::smallint,
     'notice',
+    NULL,
     0
     );
 

http://git-wip-us.apache.org/repos/asf/madlib/blob/8bbad2df/src/ports/postgres/modules/validation/cross_validation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/cross_validation.py_in 
b/src/ports/postgres/modules/validation/cross_validation.py_in
index 3b3dfe1..1be5b86 100644
--- a/src/ports/postgres/modules/validation/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/cross_validation.py_in
@@ -122,7 +122,6 @@ def __cv_funcall_general(func, params, params_type, 
tbl_data, col_random_id,
         param_explored, explore_value, tbl_input, tbl_output,
         add_param_quotes=add_param_quotes)
     sql = "SELECT {func}({arg_string})".format(func=func, 
arg_string=arg_string)
-    plpy.info(sql)
     plpy.execute(sql)
 # ------------------------------------------------------------------------
 

Reply via email to