DTree: Update defaults for max_depth, num_splits

Reduce the default for max_depth from 10 to 7 and for num_splits from 100
to 20 to decrease the chance of running out of memory when initializing the
tree for problems with many features or with features that have many
categorical values.

Closes #117


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/3eec0a82
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/3eec0a82
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/3eec0a82

Branch: refs/heads/latest_release
Commit: 3eec0a82ee522101264c6557457602f9e0dbee52
Parents: 8faf622
Author: Rahul Iyer <ri...@apache.org>
Authored: Tue Apr 18 11:53:36 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Tue Apr 18 17:19:54 2017 -0700

----------------------------------------------------------------------
 .../recursive_partitioning/decision_tree.py_in  | 23 ++++----
 .../recursive_partitioning/decision_tree.sql_in | 59 +++++++++++---------
 .../test/decision_tree.sql_in                   |  2 +-
 3 files changed, 46 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3eec0a82/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index fb18278..f7c4bd8 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -223,7 +223,7 @@ SELECT {schema_madlib}.tree_train(
                                 is NULL
     'weights',              -- A Column name containing weights for
                                 each observation. Default is NULL
-    max_depth,              -- Maximum depth of any node, default is 10
+    max_depth,              -- Maximum depth of any node, default is 7
     min_split,              -- Minimum number of observations that must
                                 exist in a node for a split to be
                                 attemped, default is 20
@@ -231,7 +231,7 @@ SELECT {schema_madlib}.tree_train(
                                 terminal node, default is min_split/3
     n_bins,                 -- Number of bins to find possible node
                                 split threshold values for continuous
-                                variables, default is 100 (Must be greater than 1)
+                                variables, default is 20 (Must be greater than 1)
     pruning_params,         -- A comma-separated text containing
                                 key=value pairs of parameters for pruning.
                                 Parameters accepted:
@@ -341,7 +341,6 @@ def _extract_pruning_params(pruning_params_str):
         @param pruning_param: str, Parameters used for pruning the tree
                                     cp = Cost-complexity for pruning
                                    n_folds = Number of folds for cross-validation
-
     Returns:
         dict. A dictionary containing the pruning parameters
     """
@@ -567,17 +566,21 @@ def tree_train(schema_madlib, training_table_name, output_table_name,
     """
     msg_level = "notice" if verbose_mode else "warning"
 
-    # Set default values for optional arguments
-    min_split = 20 if (min_split is None and min_bucket is None) else min_split
-    min_bucket = min_split // 3 if min_bucket is None else min_bucket
-    min_split = min_bucket * 3 if min_split is None else min_split
-    n_bins = 100 if n_bins is None else n_bins
+    # Set default values for all arguments
     split_criterion = 'gini' if not split_criterion else split_criterion
-    plpy.notice("split_criterion:" + split_criterion)
+    max_depth = 7 if max_depth is None else max_depth
+    if min_split is None and min_bucket is None:
+        min_split = 20
+        min_bucket = 6
+    else:
+        min_bucket = min_split // 3 if min_bucket is None else min_bucket
+        min_split = min_bucket * 3 if min_split is None else min_split
+    n_bins = 20 if n_bins is None else n_bins
+
+    # defaults for cp and n_folds set within _extract_pruning_params
     pruning_param_dict = _extract_pruning_params(pruning_params)
     cp = pruning_param_dict['cp']
     n_folds = pruning_param_dict['n_folds']
-
     surrogate_param_dict = extract_keyvalue_params(surrogate_params,
                                                    dict(max_surrogates=int),
                                                    dict(max_surrogates=0))

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3eec0a82/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
index 97e8471..ef671fc 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in
@@ -258,7 +258,7 @@ tree_train(
   <DD>TEXT. Column name containing weights for each observation.</DD>
 
   <DT>max_depth (optional)</DT>
-  <DD>INTEGER, default: 10. Maximum depth of any node of the final tree,
+  <DD>INTEGER, default: 7. Maximum depth of any node of the final tree,
       with the root node counted as depth 0.</DD>
 
   <DT>min_split (optional)</DT>
@@ -272,7 +272,7 @@ tree_train(
       set to min_bucket*3 or min_bucket to min_split/3, as appropriate.</DD>
 
   <DT>num_splits (optional)</DT>
-  <DD>INTEGER, default: 100. Continuous-valued features are binned into
+  <DD>INTEGER, default: 20. Continuous-valued features are binned into
       discrete quantiles to compute split boundaries. This global parameter
       is used to compute the resolution of splits for continuous features.
       Higher number of bins will lead to better prediction,
@@ -920,7 +920,7 @@ File decision_tree.sql_in documenting the training function
   *        multiple decision trees, one for each group.
   * @param weights OPTIONAL. Column name containing weights for
   *        each observation.
-  * @param max_depth OPTIONAL (Default = 10). Set the maximum depth
+  * @param max_depth OPTIONAL (Default = 7). Set the maximum depth
   *        of any node of the final tree, with the root node counted
   *        as depth 0.
   * @param min_split OPTIONAL (Default = 20). Minimum number of
@@ -931,13 +931,13 @@ File decision_tree.sql_in documenting the training function
   *        one of minbucket or minsplit is specified, minsplit
   *        is set to minbucket*3 or minbucket to minsplit/3, as
   *        appropriate.
-  * @param n_bins optional (default = 100) number of bins to use
+  * @param n_bins optional (default = 20) number of bins to use
   *        during binning. continuous-valued features are binned
   *        into discrete bins (per the quartile values) to compute
   *        split bound- aries. this global parameter is used to
   *        compute the resolution of the bins. higher number of
   *        bins will lead to higher processing time.
-  * @param pruning_params (default = 'cp=0.01') pruning parameter string
+  * @param pruning_params (default: cp=0) pruning parameter string
   *         containing key-value pairs.
   *        the keys can be:
   *             cp (default = 0.01) a complexity parameter
@@ -1574,8 +1574,10 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     pruning_params              TEXT,
     surrogate_params            TEXT
 ) RETURNS VOID AS $$
+    -- verbose = false
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
                                     $11, $12, $13, $14, $15, FALSE);
+
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1596,7 +1598,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     pruning_params              TEXT
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
-                                    $11, $12, $13, $14, 'max_surrogates=0', FALSE);
+                                    $11, $12, $13, $14, NULL::text, FALSE);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1616,8 +1618,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     n_bins                      INTEGER
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
-                                    $11, $12, $13, 'cp=0.01'::TEXT,
-                                    'max_surrogates=0', FALSE::BOOLEAN);
+                                    $11, $12, $13, NULL::TEXT,
+                                    NULL::TEXT, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1635,8 +1637,9 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     min_split                   INTEGER,
     min_bucket                  INTEGER
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12,
-        100::INTEGER, 'cp=0.01'::TEXT, 'max_surrogates=0', FALSE::BOOLEAN);
+    SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
+                                    $11, $12, NULL::INTEGER, NULL::TEXT,
+                                    NULL::TEXT, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1654,8 +1657,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     min_split                   INTEGER
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11,
-        ($11/3)::INTEGER, 100::INTEGER, 'cp=0.01'::TEXT, 'max_surrogates=0',
-        FALSE::BOOLEAN);
+                                    NULL::INTEGER, NULL::INTEGER, NULL::TEXT,
+                                    NULL::TEXT, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1672,8 +1675,8 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     max_depth                   INTEGER
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10,
-        20::INTEGER, 6::INTEGER, 100::INTEGER, 'cp=0.01'::TEXT,
-        'max_surrogates=0', FALSE::BOOLEAN);
+                                    NULL::INTEGER, NULL::INTEGER, NULL::INTEGER,
+                                    NULL::TEXT, NULL::TEXT, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1689,8 +1692,9 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     weights                     TEXT
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9,
-        10::INTEGER, 20::INTEGER, 6::INTEGER, 100::INTEGER,
-        'cp=0.01'::TEXT, 'max_surrogates=0', FALSE::BOOLEAN);
+                                    NULL::INTEGER, NULL::INTEGER, NULL::INTEGER,
+                                    NULL::INTEGER, NULL::TEXT, NULL::TEXT,
+                                    FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1705,8 +1709,9 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     grouping_cols               TEXT
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8,
-        NULL::TEXT, 10::INTEGER, 20::INTEGER, 6::INTEGER, 100::INTEGER,
-        'cp=0.01'::TEXT, 'max_surrogates=0', FALSE::BOOLEAN);
+        NULL::TEXT, NULL::INTEGER, NULL::INTEGER,
+        NULL::INTEGER, NULL::INTEGER, NULL::TEXT, NULL::TEXT,
+        FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1720,9 +1725,9 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     split_criterion             TEXT
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7,
-        NULL::TEXT, NULL::TEXT, 10::INTEGER, 20::INTEGER,
-        6::INTEGER, 100::INTEGER, 'cp=0.01'::TEXT,
-        'max_surrogates=0', FALSE::BOOLEAN);
+        NULL::TEXT, NULL::TEXT, NULL::INTEGER, NULL::INTEGER,
+        NULL::INTEGER, NULL::INTEGER, NULL::TEXT,
+        NULL::TEXT, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1735,9 +1740,9 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     list_of_features_to_exclude TEXT
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6,
-        'gini'::TEXT, NULL::TEXT, NULL::TEXT, 10::INTEGER,
-        20::INTEGER, 6::INTEGER, 100::INTEGER, 'cp=0.01'::TEXT,
-        'max_surrogates=0', FALSE::BOOLEAN);
+        NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::INTEGER,
+        NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::TEXT,
+        NULL::TEXT, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1749,9 +1754,9 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train(
     list_of_features            TEXT
 ) RETURNS VOID AS $$
     SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5,
-        NULL::TEXT, 'gini'::TEXT, NULL::TEXT, NULL::TEXT,
-        10::INTEGER, 20::INTEGER, 6::INTEGER, 100::INTEGER,
-        'cp=0.01'::TEXT, 'max_surrogates=0', FALSE::BOOLEAN);
+        NULL::TEXT, NULL::TEXT, NULL::TEXT, NULL::TEXT,
+        NULL::INTEGER, NULL::INTEGER, NULL::INTEGER, NULL::INTEGER,
+        NULL::TEXT, NULL::text, FALSE::BOOLEAN);
 $$ LANGUAGE SQL VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 -- -------------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/3eec0a82/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
index 1863b64..28a4647 100644
--- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
+++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in
@@ -325,7 +325,7 @@ SELECT tree_train('dt_golf'::text,         -- source table
                          'mse'::text,      -- split criterion
                          NULL::text,        -- no grouping
                          NULL::text,        -- no weights
-                         10::integer,       -- max depth
+                         NULL::integer,     -- max depth
                          6::integer,        -- min split
                          2::integer,        -- min bucket
                          8::integer,        -- number of bins per continuous variable

Reply via email to