Repository: incubator-madlib Updated Branches: refs/heads/master a1f980336 -> a2f474007
DT: Add option to treat NULL as category This commit adds an option to treat NULL as a level in the categorical feature. The level is added as a string (instead of a NULL value) to ensure MADlib arrays don't have NULLs in them during the binning procedure. Closes #156 Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/a2f47400 Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/a2f47400 Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/a2f47400 Branch: refs/heads/master Commit: a2f47400705da784f9a727d8b926da34c84465a2 Parents: a1f9803 Author: Rahul Iyer <ri...@apache.org> Authored: Tue Jul 18 14:04:37 2017 -0700 Committer: Rahul Iyer <ri...@apache.org> Committed: Wed Aug 2 20:52:27 2017 -0700 ---------------------------------------------------------------------- src/modules/recursive_partitioning/DT_impl.hpp | 2 +- .../recursive_partitioning/decision_tree.py_in | 187 +++++++++++++------ .../recursive_partitioning/decision_tree.sql_in | 108 ++++++++++- .../recursive_partitioning/random_forest.py_in | 38 ++-- .../recursive_partitioning/random_forest.sql_in | 96 ++++++---- .../test/decision_tree.sql_in | 2 +- 6 files changed, 309 insertions(+), 124 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/modules/recursive_partitioning/DT_impl.hpp ---------------------------------------------------------------------- diff --git a/src/modules/recursive_partitioning/DT_impl.hpp b/src/modules/recursive_partitioning/DT_impl.hpp index 27bc647..808e48c 100644 --- a/src/modules/recursive_partitioning/DT_impl.hpp +++ b/src/modules/recursive_partitioning/DT_impl.hpp @@ -1181,7 +1181,7 @@ DecisionTree<Container>::displayInternalNode( label_str << escape_quotes(feature_name) << " <= " << feature_thresholds(id); } else { feature_name = get_text(cat_features_str, feature_indices(id)); - label_str << escape_quotes(feature_name) << " = "; + label_str << escape_quotes(feature_name) << " <= "; // Text for all categoricals are stored in a flat array (cat_levels_text); // find the appropriate index for this node http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in index 7b724c1..d82936a 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in @@ -246,11 +246,13 @@ SELECT {schema_madlib}.tree_train( 'cp' - complexity parameter with default=0.01, 'n_folds' - number of cross-validation folds with default value of 0 (= no cross-validation) - surrogate_params, -- A comma-separated text containing - key=value pairs of parameters for computing surrogate splits. + null_handling_params, -- A comma-separated text containing + key=value pairs of parameters for handling NULL values. 
Parameters accepted: 'max_surrogates' - Maximum number of surrogates to compute for each split + 'null_as_category' - Boolean to indicate if + NULL should be treated as a special category verbose -- Boolean, whether to print more info, default is False ); @@ -291,6 +293,9 @@ The output summary table ('output_table_summary') has the following columns: input_cp -- The complexity parameter (cp) used for pruning the trained tree(s) (before cross-validation is run) independent_var_types -- The types of independent variables, comma-separated + k -- Number of folds (NULL if not using cross validation) + null_proxy -- String used as replacement for NULL values + (NULL if null_as_category = False) """ elif message.lower().strip() in ['example', 'examples']: @@ -371,7 +376,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, con_features, n_bins, boolean_cats, min_split, min_bucket, weights, max_depth, grp_key_to_cp, compute_cp_list=False, - max_n_surr=0, **kwargs): + max_n_surr=0, null_proxy=None, **kwargs): """ Args: grp_key_to_cp : Dictionary, mapping from group key to the cp value @@ -419,7 +424,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, bins = _get_bins(schema_madlib, training_table_name, cat_features, ordered_cat_features, con_features, n_bins, dep_var_str, boolean_cats, n_rows, is_classification, - dep_n_levels, filter_dep) + dep_n_levels, filter_dep, null_proxy) # some features may be dropped if they have only one value cat_features = bins['cat_features'] @@ -446,7 +451,7 @@ def _get_tree_states(schema_madlib, is_classification, split_criterion, boolean_cats, grouping_cols, grouping_array_str, n_rows, is_classification, dep_n_levels, - filter_dep) + filter_dep, null_proxy) cat_features = bins['cat_features'] # 3b) Load each group's tree state in memory and set to the initial tree @@ -507,7 +512,8 @@ def _build_tree(schema_madlib, is_classification, split_criterion, cat_features, ordered_cat_features, boolean_cats, con_features, grouping_cols, weights, max_depth, min_split, min_bucket, n_bins, - cp_table, max_n_surr=0, msg_level="warning", k=0, **kwargs): + cp_table, max_n_surr=0, null_as_category=False, + msg_level="warning", k=0, **kwargs): compute_cp_list = False if grouping_cols: @@ -545,7 +551,7 @@ def tree_train(schema_madlib, training_table_name, output_table_name, list_of_features_to_exclude, split_criterion, grouping_cols, weights, max_depth, min_split, min_bucket, n_bins, pruning_params, - surrogate_params, verbose_mode, **kwargs): + null_handling_params, verbose_mode, **kwargs): """ Decision tree main training function Args: @@ -589,10 +595,18 @@ def tree_train(schema_madlib, training_table_name, output_table_name, pruning_param_dict = _extract_pruning_params(pruning_params) cp = pruning_param_dict['cp'] n_folds = pruning_param_dict['n_folds'] - surrogate_param_dict = extract_keyvalue_params(surrogate_params, - dict(max_surrogates=int), - dict(max_surrogates=0)) - max_n_surr = surrogate_param_dict['max_surrogates'] + + # null handling parameters: max_n_surr and null_as_category + null_handling_dict = extract_keyvalue_params( + null_handling_params, + dict(max_surrogates=int, null_as_category=bool), + dict(max_surrogates=0, null_as_category=False)) + max_n_surr = null_handling_dict['max_surrogates'] + null_as_category = null_handling_dict['null_as_category'] + null_proxy = "__NULL__" if null_as_category else None + if null_as_category: + # can't have two ways of handling tuples with NULL values + max_n_surr = 0
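[Editor's note: a minimal, self-contained sketch of how the null_handling_params string above is interpreted. The real parser is MADlib's extract_keyvalue_params helper; parse_null_handling below is a hypothetical stand-in, written only to make the override behavior (null_as_category disabling surrogates) concrete.]

    # Hypothetical stand-in for extract_keyvalue_params; illustrative only.
    def parse_null_handling(params_str):
        parsed = {'max_surrogates': 0, 'null_as_category': False}
        for pair in (params_str or '').split(','):
            key, _, value = pair.partition('=')
            key, value = key.strip(), value.strip()
            if key == 'max_surrogates':
                parsed['max_surrogates'] = int(value)
            elif key == 'null_as_category':
                parsed['null_as_category'] = (value.lower() == 'true')
        # Surrogate splits and NULL-as-category are two competing ways of
        # routing tuples with NULL features, so the latter disables the former.
        if parsed['null_as_category']:
            parsed['max_surrogates'] = 0
        return parsed

    # parse_null_handling('max_surrogates=2, null_as_category=True')
    # -> {'max_surrogates': 0, 'null_as_category': True}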
_assert(max_n_surr >= 0, "Maximum number of surrogates ({0}) should be non-negative". format(max_n_surr)) @@ -650,7 +664,8 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name, id_col_name, dependent_variable, is_classification, n_all_rows, n_rows, dep_list, cp, all_cols_types, grouping_cols=None, - use_existing_tables=False, running_cv=False, k=0, **kwargs): + use_existing_tables=False, running_cv=False, k=0, + null_proxy=None, **kwargs): if not grouping_cols: _create_result_table(schema_madlib, tree_states[0], bins['cat_origin'], bins['cat_n'], cat_features, @@ -667,8 +682,8 @@ def _create_output_tables(schema_madlib, training_table_name, output_table_name, schema_madlib, split_criterion, training_table_name, output_table_name, id_col_name, cat_features, con_features, dependent_variable, failed_groups, is_classification, n_all_rows, - n_rows, dep_list, all_cols_types, cp, grouping_cols, 1, use_existing_tables, - running_cv, k) + n_rows, dep_list, all_cols_types, cp, grouping_cols, 1, + use_existing_tables, running_cv, k, null_proxy) # ------------------------------------------------------------------------- @@ -715,7 +730,8 @@ def _is_dep_categorical(training_table_name, dependent_variable): def _get_bins(schema_madlib, training_table_name, cat_features, ordered_cat_features, con_features, n_bins, dependent_variable, boolean_cats, n_rows, - is_classification, dep_n_levels, filter_null): + is_classification, dep_n_levels, filter_null_dep, + null_proxy=None): """ Compute the bins of all features @param training_table_name Data source table @@ -760,7 +776,7 @@ def _get_bins(schema_madlib, training_table_name, FROM ( SELECT *, random() AS rand FROM {training_table_name} - WHERE {filter_null} + WHERE {filter_null_dep} AND not {schema_madlib}.array_contains_null({con_features_str}) ) subq WHERE rand <= {percentage} @@ -808,10 +824,24 @@ def _get_bins(schema_madlib, training_table_name, # For regressions order_fun = "AVG({0})".format(dependent_variable) - # Note that 'sql_cat_levels' goes through two levels of formatting + # Note that 'sql_cat_levels' goes through two levels of string formatting # Try to obtain all the levels in one scan of the table. # () are needed when casting the categorical variables because # they can be expressions. 
+ + filter_str = filter_null_dep + " AND {col} IS NOT NULL" + + if null_proxy is None: + union_null_proxy = "" + else: + union_null_proxy = "UNION " + """ + SELECT '{null_proxy}'::text as levels, + 'Infinity'::double precision as dep_avg + FROM {training_table_name} + WHERE {{col}} IS NULL + GROUP BY {{col}} + """.format(**locals()) + sql_cat_levels = """ SELECT '{{col_name}}'::text AS colname, @@ -825,22 +855,30 @@ ({{col}})::text AS levels, {{order_fun}} AS dep_avg FROM {training_table_name} - WHERE {filter_null} - AND {{col}} is not NULL + WHERE {filter_str} GROUP BY {{col}} + {union_null_proxy} ) s ) s1 WHERE array_upper(levels, 1) > 1 """.format(training_table_name=training_table_name, - filter_null=filter_null) + filter_str=filter_str, + union_null_proxy=union_null_proxy) + + all_col_expressions = {} + for col in cat_features: + if col in boolean_cats: + all_col_expressions[col] = ("(CASE WHEN " + col + + " THEN 'True' ELSE 'False' END)") + else: + all_col_expressions[col] = col sql_all_cats = ' UNION '.join( sql_cat_levels.format( - col="(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)" - if col in boolean_cats else col, - col_name=col, - order_fun=col if col in ordered_cat_features else order_fun) - for col in cat_features) + col=expr, + col_name=col_name, + order_fun=expr if col_name in ordered_cat_features else order_fun) + for col_name, expr in all_col_expressions.items()) all_levels = plpy.execute(sql_all_cats) if len(all_levels) != len(cat_features): @@ -925,7 +963,7 @@ def _get_bins_grps( schema_madlib, training_table_name, cat_features, ordered_cat_features, con_features, n_bins, dependent_variable, boolean_cats, grouping_cols, grouping_array_str, n_rows, is_classification, - dep_n_levels, filter_null): + dep_n_levels, filter_null_dep, null_proxy=None): """ Compute the bins for all features in each group @brief Similar to _get_bins except that this is for multiple groups. @@ -974,14 +1012,14 @@ def _get_bins_grps( ( SELECT * FROM {training_table_name} - WHERE {filter_null} + WHERE {filter_null_dep} AND not {schema_madlib}.array_contains_null({con_features_str}) ) src JOIN ( SELECT {grouping_cols}, count(*) AS {grp_size_str} FROM {training_table_name} - WHERE {filter_null} + WHERE {filter_null_dep} AND not {schema_madlib}.array_contains_null({con_features_str}) GROUP BY {grouping_cols} ) grp_info @@ -1024,6 +1062,18 @@ def _get_bins_grps( else: order_fun = "avg({0})".format(dependent_variable) + filter_str = filter_null_dep + " AND {col} IS NOT NULL" + if null_proxy is None: + union_null_proxy = "" + else: + union_null_proxy = "UNION " + """ + SELECT '{null_proxy}'::text as levels, + 'Infinity'::double precision as dep_avg + FROM {training_table_name} + WHERE {{col}} IS NULL + GROUP BY {{col}} + """.format(**locals()) + sql_cat_levels = """ SELECT colname::text, @@ -1040,8 +1090,7 @@ ({{col}})::text as levels, {{order_fun}} as dep_avg FROM {training_table_name} - WHERE {filter_null} - AND {{col}} is not NULL + WHERE {filter_str} GROUP BY {{col}}, {grouping_cols} ) s GROUP BY grp_key @@ -1049,13 +1098,24 @@ where array_upper(levels, 1) > 1 """.format(**locals()) + all_col_expressions = {} + for col in cat_features: + if col in boolean_cats: + all_col_expressions[col] = ("(CASE WHEN " + col + + " THEN 'True' ELSE 'False' END)") + else: + if null_proxy is not None: + all_col_expressions[col] = ("COALESCE({0}, '{1}')". + format(col, null_proxy)) + else: + all_col_expressions[col] = col + sql_all_cats = ' UNION ALL '.join( sql_cat_levels.format( - col=("(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)" - if col in boolean_cats else col), - col_name=col, - order_fun=col if col in ordered_cat_features else order_fun) - for col in cat_features) + col=expr, + col_name=col_name, + order_fun=expr if col_name in ordered_cat_features else order_fun) + for col_name, expr in all_col_expressions.items()) all_levels = list(plpy.execute(sql_all_cats)) all_levels.sort(key=itemgetter('grp_key')) @@ -1100,9 +1160,16 @@ def get_feature_str(schema_madlib, boolean_cats, cat_features, con_features, - levels_str, n_levels_str): + levels_str, n_levels_str, + null_proxy=None): if len(cat_features) > 0: - unique_val = unique_string() + # null_val is the replacement for NULL in a categorical feature. If a + # null_proxy is set then the proxy is used to assign NULL as a valid + # category. If no proxy is available then NULL is replaced with a unique + # value. In a later step, the categorical levels are mapped to integers + # (1 to N). The unique value will be mapped to -1 indicating an + # unknown/missing value in the underlying layers. + null_val = unique_string() if null_proxy is None else null_proxy cat_features_cast = [] for col in cat_features: if col in boolean_cats: @@ -1110,8 +1177,7 @@ def get_feature_str(schema_madlib, boolean_cats, "(CASE WHEN " + col + " THEN 'True' ELSE 'False' END)::text") else: cat_features_cast.append( - "(coalesce(" + col + "::text,'{0}')".format(unique_val) + - ")::text") + "(coalesce({0}::text, '{1}'))::text".format(col, null_val)) cat_features_str = ("{0}._map_catlevel_to_int(array[" + ", ".join(cat_features_cast) + "], {1}, {2})" @@ -1132,7 +1198,8 @@ def _one_step(schema_madlib, training_table_name, cat_features, con_features, boolean_cats, bins, n_bins, tree_state, weights, dep_var, min_split, min_bucket, max_depth, filter_null, - dep_n_levels, subsample, n_random_features, max_n_surr=0): + dep_n_levels, subsample, n_random_features, + max_n_surr=0, null_proxy=None): """ One step of tree training @param tree_state A big double precision array that contains @@ -1147,7 +1214,8 @@ def _one_step(schema_madlib, training_table_name, cat_features, boolean_cats, cat_features, con_features, - "$3", "$2") + "$3", "$2", + null_proxy) # The arguments of the aggregate (in the same order): # 1.
current tree state, madlib.bytea8 @@ -1229,7 +1297,7 @@ def _one_step_for_grps( con_features, boolean_cats, bins, n_bins, tree_states, weights, grouping_cols, grouping_array_str, dep_var, min_split, min_bucket, max_depth, filter_null, dep_n_levels, subsample, n_random_features, - max_n_surr=0): + max_n_surr=0, null_proxy=None): """ One step of trees training with grouping support """ # The function _map_catlevel_to_int maps a categorical variable value to its @@ -1247,7 +1315,7 @@ def _one_step_for_grps( cat_features_str, con_features_str = get_feature_str( schema_madlib, boolean_cats, cat_features, con_features, - cat_levels_in_text, cat_n_levels) + cat_levels_in_text, cat_n_levels, null_proxy) train_apply_func = """ {schema_madlib}._dt_apply( @@ -1530,7 +1598,8 @@ def _create_summary_table( cat_features, con_features, dependent_variable, num_failed_groups, is_classification, n_all_rows, n_rows, dep_list, all_cols_types, cp, grouping_cols=None, n_groups=1, - use_existing_tables=False, running_cv=False, k=0): + use_existing_tables=False, running_cv=False, k=0, null_proxy=None): + # dependent variables features = ', '.join(cat_features + con_features) if dep_list: @@ -1553,14 +1622,15 @@ def _create_summary_table( cp_str = py_list_to_sql_string(cp, 'double precision') else: cp_str = str(cp) + "::double precision" + fold = k if running_cv else "NULL" - fold = ", " + str(k) + " as k" if running_cv else "" if use_existing_tables: - # plpy.execute("truncate " + output_table_name + "_summary") - header = "insert into " + add_postfix(output_table_name, "_summary") + " " + header = "INSERT INTO {0} ".format( + add_postfix(output_table_name, "_summary")) else: - header = "create table " + add_postfix(output_table_name, "_summary") + " as " - + header = "CREATE TABLE {0} AS ".format( + add_postfix(output_table_name, "_summary")) + null_proxy_str="NULL" if null_proxy is None else null_proxy sql = header + """ SELECT 'tree_train'::text AS method, @@ -1580,8 +1650,10 @@ def _create_summary_table( {dep_list_str}::text AS dependent_var_levels, '{dep_type}'::text AS dependent_var_type, {cp_str} AS input_cp, - '{indep_type}'::text AS independent_var_types - {fold} + '{indep_type}'::text AS independent_var_types, + {fold}::integer AS k, + '{null_proxy_str}'::text AS null_proxy + """.format(**locals()) plpy.execute(sql) # ------------------------------------------------------------ @@ -1683,6 +1755,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', dep_levels = summary_elements["dependent_var_levels"] is_classification = summary_elements["is_classification"] dep_type = summary_elements['dependent_var_type'] + null_proxy = summary_elements['null_proxy'] # find which columns are of type boolean boolean_cats = set([key for key, value in get_cols_and_types(source) @@ -1690,7 +1763,7 @@ def tree_predict(schema_madlib, model, source, output, pred_type='response', cat_features_str, con_features_str = get_feature_str( schema_madlib, boolean_cats, cat_features, con_features, - "m.cat_levels_in_text", "m.cat_n_levels") + "m.cat_levels_in_text", "m.cat_n_levels", null_proxy) if use_existing_tables and table_exists(output): plpy.execute("truncate " + output) @@ -2258,7 +2331,8 @@ def _tree_train_using_bins( cat_features, con_features, boolean_cats, n_bins, weights, dep_var_str, min_split, min_bucket, max_depth, filter_dep, dep_n_levels, is_classification, split_criterion, - subsample=False, n_random_features=1, max_n_surr=0, **kwargs): + subsample=False, n_random_features=1, max_n_surr=0, 
null_proxy=None, + **kwargs): """Trains a tree without grouping columns""" # Iterations for training the tree tree_state = plpy.execute( @@ -2284,7 +2358,7 @@ def _tree_train_using_bins( cat_features, con_features, boolean_cats, bins, n_bins, tree_state, weights, dep_var_str, min_split, min_bucket, max_depth, filter_dep, - dep_n_levels, subsample, n_random_features, max_n_surr) + dep_n_levels, subsample, n_random_features, max_n_surr, null_proxy) plpy.notice("Completed training of level {0}".format(tree_depth)) return tree_state @@ -2296,7 +2370,8 @@ def _tree_train_grps_using_bins( boolean_cats, n_bins, weights, grouping_cols, grouping_array_str, dep_var_str, min_split, min_bucket, max_depth, filter_dep, dep_n_levels, is_classification, split_criterion, subsample=False, - n_random_features=1, tree_terminated=None, max_n_surr=0, **kwargs): + n_random_features=1, tree_terminated=None, max_n_surr=0, null_proxy=None, + **kwargs): """Trains a tree with grouping columns included """ # Iterations for training the tree @@ -2340,7 +2415,7 @@ def _tree_train_grps_using_bins( tree_states, weights, grouping_cols, grouping_array_str, dep_var_str, min_split, min_bucket, max_depth, filter_dep, dep_n_levels, subsample, - n_random_features, max_n_surr) + n_random_features, max_n_surr, null_proxy) level += 1 plpy.notice("Finished training for level " + str(level)) http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in index e8f37f8..8290dcf 100644 --- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.sql_in @@ -54,7 +54,7 @@ tree_train( min_bucket, num_splits, pruning_params, - surrogate_params, + null_handling_params, verbosity ) </pre> @@ -186,16 +186,27 @@ tree_train( </table> </DD> - <DT>surrogate_params (optional)</DT> + <DT>null_handling_params (optional)</DT> <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior - of surrogate splits for each node. A surrogate variable is another predictor - variable that is associated (correlated) with the primary predictor variable - for a split. The surrogate variable comes into use when the primary predictior - value is NULL. This parameter currently accepts one argument: + of the various mechanisms for handling missing values. <table class='output'> <tr> <th>max_surrogates</th> <td>Default: 0. Number of surrogates to store for each node. + One approach to handling NULLs is to use surrogate splits for each + node. A surrogate variable is another predictor variable that is associated + (correlated) with the primary split variable. The surrogate + variable comes into use when the primary predictor value is NULL.</td> + </tr> + <tr> + <th>null_as_category</th> + <td>Default: FALSE. Whether to treat NULL as a special categorical value. + + If this is set to TRUE, NULL values are treated as a single categorical value and + placed at the end of the ordering of categorical levels. Placing it at the + end ensures that NULL is never used as a value to split a node on. This + parameter is ignored for continuous-valued features.
+ </td> </tr> </table> </DD> @@ -901,6 +912,85 @@ Result: (32 rows) </pre>
+<h4>NULL handling example</h4>
+
+- Create toy example to illustrate null-as-category handling
+<pre class='example'>
+DROP TABLE IF EXISTS null_handling_example;
+CREATE TABLE null_handling_example (
+id integer,
+country text,
+city text,
+weather text,
+response text
+);
+
+INSERT INTO null_handling_example VALUES
+(1,NULL,NULL,NULL,'a'),
+(2,'US',NULL,NULL,'b'),
+(3,'US','NY',NULL,'c'),
+(4,'US','NY','rainy','d');
+
+DROP TABLE IF EXISTS train_output, train_output_summary;
+SELECT madlib.tree_train('null_handling_example', -- source table
+ 'train_output', -- output model table
+ 'id', -- id column
+ 'response', -- dependent variable
+ 'country, city, weather', -- features
+ NULL, -- features to exclude
+ 'gini', -- split criterion
+ NULL::text, -- no grouping
+ NULL::text, -- no weights
+ 4, -- max depth
+ 1, -- min split
+ 1, -- min bucket
+ 10, -- number of splits
+ NULL, -- pruning parameters
+ 'null_as_category=true' -- null handling
+ );
+SELECT cat_levels_in_text, cat_n_levels FROM train_output;
+</pre>
+<pre class='result'>
+ cat_levels_in_text | cat_n_levels
+------------------------------------------+--------------
+ {US,__NULL__,rainy,__NULL__,NY,__NULL__} | {2,2,2}
+</pre>
+
+- Predict for previously unseen data by assuming NULL as the default value
+<pre class='example'>
+DROP TABLE IF EXISTS table_test;
+CREATE TABLE table_test (
+id integer,
+country text,
+city text,
+weather text,
+expected_response text
+);
+INSERT INTO table_test VALUES
+(1,'IN','MUM','cloudy','a'),
+(2,'US','HOU','humid','b'),
+(3,'US','NY','sunny','c'),
+(4,'US','NY','rainy','d');
+
+DROP TABLE IF EXISTS prediction_results;
+SELECT madlib.tree_predict('train_output',
+ 'table_test',
+ 'prediction_results',
+ 'response');
+SELECT s.id, expected_response, estimated_response
+FROM prediction_results p, table_test s
+WHERE s.id = p.id ORDER BY id;
+</pre>
+<pre class='result'>
+ id | expected_response | estimated_response
+----+-------------------+--------------------
+ 1 | a | a
+ 2 | b | b
+ 3 | c | c
+ 4 | d | d
+(4 rows)
+</pre>
+ @anchor literature @literature [1] Breiman, Leo; Friedman, J. H.; Olshen, R. A.; Stone, C. J. (1984). Classification and regression trees. Monterey, CA: Wadsworth & Brooks/Cole Advanced Books & Software.
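[Editor's note: to make the cat_levels_in_text / cat_n_levels output above concrete, here is a small Python model of the level-to-integer mapping described in the code comments. The real _map_catlevel_to_int is a SQL-callable MADlib function; this standalone version and its name are only a sketch of the documented contract.]

    def map_catlevels_to_int(values, levels_flat, n_levels):
        # Each categorical value is replaced by its index within its
        # feature's slice of the flat level array; values not found
        # (e.g. the unique placeholder used when null_as_category=False)
        # map to -1, meaning unknown/missing.
        result, offset = [], 0
        for value, n in zip(values, n_levels):
            feature_levels = levels_flat[offset:offset + n]
            result.append(feature_levels.index(value)
                          if value in feature_levels else -1)
            offset += n
        return result

    levels = ['US', '__NULL__', 'rainy', '__NULL__', 'NY', '__NULL__']
    assert map_catlevels_to_int(['US', '__NULL__', 'NY'],
                                levels, [2, 2, 2]) == [0, 1, 0]

Because '__NULL__' is appended after the ordered levels of each feature, it always receives the highest index, which is what keeps it from being chosen as a split value.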
@@ -995,7 +1085,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( min_bucket INTEGER, n_bins INTEGER, pruning_params TEXT, - surrogate_params TEXT, + null_handling_params TEXT, verbose_mode BOOLEAN ) RETURNS VOID AS $$ PythonFunctionBodyOnly(recursive_partitioning, decision_tree) @@ -1004,7 +1094,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( id_col_name, dependent_variable, list_of_features, list_of_features_to_exclude, split_criterion, grouping_cols, weights, max_depth, min_split, min_bucket, n_bins, pruning_params, - surrogate_params, verbose_mode) + null_handling_params, verbose_mode) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); ------------------------------------------------------------ @@ -1602,7 +1692,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.tree_train( min_bucket INTEGER, n_bins INTEGER, pruning_params TEXT, - surrogate_params TEXT + null_handling_params TEXT ) RETURNS VOID AS $$ -- verbose = false SELECT MADLIB_SCHEMA.tree_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in index 05c029e..bc4bd97 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.py_in @@ -237,7 +237,7 @@ def forest_train( grouping_cols, num_trees, num_random_features, importance, num_permutations, max_tree_depth, min_split, min_bucket, num_bins, - surrogate_params, verbose=False, sample_ratio=None, **kwargs): + null_handling_params, verbose=False, sample_ratio=None, **kwargs): """ Random forest main training function Args: @@ -283,15 +283,19 @@ def forest_train( num_bins = 100 if num_bins is None else num_bins sample_ratio = 1 if sample_ratio is None else sample_ratio - surrogate_param_dict = extract_keyvalue_params( - surrogate_params, - dict(max_surrogates=int), # type of variable - dict(max_surrogates=0)) # default value of variable - max_n_surr = surrogate_param_dict['max_surrogates'] + null_handling_dict = extract_keyvalue_params( + null_handling_params, + dict(max_surrogates=int, null_as_category=bool), + dict(max_surrogates=0, null_as_category=False)) + max_n_surr = null_handling_dict['max_surrogates'] + null_as_category = null_handling_dict['null_as_category'] + null_proxy = "__NULL__" if null_as_category else None + if null_as_category: + # can't have two ways of handling tuples with NULL values + max_n_surr = 0 _assert(max_n_surr >= 0, "Maximum number of surrogates ({0}) should be non-negative". 
format(max_n_surr)) - ################################################################## # validate arguments _forest_validate_args(training_table_name, output_table_name, id_col_name, @@ -383,7 +387,7 @@ def forest_train( cat_features, ordered_cat_features, con_features, num_bins, dep, boolean_cats, n_rows, is_classification, - dep_n_levels, filter_null) + dep_n_levels, filter_null, null_proxy) # some features may be dropped because they have only one value cat_features = bins['cat_features'] bins['grp_key_cat'] = [''] @@ -421,7 +425,7 @@ def forest_train( con_features, num_bins, dep, boolean_cats, grouping_cols, grouping_array_str, n_rows, - is_classification, dep_n_levels, filter_null) + is_classification, dep_n_levels, filter_null, null_proxy) cat_features = bins['cat_features'] # a table for converting cat_features to integers @@ -564,7 +568,7 @@ def forest_train( boolean_cats, num_bins, 'poisson_count', dep, min_split, min_bucket, max_tree_depth, filter_null, dep_n_levels, is_classification, split_criterion, True, - num_random_features, max_n_surr) + num_random_features, max_n_surr, null_proxy) tree_states = [dict(tree_state=tree_state['tree_state'], grp_key='')] @@ -578,7 +582,7 @@ def forest_train( max_tree_depth, filter_null, dep_n_levels, is_classification, split_criterion, True, num_random_features, tree_terminated=tree_terminated, - max_n_surr=max_n_surr) + max_n_surr=max_n_surr, null_proxy=null_proxy) # If a tree for a group is terminated (not finished properly), # then we do not need to compute other trees, and can just @@ -603,7 +607,7 @@ def forest_train( sample_id, id_col_name, cat_features, con_features, boolean_cats, grouping_cols, grp_key_to_grp_cols, dep, num_permutations, is_classification, importance, - num_bins, filter_null) + num_bins, filter_null, null_proxy) ################################################################### # evaluating and summarizing random forest @@ -695,6 +699,7 @@ def forest_predict(schema_madlib, model, source, output, pred_type='response', dep_levels = summary_elements["dependent_var_levels"] is_classification = summary_elements["is_classification"] dep_type = summary_elements['dependent_var_type'] + null_proxy = summary_elements['null_proxy'] # pred_type='prob' is allowed only for classification _assert(is_classification or pred_type == 'response', @@ -706,7 +711,7 @@ def forest_predict(schema_madlib, model, source, output, pred_type='response', cat_features_str, con_features_str = get_feature_str( schema_madlib, boolean_cats, cat_features, con_features, - "cat_levels_in_text", "cat_n_levels") + "cat_levels_in_text", "cat_n_levels", null_proxy) pred_name = ('"prob_{0}"' if pred_type == "prob" else '"estimated_{0}"').format(dep_varname.replace('"', '').strip()) @@ -929,12 +934,12 @@ def _calculate_oob_prediction( schema_madlib, model_table, cat_features_info_table, con_splits_table, oob_prediction_table, oob_view, sample_id, id_col_name, cat_features, con_features, boolean_cats, grouping_cols, grp_key_to_grp_cols, dep, - num_permutations, is_classification, importance, num_bins, filter_null): + num_permutations, is_classification, importance, num_bins, filter_null, null_proxy=None): """Calculate prediction for out-of-bag sample""" cat_features_str, con_features_str = get_feature_str( schema_madlib, boolean_cats, cat_features, con_features, - "cat_levels_in_text", "cat_n_levels") + "cat_levels_in_text", "cat_n_levels", null_proxy) join_str = "," if grouping_cols is None else "JOIN" using_str = "" if grouping_cols is None else "USING (" +
grouping_cols + ")" @@ -1247,7 +1252,8 @@ def _create_summary_table(**kwargs): {n_rows_skipped}::integer AS total_rows_skipped, {dep_list_str}::text AS dependent_var_levels, '{dep_type}'::text AS dependent_var_type, - '{indep_type}'::text AS independent_var_types + '{indep_type}'::text AS independent_var_types, + '{null_proxy}'::text AS null_proxy """.format(**kwargs) plpy.notice("sql_create_summary_table:\n" + sql_create_summary_table) plpy.execute(sql_create_summary_table) http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in index f263cf9..d9ae9bf 100644 --- a/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/random_forest.sql_in @@ -34,7 +34,7 @@ constructed using bootstrapped samples from the input data. The results of these models are then combined to yield a single prediction, which, at the expense of some loss in interpretation, have been found to be highly accurate. -Please also refer to the decision tree user documentation for +Please also refer to the decision tree user documentation for information relevant to the implementation of random forests in MADlib. @anchor train @@ -56,7 +56,7 @@ forest_train(training_table_name, min_split, min_bucket, num_splits, - surrogate_params, + null_handling_params, verbose, sample_ratio ) @@ -314,7 +314,7 @@ forest_train(training_table_name, <DT>importance (optional)</DT> <DD>boolean, default: true. Whether or not to calculate variable importance. If set to true, variable importance for categorical and continuous features - will be output in the group table <em>\<model_table\>_group</em> described + will be output in the group table <em>\<model_table\>_group</em> described above. Will increase run time when variable importance is turned on. </DD> <DT>num_permutations (optional)</DT> @@ -331,7 +331,7 @@ forest_train(training_table_name, <DT>max_tree_depth (optional)</DT> <DD>integer, default: 7. Maximum depth of any node of a tree, with the root node counted as depth 0. A deeper tree can - lead to better prediction but will also result in + lead to better prediction but will also result in longer processing time and higher memory usage.</DD> <DT>min_split (optional)</DT> @@ -350,13 +350,27 @@ forest_train(training_table_name, Higher number of bins will lead to better prediction, but will also result in longer processing time and higher memory usage.</DD> - <DT>surrogate_params (optional)</DT> - <DD>text, Comma-separated string of key-value pairs controlling the behavior - of surrogate splits for each node in a tree. + <DT>null_handling_params</DT> + <DD>TEXT. Comma-separated string of key-value pairs controlling the behavior + of various features handling missing values. <table class='output'> <tr> <th>max_surrogates</th> <td>Default: 0. Number of surrogates to store for each node.</td> + One of the approaches of handling NULLs is to use surrogate splits for each + node. A surrogate variable is another predictor variable that is associated + (correlated) with the primary split variable. The surrogate + variable comes into use when the primary predictior value is NULL. + </tr> + <tr> + <th>null_as_special_cat</th> + <td>Default: FALSE. 
Whether to treat NULL as a special categorical value. + + If this is set to TRUE, NULL values are treated as a single categorical + value and placed at the end of the ordering of categorical levels. Placing it + at the end ensures that NULL is never used as a value to split a node on. + This parameter is ignored for continuous-valued features. + </td> </tr> </table> </DD> @@ -371,10 +385,10 @@ forest_train(training_table_name, is close to 0 may result in trees with only the root node. This allows users to experiment with the function in a speedy fashion.</DD> </DL> - @note The main parameters that affect memory usage are: depth of - tree (“max_tree_depth”), number of features, number of values per - categorical feature, and number of bins for continuous features (“num_splits”). - If you are hitting memory limits, consider reducing one or + @note The main parameters that affect memory usage are: depth of + tree (“max_tree_depth”), number of features, number of values per + categorical feature, and number of bins for continuous features (“num_splits”). + If you are hitting memory limits, consider reducing one or more of these parameters. @anchor predict @@ -460,8 +474,8 @@ are NULL, then the majority branch is used to compute the split for a tuple. <DT>verbose (optional)</DT> <DD>boolean, default = FALSE. If true, the dot format output will contain - additional information (impurity, sample size, number of weighted rows for - each response variable, classification or prediction if the tree was + additional information (impurity, sample size, number of weighted rows for + each response variable, classification or prediction if the tree was pruned at this level)</DD> </DL> @@ -539,7 +553,7 @@ dependent_varname | class independent_varnames | "OUTLOOK",windy,temperature,humidity cat_features | "OUTLOOK",windy con_features | temperature,humidity -grouping_cols | +grouping_cols | num_trees | 20 num_random_features | 2 max_tree_depth | 8 @@ -581,21 +595,21 @@ SELECT madlib.get_tree('train_output',1,2); </pre> Result: <pre class="result"> - digraph "Classification tree for dt_golf" { - "0" [label="humidity <= 75", shape=ellipse]; - "0" -> "1"[label="yes"]; - "1" [label="\"Play\"",shape=box]; - "0" -> "2"[label="no"]; - "2" [label="humidity <= 80", shape=ellipse]; - "2" -> "5"[label="yes"]; - "5" [label="\"Don't Play\"",shape=box]; - "2" -> "6"[label="no"]; + digraph "Classification tree for dt_golf" { + "0" [label="humidity <= 75", shape=ellipse]; + "0" -> "1"[label="yes"]; + "1" [label="\"Play\"",shape=box]; + "0" -> "2"[label="no"]; + "2" [label="humidity <= 80", shape=ellipse]; + "2" -> "5"[label="yes"]; + "5" [label="\"Don't Play\"",shape=box]; + "2" -> "6"[label="no"]; "6" [label="\"OUTLOOK\" in {overcast,rain}", shape=ellipse]; - "6" -> "13"[label="yes"]; - "13" [label="\"Play\"",shape=box]; - "6" -> "14"[label="no"]; - "14" [label="\"Don't Play\"",shape=box]; - } //---end of digraph--------- + "6" -> "13"[label="yes"]; + "13" [label="\"Play\"",shape=box]; + "6" -> "14"[label="no"]; + "14" [label="\"Don't Play\"",shape=box]; + } //---end of digraph--------- </pre> -# Obtain a text display of the tree: @@ -613,13 +627,13 @@ second indented child (2i+2) is the 'False' node. - Number of (weighted) rows for each response variable inside [].
- Order of values = ['"Don\'t Play"', '"Play"'] ------------------------------------- - (0)[ 4 10] humidity <= 75 - (1)[0 7] * --> "Play" - (2)[4 3] humidity <= 80 - (5)[3 1] * --> "Don't Play" - (6)[1 2] "OUTLOOK" in {overcast,rain} - (13)[0 2] * --> "Play" - (14)[1 0] * --> "Don't Play" + (0)[ 4 10] humidity <= 75 + (1)[0 7] * --> "Play" + (2)[4 3] humidity <= 80 + (5)[3 1] * --> "Don't Play" + (6)[1 2] "OUTLOOK" in {overcast,rain} + (13)[0 2] * --> "Play" + (14)[1 0] * --> "Don't Play" ------------------------------------- </pre> @@ -637,7 +651,7 @@ ORDER BY id; </pre> Result: <pre class="result"> - id | estimated_class | class + id | estimated_class | class ----+-----------------+------------ 1 | Don't Play | Don't Play 2 | Don't Play | Don't Play @@ -670,7 +684,7 @@ ORDER BY id; </pre> Result: <pre class="result"> - id | estimated_prob_Play | class + id | estimated_prob_Play | class ----+---------------------+------------ 1 | 0.05 | Don't Play 2 | 0.15 | Don't Play @@ -917,7 +931,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( min_split INTEGER, min_bucket INTEGER, num_splits INTEGER, - surrogate_params TEXT, + null_handling_params TEXT, verbose BOOLEAN, sample_ratio DOUBLE PRECISION ) RETURNS VOID AS $$ @@ -939,7 +953,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( min_split, min_bucket, num_splits, - surrogate_params, + null_handling_params, verbose, sample_ratio ) @@ -1179,7 +1193,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( min_split INTEGER, min_bucket INTEGER, num_splits INTEGER, - surrogate_params TEXT, + null_handling_params TEXT, verbose BOOLEAN ) RETURNS VOID AS $$ SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, @@ -1204,7 +1218,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.forest_train( min_split integer, min_bucket integer, num_splits integer, - surrogate_params TEXT + null_handling_params TEXT ) RETURNS VOID AS $$ SELECT MADLIB_SCHEMA.forest_train($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, FALSE::BOOLEAN); http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/a2f47400/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in ---------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in index b135108..93b9a52 100644 --- a/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in +++ b/src/ports/postgres/modules/recursive_partitioning/test/decision_tree.sql_in @@ -344,7 +344,7 @@ SELECT tree_train('dt_golf'::text, -- source table 2::integer, -- min bucket 8::integer, -- number of bins per continuous variable 'cp=0.01, n_folds=5', - 'max_surrogates=2' + 'max_surrogates=2, null_as_category=True' ); SELECT _print_decision_tree(tree) from train_output;
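[Editor's note: for completeness, the COALESCE/CASE rewriting that the binning code and the updated test above rely on can be summarized in a short sketch. build_cat_expression is a hypothetical helper name, not a MADlib function; the quoting follows the corrected COALESCE call earlier in the diff.]

    def build_cat_expression(col, boolean_cats, null_proxy=None):
        # Boolean categoricals are rendered as explicit 'True'/'False' text;
        # with null_as_category on, NULLs are folded into the proxy level so
        # the binning SQL never sees a NULL inside a MADlib array.
        if col in boolean_cats:
            return "(CASE WHEN {0} THEN 'True' ELSE 'False' END)".format(col)
        if null_proxy is not None:
            return "COALESCE({0}, '{1}')".format(col, null_proxy)
        return col

    # build_cat_expression('weather', set(), '__NULL__')
    # -> "COALESCE(weather, '__NULL__')"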