[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...
Github user asfgit closed the pull request at: https://github.com/apache/madlib/pull/295 ---
[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/295#discussion_r204593622 --- Diff: src/ports/postgres/modules/recursive_partitioning/random_forest.py_in --- @@ -1564,69 +1578,69 @@ def get_var_importance(schema_madlib, model_table, output_table, **kwargs): """ # Validate parameters summary_table = add_postfix(model_table, "_summary") -_validate_var_importance_input(model_table, summary_table, output_table) -grouping_cols = plpy.execute( -"SELECT grouping_cols FROM {summary_table}". -format(**locals()))[0]['grouping_cols'] -grouping_cols_comma = '' -if grouping_cols : -grouping_cols_comma = add_postfix(grouping_cols,", ") -is_RF = _is_model_for_RF(summary_table) +_validate_var_importance_input(model_table, + summary_table, + output_table) +is_RF = _is_random_forest_model(summary_table) + +# 'importance_model_table' is the table containing variable importance values. +# For RF, it is placed in _group as opposed to +# for DT. +importance_model_table = (model_table if not is_RF else + add_postfix(model_table, "_group")) +grouping_cols = plpy.execute("SELECT grouping_cols FROM {summary_table}". + format(**locals()))[0]['grouping_cols'] +if grouping_cols: +grouping_cols_comma = add_postfix(grouping_cols, ", ") +else: +grouping_cols_comma = '' +is_impurity_imp_col_present = _is_impurity_importance_in_model( +importance_model_table, summary_table, is_RF=is_RF) + +# convert importance to percentages +normalization_target = 100.0 + +def _unnest_normalize(input_array_str): +return (""" +unnest({0}.normalize_sum_array({1}::double precision[], + {2}::double precision)) +""".format(schema_madlib, input_array_str, normalization_target)) + if is_RF: -group_table = add_postfix(model_table, "_group") -is_impurity_imp_col_present = _is_impurity_importance_in_group_table( -group_table, summary_table) -# The group table for >= 1.15 RF models should have a column named -# impurity_var_importance if it was learnt with importance param True. -# So set is_pre_1_15_RF_model to False if the column exists, and to -# True if the column does not exist. -is_pre_1_15_RF_model = False if is_impurity_imp_col_present else True - -# Query to add oob variable importance for categorical vars -if is_pre_1_15_RF_model: -# In < 1.15 RF model, the variable importance was captured using two -# different columns named 'cat_var_importance' -plpy.execute( -""" CREATE TABLE {output_table} AS --- Add oob variable importance for categorical vars -SELECT {grouping_cols_comma} -unnest(regexp_split_to_array(cat_features, ',')) AS feature, -unnest(cat_var_importance) AS var_importance -FROM {group_table}, {summary_table} -UNION --- Add oob variable importance for continuous vars -SELECT {grouping_cols_comma} -unnest(regexp_split_to_array(con_features, ',')) AS feature, -unnest(con_var_importance) AS var_importance -FROM {group_table}, {summary_table} -""".format(**locals())) +if is_impurity_imp_col_present: +# In versions >= 1.15, the OOB variable importance is captured +# in a single column: 'oob_var_importance'. +oob_var_importance_str = ( +"{0} AS oob_var_importance,". +format(_unnest_normalize('oob_var_importance'))) +impurity_var_importance_str = ( +"{0} AS impurity_var_importance". +format(_unnest_normalize('impurity_var_importance'))) else: -# In >= 1.15 RF models, the variable importance and impurity -# importance scores are captured in columns 'oob_var_importance' -# and 'impurity_var_importance' respectively. -plpy.execute( -""" CREATE TABLE {output_table} AS -SELECT {grouping_cols_comma} -unnest(regexp_split_to_array(independent_varnames, ','))
[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...
Github user iyerr3 commented on a diff in the pull request: https://github.com/apache/madlib/pull/295#discussion_r203812338 --- Diff: src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in --- @@ -2327,6 +2328,110 @@ def _tree_error(schema_madlib, source_table, dependent_varname, plpy.execute(sql) # +def _validate_var_importance_input(model_table, summary_table, output_table): +_assert(table_exists(model_table), +"Recursive Partitioning: Model table does not exist.") +_assert(table_exists(summary_table), +"Recursive Partitioning: Model summary table does not exist.") +_assert(not table_exists(output_table), +"Recursive Partitioning: Output table already exists.") + +def _is_model_for_RF(summary_table): +# Only an RF model (and not DT) would have num_trees column in summary +return columns_exist_in_table(summary_table, ['num_trees']) + +def _is_RF_model_with_imp_pre_1_15(group_table, summary_table): --- End diff -- Since goal of function is to check if `impurity_var_importance` exists in group table, let's name it to reflect that. ---
[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...
Github user iyerr3 commented on a diff in the pull request: https://github.com/apache/madlib/pull/295#discussion_r203811384 --- Diff: src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in --- @@ -2327,6 +2328,110 @@ def _tree_error(schema_madlib, source_table, dependent_varname, plpy.execute(sql) # +def _validate_var_importance_input(model_table, summary_table, output_table): +_assert(table_exists(model_table), +"Recursive Partitioning: Model table does not exist.") +_assert(table_exists(summary_table), +"Recursive Partitioning: Model summary table does not exist.") +_assert(not table_exists(output_table), +"Recursive Partitioning: Output table already exists.") + +def _is_model_for_RF(summary_table): +# Only an RF model (and not DT) would have num_trees column in summary +return columns_exist_in_table(summary_table, ['num_trees']) + +def _is_RF_model_with_imp_pre_1_15(group_table, summary_table): +""" +Check if the RF model is from MADlib < 1.15. The group table for +>= 1.15 RF models should have a column named impurity_var_importance +if it was learnt with importance param True. +""" +_assert(table_exists(group_table), +"Recursive Partitioning: Model group table does not exist.") +# this flag has to be set to true for RF to report importance scores. +isImportance = plpy.execute("SELECT importance FROM {summary_table}". --- End diff -- Our convention is to use snake case for Python (i.e. `is_importance`). I also suggest changing it to `is_importance_set` to make it more explicit. ---
[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...
Github user iyerr3 commented on a diff in the pull request: https://github.com/apache/madlib/pull/295#discussion_r203811750 --- Diff: src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in --- @@ -2327,6 +2328,110 @@ def _tree_error(schema_madlib, source_table, dependent_varname, plpy.execute(sql) # +def _validate_var_importance_input(model_table, summary_table, output_table): +_assert(table_exists(model_table), +"Recursive Partitioning: Model table does not exist.") +_assert(table_exists(summary_table), +"Recursive Partitioning: Model summary table does not exist.") +_assert(not table_exists(output_table), +"Recursive Partitioning: Output table already exists.") + +def _is_model_for_RF(summary_table): +# Only an RF model (and not DT) would have num_trees column in summary +return columns_exist_in_table(summary_table, ['num_trees']) --- End diff -- We've a `method_name` in summary table that makes this clearer. ---
[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...
Github user iyerr3 commented on a diff in the pull request: https://github.com/apache/madlib/pull/295#discussion_r203816091 --- Diff: src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in --- @@ -2327,6 +2328,110 @@ def _tree_error(schema_madlib, source_table, dependent_varname, plpy.execute(sql) # +def _validate_var_importance_input(model_table, summary_table, output_table): +_assert(table_exists(model_table), +"Recursive Partitioning: Model table does not exist.") +_assert(table_exists(summary_table), +"Recursive Partitioning: Model summary table does not exist.") +_assert(not table_exists(output_table), +"Recursive Partitioning: Output table already exists.") + +def _is_model_for_RF(summary_table): +# Only an RF model (and not DT) would have num_trees column in summary +return columns_exist_in_table(summary_table, ['num_trees']) + +def _is_RF_model_with_imp_pre_1_15(group_table, summary_table): +""" +Check if the RF model is from MADlib < 1.15. The group table for +>= 1.15 RF models should have a column named impurity_var_importance +if it was learnt with importance param True. +""" +_assert(table_exists(group_table), +"Recursive Partitioning: Model group table does not exist.") +# this flag has to be set to true for RF to report importance scores. +isImportance = plpy.execute("SELECT importance FROM {summary_table}". +format(**locals()))[0]['importance'] +_assert(isImportance, """Recursive Partitioning: The model does """ ++ """not have the importance information.""") +if columns_exist_in_table(group_table, ['impurity_var_importance']): +# If this column exists, then the RF model is >=1.15. +return False +else: +return True + +def get_var_importance(schema_madlib, model_table, output_table, **kwargs): +""" Create table capturing importance scores for each feature. +For DT, this function will record the impurity importance score if it exists. +For RF, this function will record the oob variable importance and impurity +importance (only for models learnt with 1.15 onwards) for each variable. + +Args: +@param schema_madlib: str, MADlib schema name +@param model_table: str, Model table name +@param output_table: str, Output table name + +""" +# Validate parameters +summary_table = add_postfix(model_table, "_summary") +_validate_var_importance_input(model_table, summary_table, output_table) +grouping_cols = plpy.execute( +"SELECT grouping_cols FROM {summary_table}". +format(**locals()))[0]['grouping_cols'] +grouping_cols_comma = '' +if grouping_cols : +grouping_cols_comma = add_postfix(grouping_cols,", ") +is_RF = _is_model_for_RF(summary_table) +if is_RF: +group_table = add_postfix(model_table, "_group") +is_pre_1_15_RF_model = _is_RF_model_with_imp_pre_1_15( +group_table, summary_table) + +# Query to add oob variable importance for categorical vars +if is_pre_1_15_RF_model: +# In < 1.15 RF model, the variable importance was captured using two +# different columns named 'cat_var_importance' +plpy.execute( +""" CREATE TABLE {output_table} AS +SELECT {grouping_cols_comma} +unnest(regexp_split_to_array(cat_features, ',')) AS feature, +unnest(cat_var_importance) AS var_importance +FROM {group_table}, {summary_table} +""".format(**locals())) --- End diff -- IMO, it's cleaner to see the two queries `UNION`ed together to get the output. ---