[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...

2018-08-01 Thread asfgit
Github user asfgit closed the pull request at:

https://github.com/apache/madlib/pull/295


---


[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...

2018-07-23 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/295#discussion_r204593622
  
--- Diff: 
src/ports/postgres/modules/recursive_partitioning/random_forest.py_in ---
@@ -1564,69 +1578,69 @@ def get_var_importance(schema_madlib, model_table, 
output_table, **kwargs):
 """
 # Validate parameters
 summary_table = add_postfix(model_table, "_summary")
-_validate_var_importance_input(model_table, summary_table, 
output_table)
-grouping_cols = plpy.execute(
-"SELECT grouping_cols FROM {summary_table}".
-format(**locals()))[0]['grouping_cols']
-grouping_cols_comma = ''
-if grouping_cols :
-grouping_cols_comma = add_postfix(grouping_cols,", ")
-is_RF = _is_model_for_RF(summary_table)
+_validate_var_importance_input(model_table,
+   summary_table,
+   output_table)
+is_RF = _is_random_forest_model(summary_table)
+
+# 'importance_model_table' is the table containing variable importance 
values.
+# For RF, it is placed in _group as opposed to 

+# for DT.
+importance_model_table = (model_table if not is_RF else
+  add_postfix(model_table, "_group"))
+grouping_cols = plpy.execute("SELECT grouping_cols FROM 
{summary_table}".
+ format(**locals()))[0]['grouping_cols']
+if grouping_cols:
+grouping_cols_comma = add_postfix(grouping_cols, ", ")
+else:
+grouping_cols_comma = ''
+is_impurity_imp_col_present = _is_impurity_importance_in_model(
+importance_model_table, summary_table, is_RF=is_RF)
+
+# convert importance to percentages
+normalization_target = 100.0
+
+def _unnest_normalize(input_array_str):
+return ("""
+unnest({0}.normalize_sum_array({1}::double precision[],
+   {2}::double precision))
+""".format(schema_madlib, input_array_str, 
normalization_target))
+
 if is_RF:
-group_table = add_postfix(model_table, "_group")
-is_impurity_imp_col_present = 
_is_impurity_importance_in_group_table(
-group_table, summary_table)
-# The group table for >= 1.15 RF models should have a column named
-# impurity_var_importance if it was learnt with importance param 
True.
-# So set is_pre_1_15_RF_model to False if the column exists, and to
-# True if the column does not exist.
-is_pre_1_15_RF_model = False if is_impurity_imp_col_present else 
True
-
-# Query to add oob variable importance for categorical vars
-if is_pre_1_15_RF_model:
-# In < 1.15 RF model, the variable importance was captured 
using two
-# different columns named 'cat_var_importance'
-plpy.execute(
-""" CREATE TABLE {output_table} AS
--- Add oob variable importance for categorical vars
-SELECT {grouping_cols_comma}
-unnest(regexp_split_to_array(cat_features, ',')) 
AS feature,
-unnest(cat_var_importance) AS var_importance
-FROM {group_table}, {summary_table}
-UNION
--- Add oob variable importance for continuous vars
-SELECT {grouping_cols_comma}
-unnest(regexp_split_to_array(con_features, ',')) 
AS feature,
-unnest(con_var_importance) AS var_importance
-FROM {group_table}, {summary_table}
-""".format(**locals()))
+if is_impurity_imp_col_present:
+# In versions >= 1.15, the OOB variable importance is captured
+# in a single column: 'oob_var_importance'.
+oob_var_importance_str = (
+"{0} AS oob_var_importance,".
+format(_unnest_normalize('oob_var_importance')))
+impurity_var_importance_str = (
+"{0} AS impurity_var_importance".
+format(_unnest_normalize('impurity_var_importance')))
 else:
-# In >= 1.15 RF models, the variable importance and impurity
-# importance scores are captured in columns 
'oob_var_importance'
-# and 'impurity_var_importance' respectively.
-plpy.execute(
-""" CREATE TABLE {output_table} AS
-SELECT {grouping_cols_comma}
-unnest(regexp_split_to_array(independent_varnames, 
',')) 

[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...

2018-07-19 Thread iyerr3
Github user iyerr3 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/295#discussion_r203812338
  
--- Diff: 
src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---
@@ -2327,6 +2328,110 @@ def _tree_error(schema_madlib, source_table, 
dependent_varname,
 plpy.execute(sql)
 # 
 
+def _validate_var_importance_input(model_table, summary_table, 
output_table):
+_assert(table_exists(model_table),
+"Recursive Partitioning: Model table does not exist.")
+_assert(table_exists(summary_table),
+"Recursive Partitioning: Model summary table does not exist.")
+_assert(not table_exists(output_table),
+"Recursive Partitioning: Output table already exists.")
+
+def _is_model_for_RF(summary_table):
+# Only an RF model (and not DT) would have num_trees column in summary
+return columns_exist_in_table(summary_table, ['num_trees'])
+
+def _is_RF_model_with_imp_pre_1_15(group_table, summary_table):
--- End diff --

Since goal of function is to check if `impurity_var_importance` exists in 
group table, let's name it to reflect that. 


---


[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...

2018-07-19 Thread iyerr3
Github user iyerr3 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/295#discussion_r203811384
  
--- Diff: 
src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---
@@ -2327,6 +2328,110 @@ def _tree_error(schema_madlib, source_table, 
dependent_varname,
 plpy.execute(sql)
 # 
 
+def _validate_var_importance_input(model_table, summary_table, 
output_table):
+_assert(table_exists(model_table),
+"Recursive Partitioning: Model table does not exist.")
+_assert(table_exists(summary_table),
+"Recursive Partitioning: Model summary table does not exist.")
+_assert(not table_exists(output_table),
+"Recursive Partitioning: Output table already exists.")
+
+def _is_model_for_RF(summary_table):
+# Only an RF model (and not DT) would have num_trees column in summary
+return columns_exist_in_table(summary_table, ['num_trees'])
+
+def _is_RF_model_with_imp_pre_1_15(group_table, summary_table):
+"""
+Check if the RF model is from MADlib < 1.15. The group table for
+>= 1.15 RF models should have a column named 
impurity_var_importance
+if it was learnt with importance param True.
+"""
+_assert(table_exists(group_table),
+"Recursive Partitioning: Model group table does not exist.")
+# this flag has to be set to true for RF to report importance scores.
+isImportance = plpy.execute("SELECT importance FROM {summary_table}".
--- End diff --

Our convention is to use snake case for Python (i.e. `is_importance`). I 
also suggest changing it to `is_importance_set` to make it more explicit. 


---


[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...

2018-07-19 Thread iyerr3
Github user iyerr3 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/295#discussion_r203811750
  
--- Diff: 
src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---
@@ -2327,6 +2328,110 @@ def _tree_error(schema_madlib, source_table, 
dependent_varname,
 plpy.execute(sql)
 # 
 
+def _validate_var_importance_input(model_table, summary_table, 
output_table):
+_assert(table_exists(model_table),
+"Recursive Partitioning: Model table does not exist.")
+_assert(table_exists(summary_table),
+"Recursive Partitioning: Model summary table does not exist.")
+_assert(not table_exists(output_table),
+"Recursive Partitioning: Output table already exists.")
+
+def _is_model_for_RF(summary_table):
+# Only an RF model (and not DT) would have num_trees column in summary
+return columns_exist_in_table(summary_table, ['num_trees'])
--- End diff --

We've a `method_name` in summary table that makes this clearer. 


---


[GitHub] madlib pull request #295: Recursive Partitioning: Add function to report imp...

2018-07-19 Thread iyerr3
Github user iyerr3 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/295#discussion_r203816091
  
--- Diff: 
src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in ---
@@ -2327,6 +2328,110 @@ def _tree_error(schema_madlib, source_table, 
dependent_varname,
 plpy.execute(sql)
 # 
 
+def _validate_var_importance_input(model_table, summary_table, 
output_table):
+_assert(table_exists(model_table),
+"Recursive Partitioning: Model table does not exist.")
+_assert(table_exists(summary_table),
+"Recursive Partitioning: Model summary table does not exist.")
+_assert(not table_exists(output_table),
+"Recursive Partitioning: Output table already exists.")
+
+def _is_model_for_RF(summary_table):
+# Only an RF model (and not DT) would have num_trees column in summary
+return columns_exist_in_table(summary_table, ['num_trees'])
+
+def _is_RF_model_with_imp_pre_1_15(group_table, summary_table):
+"""
+Check if the RF model is from MADlib < 1.15. The group table for
+>= 1.15 RF models should have a column named 
impurity_var_importance
+if it was learnt with importance param True.
+"""
+_assert(table_exists(group_table),
+"Recursive Partitioning: Model group table does not exist.")
+# this flag has to be set to true for RF to report importance scores.
+isImportance = plpy.execute("SELECT importance FROM {summary_table}".
+format(**locals()))[0]['importance']
+_assert(isImportance, """Recursive Partitioning: The model does """
++ """not have the importance information.""")
+if columns_exist_in_table(group_table, ['impurity_var_importance']):
+# If this column exists, then the RF model is >=1.15.
+return False
+else:
+return True
+
+def get_var_importance(schema_madlib, model_table, output_table, **kwargs):
+""" Create table capturing importance scores for each feature.
+For DT, this function will record the impurity importance score if 
it exists.
+For RF, this function will record the oob variable importance and 
impurity
+importance (only for models learnt with 1.15 onwards) for each 
variable.
+
+Args:
+@param schema_madlib: str, MADlib schema name
+@param model_table: str, Model table name
+@param output_table: str, Output table name
+
+"""
+# Validate parameters
+summary_table = add_postfix(model_table, "_summary")
+_validate_var_importance_input(model_table, summary_table, 
output_table)
+grouping_cols = plpy.execute(
+"SELECT grouping_cols FROM {summary_table}".
+format(**locals()))[0]['grouping_cols']
+grouping_cols_comma = ''
+if grouping_cols :
+grouping_cols_comma = add_postfix(grouping_cols,", ")
+is_RF = _is_model_for_RF(summary_table)
+if is_RF:
+group_table = add_postfix(model_table, "_group")
+is_pre_1_15_RF_model = _is_RF_model_with_imp_pre_1_15(
+group_table, summary_table)
+
+# Query to add oob variable importance for categorical vars
+if is_pre_1_15_RF_model:
+# In < 1.15 RF model, the variable importance was captured 
using two
+# different columns named 'cat_var_importance'
+plpy.execute(
+""" CREATE TABLE {output_table} AS
+SELECT {grouping_cols_comma}
+unnest(regexp_split_to_array(cat_features, ',')) 
AS feature,
+unnest(cat_var_importance) AS var_importance
+FROM {group_table}, {summary_table}
+""".format(**locals()))
--- End diff --

IMO, it's cleaner to see the two queries `UNION`ed together to get the 
output. 


---