Repository: incubator-madlib
Updated Branches:
  refs/heads/master 41a439f47 -> 80410635a


Multiple: Fix array to string conversion for DT with CV

This change is made to allow double quotes in names of feature arrays
input to DT/RF. The double quotes were leading to failures when cast
into a text array for cross validation.


Project: http://git-wip-us.apache.org/repos/asf/incubator-madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-madlib/commit/80410635
Tree: http://git-wip-us.apache.org/repos/asf/incubator-madlib/tree/80410635
Diff: http://git-wip-us.apache.org/repos/asf/incubator-madlib/diff/80410635

Branch: refs/heads/master
Commit: 80410635a0d4e0ce8eb95a85d9dbb7fc551eb2e7
Parents: 41a439f
Author: Rahul Iyer <ri...@apache.org>
Authored: Wed May 17 16:31:45 2017 -0700
Committer: Rahul Iyer <ri...@apache.org>
Committed: Wed May 17 16:31:45 2017 -0700

----------------------------------------------------------------------
 .../recursive_partitioning/decision_tree.py_in  | 120 +++++++++++++++----
 .../modules/validation/cross_validation.py_in   |  34 ++++--
 2 files changed, 118 insertions(+), 36 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/80410635/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
----------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in 
b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
index 32c484d..7b724c1 100644
--- a/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
+++ b/src/ports/postgres/modules/recursive_partitioning/decision_tree.py_in
@@ -2088,37 +2088,30 @@ def _xvalidate(schema_madlib, tree_states, 
training_table_name, output_table_nam
     pred_name = '"estimated_{0}"'.format(dependent_variable.strip(' "'))
     grouping_str = 'NULL' if not grouping_cols else '"' + grouping_cols + '"'
 
-    cat_feature_str = _array_to_string(cat_features)
-    ordered_cat_feature_str = _array_to_string(ordered_cat_features)
-    boolean_cat_str = _array_to_string(boolean_cats)
-    con_feature_str = _array_to_string(con_features)
-
-    modeling_params = [str(i) for i in
-                       (is_classification,
-                        split_criterion, "%data%", "%model%", id_col_name,
-                        dependent_variable, dep_is_bool,
-                        cat_feature_str, ordered_cat_feature_str,
-                        boolean_cat_str, con_feature_str,
-                        grouping_str, weights, max_depth,
-                        min_split, min_bucket, n_bins,
-                        "%explore%", max_n_surr, msg_level)]
-    modeling_param_types = (["BOOLEAN"] + ["TEXT"] * 5 + ["BOOLEAN"] +
-                            ["VARCHAR[]"] * 4 + ["TEXT"] * 2 + ["INTEGER"] * 4 
+
-                            ["TEXT", "SMALLINT", "TEXT"])
-
+    all_features = [cat_features, ordered_cat_features, boolean_cats, 
con_features]
+
+    # _get_xvalidate_params builds the parameters used in
+    # DT train, predict, distance functions. Single quotes are added in these
+    # parameters (except for the feature arrays) since we run
+    # cross_validation_grouping_w_params with `add_param_quotes=False'.
+    # This special handling is put in place to ensure the feature arrays are
+    # treated as arrays instead of strings.
+    xvalidate_params = _get_xvalidate_params(**locals())
     cross_validation_grouping_w_params(
         schema_madlib,
         schema_madlib + '.__build_tree',
-        modeling_params,
-        modeling_param_types,
+        xvalidate_params[0],
+        xvalidate_params[1],
         schema_madlib + '.__tree_predict',
-        ["%model%", "%data%", "%prediction%", "response", "True"],
-        ["VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "BOOLEAN"],
+        xvalidate_params[2],
+        xvalidate_params[3],
         schema_madlib + "." + metric_function,
-        ["%data%", dependent_variable, "%prediction%", pred_name, id_col_name, 
grouping_cols, "%error%", "True"],
-        ["VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", "TEXT", 
"VARCHAR", "BOOLEAN"],
+        xvalidate_params[4],
+        xvalidate_params[5],
         group_to_param_list_table, param_list_name, grouping_cols,
-        training_table_name, id_col_name, False, model_cv, 'cp', None, n_folds)
+        training_table_name, id_col_name, False,
+        model_cv, 'cp', None, n_folds,
+        add_param_quotes=False)
 
     # 3) find the best cp for each group from table {model_cv}
     if not grouping_cols:
@@ -2183,6 +2176,83 @@ def _xvalidate(schema_madlib, tree_states, 
training_table_name, output_table_nam
 # ------------------------------------------------------------
 
 
+def _get_xvalidate_params(**kwargs):
+    """ Build train, predict, and metric parameters for cross_validation
+
+    Args:
+        @param all_features
+
+    Returns:
+
+    """
+    def _list_to_string_to_array(array_input):
+        """ Return a string that can interpreted by postgresql as text[] 
containing
+        the names in array_input
+
+        Example:
+            Input: ['"Cont_features"[1]', '"Cont_features"[2]']
+            Output: 
string_to_array('"Cont_features"[1]~^~"Cont_features"[2]'::text, '~^~');
+
+            When this output is executed by PostgreSQL it creates a text array:
+                madlib=# select 
string_to_array('"Cont_features"[1]~^~"Cont_features"[2]'::text, 
'~^~')::VARCHAR[] as t;
+                                        t
+                -------------------------------------------------
+                 {"\"Cont_features\"[1]","\"Cont_features\"[2]"}
+                (1 row)
+        """
+        if not array_input:
+            return "'{}'"
+        return "string_to_array('{0}', '~^~')".format('~^~'.join(array_input))
+
+    all_feature_str = [_list_to_string_to_array(i) for i in 
kwargs['all_features']]
+
+    def _add_quote(s):
+        if s is None:
+            return None
+        s = str(s)
+        return "NULL" if s.upper() == 'NULL' else "'{0}'".format(s)
+
+    quoted_args = {}
+    for k, v in kwargs.items():
+        quoted_args[k] = _add_quote(v)
+
+    modeling_params = [quoted_args['is_classification'],
+                       quoted_args['split_criterion'],
+                       "%data%",
+                       "%model%",
+                       quoted_args['id_col_name'],
+                       quoted_args['dependent_variable'],
+                       quoted_args['dep_is_bool'],
+                       all_feature_str[0],
+                       all_feature_str[1],
+                       all_feature_str[2],
+                       all_feature_str[3],
+                       quoted_args['grouping_str'],
+                       quoted_args['weights'],
+                       quoted_args['max_depth'],
+                       quoted_args['min_split'], quoted_args['min_bucket'], 
quoted_args['n_bins'],
+                       "%explore%", quoted_args['max_n_surr'], 
quoted_args['msg_level']
+                       ]
+    modeling_param_types = (["BOOLEAN"] + ["TEXT"] * 5 + ["BOOLEAN"] +
+                            ["VARCHAR[]"] * 4 + ["TEXT"] * 2 + ["INTEGER"] * 4 
+
+                            ["TEXT", "SMALLINT", "TEXT"])
+    predict_params = ["%model%", "%data%", "%prediction%", "'response'", 
"True"]
+    predict_param_types = ["VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", 
"BOOLEAN"]
+    metric_params = ["%data%",
+                     quoted_args['dependent_variable'],
+                     "%prediction%",
+                     quoted_args['pred_name'],
+                     quoted_args['id_col_name'],
+                     quoted_args['grouping_cols'],
+                     "%error%",
+                     "True"]
+    metric_param_types = ["VARCHAR", "VARCHAR", "VARCHAR", "VARCHAR", 
"VARCHAR", "TEXT", "VARCHAR", "BOOLEAN"]
+    return [modeling_params, modeling_param_types,
+            predict_params, predict_param_types,
+            metric_params, metric_param_types]
+# ----------------------------------------------------------------------
+
+
 def _tree_train_using_bins(
         schema_madlib, bins, training_table_name,
         cat_features, con_features, boolean_cats, n_bins, weights,

http://git-wip-us.apache.org/repos/asf/incubator-madlib/blob/80410635/src/ports/postgres/modules/validation/cross_validation.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/validation/cross_validation.py_in 
b/src/ports/postgres/modules/validation/cross_validation.py_in
index f157ffa..3b3dfe1 100644
--- a/src/ports/postgres/modules/validation/cross_validation.py_in
+++ b/src/ports/postgres/modules/validation/cross_validation.py_in
@@ -20,7 +20,8 @@ mad_vec = version_wrapper.select_vecfunc()
 def __cv_combine_params_type_general(params, params_type, tbl_data,
                                      col_random_id, param_explored,
                                      explore_value, tbl_input, tbl_output,
-                                     grp_to_param_tbl=None):
+                                     grp_to_param_tbl=None,
+                                     add_param_quotes=True):
     """
     Create argument list for SQL functions for training, validation and metric 
measuring
 
@@ -71,7 +72,9 @@ def __cv_combine_params_type_general(params, params_type, 
tbl_data,
             elif p == "%group_param_tbl%":
                 rst.append("\'" + tbl_output + "\'::" + p_type)
             else:
-                rst.append("\'" + p + "\'::" + p_type)
+                if add_param_quotes:
+                    p = "'{0}'".format(p)
+                rst.append(p + "::" + p_type)
     return ','.join(rst)
 # ------------------------------------------------------------------------
 
@@ -106,7 +109,7 @@ def _replace_explore(params, param_explored, explore_value):
 
 def __cv_funcall_general(func, params, params_type, tbl_data, col_random_id,
                          param_explored, explore_value, tbl_input,
-                         tbl_output):
+                         tbl_output, add_param_quotes=True):
     """
     Call training, validation or metric measuring function
 
@@ -116,9 +119,10 @@ def __cv_funcall_general(func, params, params_type, 
tbl_data, col_random_id,
     """
     arg_string = __cv_combine_params_type_general(
         params, params_type, tbl_data, col_random_id,
-        param_explored, explore_value, tbl_input, tbl_output
-    )
+        param_explored, explore_value, tbl_input, tbl_output,
+        add_param_quotes=add_param_quotes)
     sql = "SELECT {func}({arg_string})".format(func=func, 
arg_string=arg_string)
+    plpy.info(sql)
     plpy.execute(sql)
 # ------------------------------------------------------------------------
 
@@ -207,7 +211,7 @@ def _one_step_cv(tbl_output_model, tbl_output_pred, 
tbl_output_error, tbl_accum_
                  modelling_func, modelling_params, modelling_params_type, 
tbl_train,
                  predict_func, predict_params, predict_params_type, tbl_valid,
                  metric_func, metric_params, metric_params_type, k=0, 
append_k=False,
-                 use_existing_tables=False, **kwargs):
+                 use_existing_tables=False, add_param_quotes=True, **kwargs):
     """
     Args:
         @param schema_madlib
@@ -251,19 +255,22 @@ def _one_step_cv(tbl_output_model, tbl_output_pred, 
tbl_output_error, tbl_accum_
     __cv_funcall_general(
         modelling_func, modelling_params, modelling_params_type,
         tbl_train, col_random_id, temp_param_explored,
-        explore_value, None, tbl_output_model)
+        explore_value, None, tbl_output_model,
+        add_param_quotes=add_param_quotes)
 
     # validation
     __cv_funcall_general(
         predict_func, predict_params, predict_params_type,
         tbl_valid, col_random_id, temp_param_explored,
-        explore_value, tbl_output_model, tbl_output_pred)
+        explore_value, tbl_output_model, tbl_output_pred,
+        add_param_quotes=add_param_quotes)
 
     # measure the performance metric
     __cv_funcall_general(
         metric_func, metric_params, metric_params_type,
         tbl_valid, col_random_id, temp_param_explored,
-        explore_value, tbl_output_pred, tbl_output_error)
+        explore_value, tbl_output_pred, tbl_output_error,
+        add_param_quotes=add_param_quotes)
 
     # accumulate the measured metric result
     if not output_created:
@@ -380,9 +387,10 @@ def cross_validation_general(
         return None
 # ------------------------------------------------------------------------
 
-# XXX Currently, this function is used only by the decision tree,
+# XXX Currently, this function is built specifically for the decision tree,
 # which appends the outputs from different cv folds to a single
 # table to reduce catalog changes and thus improve the performance.
+# FIXME: this should either be made general enough or moved to 
decision_tree.py_in
 
 
 def cross_validation_grouping_w_params(
@@ -436,7 +444,8 @@ def cross_validation_grouping_w_params(
                                 param_list_name=param_list_name,
                                 
group_to_param_list_table=group_to_param_list_table))
 
-        _replace_explore(modelling_params, unique_string(), grp_to_param_tbl)
+        _replace_explore(modelling_params, unique_string(),
+                         "'" + grp_to_param_tbl + "'")
         # get the maximum size of the param list among all groups
         max_len = plpy.execute("""
             SELECT max(array_upper({0}, 1)) as max_len
@@ -477,6 +486,9 @@ def cross_validation_grouping_w_params(
                 # to speed up the search in the prediction table.
                 append_k = True  # k is fold ID to distinguish different sets 
of results.
 
+                # XXX Important to not add single quotes around param
+                # as a special-handling for decision trees
+                add_param_quotes = False
                 output_created = _one_step_cv(**locals())
 
         # compute the averages and standard deviations of cv_error for each 
group and explore value

Reply via email to