Repository: madlib
Updated Branches:
  refs/heads/master 467807b54 -> fa6d53a42


Bugfix: MLP predict using 1.12 model fails on later versions

JIRA: MADLIB-1207

MADlib 1.12 did not support grouping in MLP, and the summary table it
created held the mean and std used for standardizing the independent
variable. From MADlib 1.13 onwards, grouping is supported, and the mean
and std were moved to the standardization table.
As a result, prediction failed when an MLP model trained with MADlib 1.12
was used with MADlib 1.13 or later. This commit fixes that issue.

Closes #237


Project: http://git-wip-us.apache.org/repos/asf/madlib/repo
Commit: http://git-wip-us.apache.org/repos/asf/madlib/commit/fa6d53a4
Tree: http://git-wip-us.apache.org/repos/asf/madlib/tree/fa6d53a4
Diff: http://git-wip-us.apache.org/repos/asf/madlib/diff/fa6d53a4

Branch: refs/heads/master
Commit: fa6d53a4256020cb82e801068153ec5fdb65c2a4
Parents: 467807b
Author: Nandish Jayaram <njaya...@apache.org>
Authored: Wed Feb 21 17:18:57 2018 -0800
Committer: Nandish Jayaram <njaya...@apache.org>
Committed: Fri Feb 23 11:04:11 2018 -0800

----------------------------------------------------------------------
 src/ports/postgres/modules/convex/mlp_igd.py_in | 45 ++++++++++++++++----
 1 file changed, 37 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/madlib/blob/fa6d53a4/src/ports/postgres/modules/convex/mlp_igd.py_in
----------------------------------------------------------------------
diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 68e5dfd..405c289 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -748,8 +748,25 @@ def mlp_predict(schema_madlib,
         summary['layer_sizes'], array_type="DOUBLE PRECISION")
     is_classification = int(summary["is_classification"])
     is_response = int(pred_type == 'response')
-    grouping_col = '' if summary['grouping_col']=='NULL' \
-                    else summary['grouping_col']
+    # Fix to ensure that 1.12 models run on 1.13 or higher.
+    # As a result of adding grouping support in 1.13, some changes were
+    # made wrt standardization.
+    # The x_mean and x_std values were stored in the summary table itself in
+    # MADlib 1.12, and they were named as "x_means" and "x_stds".
+    # From MADlib 1.13 onwards, these parameters were moved to the
+    # _standardization table, and were renamed to "mean" and "std".
+    if 'grouping_col' in summary:
+        # This model was created in MADlib 1.13 or greater version
+        is_pre_113_model = False
+        grouping_col = '' if summary['grouping_col']=='NULL' \
+                        else summary['grouping_col']
+    else:
+        # This model was created in MADlib 1.12. Grouping was not
+        # supported in 1.12, but was added later in 1.13.
+        is_pre_113_model = True
+        grouping_col = ''
+        # Validate the summary table created with the 1.12 MLP model table.
+        cols_in_tbl_valid(summary_table, ['x_means', 'x_stds'], 'MLP')
 
     pred_name = ('"prob_{0}"' if pred_type == "prob" else
                 '"estimated_{0}"').format(
@@ -774,7 +791,8 @@ def mlp_predict(schema_madlib,
     grouping_col_comma = ""
     join_str = ''
     grouping_col_list = split_quoted_delimited_str(grouping_col)
-    _validate_standardization_table(standardization_table, grouping_col_list)
+    if not is_pre_113_model:
+        _validate_standardization_table(standardization_table, grouping_col_list)
     if grouping_col:
         join_str = """JOIN {model_table}
             USING ({grouping_col})
@@ -795,14 +813,25 @@ def mlp_predict(schema_madlib,
     else:
         # if not grouping, then directly read out the coeff, mean
         # and std values from the model and standardization tables.
-        standardization = plpy.execute(
-            "SELECT * FROM {0}".format(standardization_table))[0]
+
+        if is_pre_113_model:
+            # Get mean and std from the summary table
+            standardization = plpy.execute("""
+                    SELECT x_means AS mean, x_stds AS std
+                    FROM {0}
+                """.format(summary_table))[0]
+        else:
+            # Get mean and std from the standardization table
+            standardization = plpy.execute("""
+                    SELECT mean, std
+                    FROM {0}
+                """.format(standardization_table))[0]
         coeff = py_list_to_sql_string(plpy.execute(
             "SELECT * FROM {0}".format(model_table))[0]["coeff"])
         x_means = py_list_to_sql_string(
-            standardization["mean"], array_type="DOUBLE PRECISION")
+            standardization['mean'], array_type="DOUBLE PRECISION")
         x_stds = py_list_to_sql_string(
-            standardization["std"], array_type="DOUBLE PRECISION")
+            standardization['std'], array_type="DOUBLE PRECISION")
 
         coeff_column = "{coeff}".format(**locals())
         mean_col = "{x_means}".format(**locals())
@@ -823,7 +852,7 @@ def mlp_predict(schema_madlib,
         dependent_type = get_expr_type(dependent_varname, source_table)
         unnest_if_not_array = ""
         # Return the same type as the user provided.  Internally we always
-        # use an array, but if they provided a scaler, unnest it for
+        # use an array, but if they provided a scalar, unnest it for
         # the user
         if "[]" not in dependent_type:
             unnest_if_not_array = "UNNEST"

Reply via email to