[madlib] 03/04: DL: Set plan_cache_mode when passing weights for GPDB6

khannaekta Mon, 16 Mar 2020 10:36:29 -0700

This is an automated email from the ASF dual-hosted git repository.

khannaekta pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


commit fe7ba29d08328354a2626cd4b69b8eb69ac1f16b
Author: Ekta Khanna <[email protected]>
AuthorDate: Mon Mar 9 11:04:01 2020 -0700

    DL: Set plan_cache_mode when passing weights for GPDB6
    
    JIRA: MADLIB-1414
    
    For versions >=GPDB6, previously, for queries called with weights value
    passed in as a param, the query plan for it would create custom plans
    with weights embedded in the plan itself.  This meant that the query
    plan size would also include the size of these weights, bloating it up
    to hit the 1GB limit when dispatching the query plan to segments,
    leading to OOM for large weights.
    
    In GPDB, for PREPARE plans, there is a GUC `plan_cache_mode` that
    controls choosing generic plan(not constant folding the passed in
    params)/custom plans(constant folding the passed in params) for
    execution. Therefore, for GPDB6 setting this GUC to `force_generic_plan`
    for using generic plans when passing in weights as params to queries.
---
 src/ports/postgres/modules/convex/mlp_igd.py_in    |  4 +--
 .../modules/deep_learning/madlib_keras.py_in       | 14 +++++----
 .../modules/deep_learning/madlib_keras.sql_in      | 34 ++++++++++++++--------
 .../madlib_keras_fit_multiple_model.sql_in         |  4 ++-
 .../deep_learning/madlib_keras_predict.py_in       | 15 ++++++++--
 .../utilities/test/unit_tests/test_utilities.py_in | 18 ++++++++----
 .../postgres/modules/utilities/utilities.py_in     | 23 ++++++++++++---
 7 files changed, 79 insertions(+), 33 deletions(-)

diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in 
b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 888bed9..956ea4c 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -1051,8 +1051,8 @@ def mlp_predict(schema_madlib, model_table, data_table, 
id_col_name,
         else:
             intermediate_col = unique_string()
             if classes:
-                score_format = create_cols_from_array_sql_string(
-                    classes, intermediate_col, 'prob', 'double precision', 
False, 'MLP') 
+                score_format, _ = create_cols_from_array_sql_string(
+                    classes, intermediate_col, 'prob', 'double precision', 
False, 'MLP')
             else:
                 # Case when the training step did not have to one-hot encode
                 # the dependent var.
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 596b0b2..d6fb857 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -313,11 +313,15 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
                  [compile_params, fit_params, name,
                   description, metrics_elapsed_time, class_values])
 
-    create_output_table = plpy.prepare("""
-        CREATE TABLE {0} AS SELECT
-        $1 as model_weights,
-        $2 as {1}""".format(model, ModelArchSchema.MODEL_ARCH), ["bytea", 
"json"])
-    plpy.execute(create_output_table, [serialized_weights, model_arch])
+    plpy.execute("""
+        CREATE TABLE {0}
+        (model_weights bytea,
+        {1} json)""".format(model, ModelArchSchema.MODEL_ARCH))
+    insert_output_table = plpy.prepare("""
+        INSERT INTO {0} SELECT model_weights, {1}
+        FROM (VALUES($1, $2))t(model_weights, {1})
+        """.format(model, ModelArchSchema.MODEL_ARCH), ["bytea", "json"])
+    plpy.execute(insert_output_table, [serialized_weights, model_arch])
 
     #TODO add a unit test for this in a future PR
     reset_cuda_env(original_cuda_env)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index 1d2d0ba..e4794a3 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -1643,8 +1643,10 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_fit(
     description             VARCHAR
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras')
+    from utilities.control import SetGUC
     with AOControl(False):
-        madlib_keras.fit(**globals())
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras.fit(**globals())
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1836,16 +1838,18 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_predict(
     mst_key                 INTEGER
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    from utilities.control import SetGUC
     with AOControl(False):
-        madlib_keras_predict.Predict(schema_madlib,
-               model_table,
-               test_table,
-               id_col,
-               independent_varname,
-               output_table,
-               pred_type,
-               use_gpus,
-               mst_key)
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras_predict.Predict(schema_madlib,
+                   model_table,
+                   test_table,
+                   id_col,
+                   independent_varname,
+                   output_table,
+                   pred_type,
+                   use_gpus,
+                   mst_key)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1917,8 +1921,10 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_predict_byom(
     normalizing_const       DOUBLE PRECISION
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
+    from utilities.control import SetGUC
     with AOControl(False):
-        madlib_keras_predict.PredictBYOM(**globals())
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras_predict.PredictBYOM(**globals())
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
@@ -1986,7 +1992,11 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_evaluate(
     use_gpus                BOOLEAN,
     mst_key                 INTEGER
 ) RETURNS VOID AS $$
-    PythonFunction(`deep_learning', `madlib_keras', `evaluate')
+    PythonFunctionBodyOnly(`deep_learning', `madlib_keras')
+    from utilities.control import SetGUC
+    with AOControl(False):
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            madlib_keras.evaluate(**globals())
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
index 1b0e675..df79d86 100644
--- 
a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
+++ 
b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in
@@ -1398,8 +1398,10 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_fit_multiple_model(
     description             VARCHAR
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_fit_multiple_model')
+    from utilities.control import SetGUC
     with AOControl(False):
-        fit_obj = madlib_keras_fit_multiple_model.FitMultipleModel(**globals())
+        with SetGUC("plan_cache_mode", "force_generic_plan"):
+            fit_obj = 
madlib_keras_fit_multiple_model.FitMultipleModel(**globals())
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
index 16ae7b1..412e63b 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
@@ -35,6 +35,7 @@ from utilities.utilities import add_postfix
 from utilities.utilities import create_cols_from_array_sql_string
 from utilities.utilities import get_segments_per_host
 from utilities.utilities import unique_string
+from utilities.validate_args import get_expr_type
 from utilities.validate_args import input_tbl_valid
 
 from madlib_keras_wrapper import *
@@ -78,7 +79,7 @@ class BasePredict():
         intermediate_col = unique_string()
         class_values = 
strip_trailing_nulls_from_class_values(self.class_values)
 
-        prediction_select_clause = create_cols_from_array_sql_string(
+        prediction_select_clause, create_table_columns = 
create_cols_from_array_sql_string(
             class_values, intermediate_col, pred_col_name,
             pred_col_type, self.is_response, self.module_name)
         gp_segment_id_col, seg_ids_test, \
@@ -94,14 +95,20 @@ class BasePredict():
             group_by_clause = "GROUP BY 
{self.test_table}.gp_segment_id".format(self=self)
             join_cond_on_segmentid = 
"{self.test_table}.gp_segment_id=min_ctid.gp_segment_id AND".format(self=self)
 
+        # Calling CREATE TABLE instead of CTAS, to ensure that the 
plan_cache_mode
+        # guc codepath is called when passing in the weights
+        plpy.execute("""
+            CREATE TABLE {self.output_table}
+            ({self.id_col} {self.id_col_type}, {create_table_columns})
+            """.format(self=self, create_table_columns=create_table_columns))
         # Passing huge model weights to internal_keras_predict() for each row
         # resulted in slowness of overall madlib_keras_predict().
         # To avoid this, a CASE is added to pass the model weights only for
         # the very first row(min(ctid)) that is fetched on each segment and 
NULL
         # for the other rows.
         predict_query = plpy.prepare("""
-            CREATE TABLE {self.output_table} AS
-            SELECT {self.id_col}, {prediction_select_clause}
+            INSERT INTO {self.output_table}
+            SELECT {self.id_col}::{self.id_col_type}, 
{prediction_select_clause}
             FROM (
                 SELECT {self.test_table}.{self.id_col},
                        ({self.schema_madlib}.internal_keras_predict
@@ -175,6 +182,7 @@ class Predict(BasePredict):
         self.dependent_varname = param_proc.get_dependent_varname()
 
         self.validate()
+        self.id_col_type = get_expr_type(self.id_col, self.test_table)
         BasePredict.call_internal_keras(self)
         if self.is_mult_model:
             plpy.execute("DROP VIEW IF EXISTS 
{}".format(self.temp_summary_view))
@@ -230,6 +238,7 @@ class PredictBYOM(BasePredict):
             self.test_table, self.id_col, self.output_table,
             self.independent_varname)
         self.validate_and_set_defaults()
+        self.id_col_type = get_expr_type(self.id_col, self.test_table)
         BasePredict.call_internal_keras(self)
 
     def validate_and_set_defaults(self):
diff --git 
a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in 
b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
index b884eec..0318516 100644
--- a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
+++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in
@@ -254,15 +254,17 @@ class UtilitiesTestCase(unittest.TestCase):
         self.colname = 'estimated_col'
         self.coltype = 'dummy'
         self.has_one_ele = True
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, 'sqlcol[1]+1 AS estimated_col')
+        self.assertEqual(out_col, 'estimated_col dummy')
         self.has_one_ele = False
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, 'sqlcol AS estimated_col')
+        self.assertEqual(out_col, 'estimated_col dummy[]')
 
     def test_create_cols_from_array_sql_string_one_ele(self):
         utils = self.subject
@@ -271,10 +273,11 @@ class UtilitiesTestCase(unittest.TestCase):
         self.colname = 'estimated_pred'
         self.coltype = 'TEXT'
         self.has_one_ele = True
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertTrue(out_sql, "(ARRAY['cat','dog'])[sqlcol[1]+1]::TEXT AS 
estimated_pred")
+        self.assertTrue(out_col, "estimated_pred TEXT")
 
     def test_create_cols_from_array_sql_string_one_ele_with_NULL(self):
         utils = self.subject
@@ -283,10 +286,11 @@ class UtilitiesTestCase(unittest.TestCase):
         self.colname = 'estimated_pred'
         self.coltype = 'INTEGER'
         self.has_one_ele = True
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, "(ARRAY[ NULL,1,2 
]::INTEGER[])[sqlcol[1]+1]::INTEGER AS estimated_pred")
+        self.assertEqual(out_col, "estimated_pred INTEGER")
 
     def test_create_cols_from_array_sql_string_one_ele_with_many_NULL(self):
         utils = self.subject
@@ -307,10 +311,11 @@ class UtilitiesTestCase(unittest.TestCase):
         self.colname = 'prob'
         self.coltype = 'TEXT'
         self.has_one_ele = False
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, "CAST(sqlcol[1] AS TEXT) AS \"prob_cat\", 
CAST(sqlcol[2] AS TEXT) AS \"prob_dog\"")
+        self.assertEqual(out_col, "\"prob_cat\" TEXT, \"prob_dog\" TEXT")
 
     def test_create_cols_from_array_sql_string_many_ele_with_NULL(self):
         utils = self.subject
@@ -319,10 +324,11 @@ class UtilitiesTestCase(unittest.TestCase):
         self.colname = 'prob'
         self.coltype = 'TEXT'
         self.has_one_ele = False
-        out_sql = utils.create_cols_from_array_sql_string(
+        out_sql, out_col = utils.create_cols_from_array_sql_string(
             self.py_list, self.sql_array_col, self.colname, self.coltype,
             self.has_one_ele, "dummy_module")
         self.assertEqual(out_sql, "CAST(sqlcol[1] AS TEXT) AS \"prob_NULL\", 
CAST(sqlcol[2] AS TEXT) AS \"prob_cat\", CAST(sqlcol[3] AS TEXT) AS 
\"prob_dog\"")
+        self.assertEqual(out_col, "\"prob_NULL\" TEXT, \"prob_cat\" TEXT, 
\"prob_dog\" TEXT")
 
     def test_create_cols_from_array_sql_string_many_ele_with_many_NULL(self):
         utils = self.subject
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in 
b/src/ports/postgres/modules/utilities/utilities.py_in
index 12b5205..210fbd4 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -431,7 +431,8 @@ def create_cols_from_array_sql_string(py_list, 
sql_array_col, colname,
                                       coltype, has_one_ele,
                                       module_name='Input Error'):
     """
-    Create SQL string to convert array of elements into multiple columns.
+    Create SQL string to convert array of elements into multiple columns and 
corresponding
+    SQL string of columns for CREATE TABLE.
     @args:
         @param: py_list, python list, if None, return sql_array_col as colname.
                             The py_list can at most have one 'None' element 
that
@@ -454,6 +455,7 @@ def create_cols_from_array_sql_string(py_list, 
sql_array_col, colname,
                 coltype = TEXT
                 has_one_ele = FALSE
             Output:
+                prob_cat TEXT, prob_dog TEXT
                 CAST(sqlcol[1] AS TEXT) AS prob_cat, CAST(sqlcol[2] AS TEXT) 
AS prob_dog
         2) Input:
                 py_list = ['cat', 'dog']
@@ -462,23 +464,25 @@ def create_cols_from_array_sql_string(py_list, 
sql_array_col, colname,
                 coltype = TEXT
                 has_one_ele = TRUE
             Output:
+                estimated_pred TEXT
                 (ARRAY['cat','dog'])[sqlcol[1]+1]::TEXT AS estimated_pred
 
     @NOTE:
         If py_list is [None, 'cat', 'dog', NULL']:
         then the SQL query string returned would create the following
         column names:
-            prob_NULL, prob_cat, 'prob_dog', and 'prob_"NULL'.
+            prob_NULL, prob_cat, 'prob_dog', and 'prob_"NULL"'.
         1. Notice that for None, which represents Postgres' NULL value, the
         column name will be 'prob_NULL',
         2. and to differentiate the column name for a string 'NULL', the
-        resulting column name will be 'prob_"NULL'.
+        resulting column name will be 'prob_"NULL"'.
 
         The weird quoting in this column name is due to calling strip after
         quote_ident in the code below.
 
     @returns:
         @param, str, that can be used in a SQL query.
+        @param, str, that can be used in a SQL query.
 
     """
     _assert(sql_array_col, "{0}: sql_array_col should be a valid string.".
@@ -515,6 +519,8 @@ def create_cols_from_array_sql_string(py_list, 
sql_array_col, colname,
             py_list_sql_str = py_list_to_sql_string(py_list, coltype+'[]')
             select_clause = "({0})[{1}[1]+1]::{2} AS {3}".format(
                 py_list_sql_str, sql_array_col, coltype, colname)
+            create_columns = "{0} {1}".format(
+                colname, coltype)
         else:
             # Create as many columns as the length of py_list. The
             # colnames are created based on the elements in py_list,
@@ -534,12 +540,21 @@ def create_cols_from_array_sql_string(py_list, 
sql_array_col, colname,
                            coltype=coltype)
                 for i, suffix in enumerate(py_list)
                 ])
+            create_columns = ', '.join(
+                ['"{final_colname}" {coltype}'.
+                     format(final_colname=quote_ident("{0}_{1}".
+                                                      format(colname, 
str(suffix))).strip(' "'),
+                            coltype=coltype)
+                 for i, suffix in enumerate(py_list)
+                 ])
     else:
         if has_one_ele:
             select_clause = '{0}[1]+1 AS {1}'.format(sql_array_col, colname)
+            create_columns = '{0} {1}'.format(colname, coltype)
         else:
             select_clause = '{0} AS {1}'.format(sql_array_col, colname)
-    return select_clause
+            create_columns = '{0} {1}'.format(colname, coltype+'[]')
+    return select_clause, create_columns
 # ------------------------------------------------------------------------
 
 def _array_to_string(origin):

[madlib] 03/04: DL: Set plan_cache_mode when passing weights for GPDB6

Reply via email to