This is an automated email from the ASF dual-hosted git repository. khannaekta pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit fe7ba29d08328354a2626cd4b69b8eb69ac1f16b Author: Ekta Khanna <[email protected]> AuthorDate: Mon Mar 9 11:04:01 2020 -0700 DL: Set plan_cache_mode when passing weights for GPDB6 JIRA: MADLIB-1414 For versions >=GPDB6, previously, for queries called with weights value passed in as a param, the query plan for it would create custom plans with weights embedded in the plan itself. This meant that the query plan size would also include the size of these weights, bloating it up to hit the 1GB limit when dispatching the query plan to segments, leading to OOM for large weights. In GPDB, for PREPARE plans, there is a GUC `plan_cache_mode` that controls choosing generic plan(not constant folding the passed in params)/custom plans(constant folding the passed in params) for execution. Therefore, for GPDB6 setting this GUC to `force_generic_plan` for using generic plans when passing in weights as params to queries. --- src/ports/postgres/modules/convex/mlp_igd.py_in | 4 +-- .../modules/deep_learning/madlib_keras.py_in | 14 +++++---- .../modules/deep_learning/madlib_keras.sql_in | 34 ++++++++++++++-------- .../madlib_keras_fit_multiple_model.sql_in | 4 ++- .../deep_learning/madlib_keras_predict.py_in | 15 ++++++++-- .../utilities/test/unit_tests/test_utilities.py_in | 18 ++++++++---- .../postgres/modules/utilities/utilities.py_in | 23 ++++++++++++--- 7 files changed, 79 insertions(+), 33 deletions(-) diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in b/src/ports/postgres/modules/convex/mlp_igd.py_in index 888bed9..956ea4c 100644 --- a/src/ports/postgres/modules/convex/mlp_igd.py_in +++ b/src/ports/postgres/modules/convex/mlp_igd.py_in @@ -1051,8 +1051,8 @@ def mlp_predict(schema_madlib, model_table, data_table, id_col_name, else: intermediate_col = unique_string() if classes: - score_format = create_cols_from_array_sql_string( - classes, intermediate_col, 'prob', 'double precision', False, 'MLP') + score_format, _ = create_cols_from_array_sql_string( + classes, intermediate_col, 'prob', 'double precision', False, 'MLP') else: # Case when the training step did not have to one-hot encode # the dependent var. diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in index 596b0b2..d6fb857 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in @@ -313,11 +313,15 @@ def fit(schema_madlib, source_table, model, model_arch_table, [compile_params, fit_params, name, description, metrics_elapsed_time, class_values]) - create_output_table = plpy.prepare(""" - CREATE TABLE {0} AS SELECT - $1 as model_weights, - $2 as {1}""".format(model, ModelArchSchema.MODEL_ARCH), ["bytea", "json"]) - plpy.execute(create_output_table, [serialized_weights, model_arch]) + plpy.execute(""" + CREATE TABLE {0} + (model_weights bytea, + {1} json)""".format(model, ModelArchSchema.MODEL_ARCH)) + insert_output_table = plpy.prepare(""" + INSERT INTO {0} SELECT model_weights, {1} + FROM (VALUES($1, $2))t(model_weights, {1}) + """.format(model, ModelArchSchema.MODEL_ARCH), ["bytea", "json"]) + plpy.execute(insert_output_table, [serialized_weights, model_arch]) #TODO add a unit test for this in a future PR reset_cuda_env(original_cuda_env) diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in index 1d2d0ba..e4794a3 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in @@ -1643,8 +1643,10 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit( description VARCHAR ) RETURNS VOID AS $$ PythonFunctionBodyOnly(`deep_learning', `madlib_keras') + from utilities.control import SetGUC with AOControl(False): - madlib_keras.fit(**globals()) + with SetGUC("plan_cache_mode", "force_generic_plan"): + madlib_keras.fit(**globals()) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -1836,16 +1838,18 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict( mst_key INTEGER ) RETURNS VOID AS $$ PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict') + from utilities.control import SetGUC with AOControl(False): - madlib_keras_predict.Predict(schema_madlib, - model_table, - test_table, - id_col, - independent_varname, - output_table, - pred_type, - use_gpus, - mst_key) + with SetGUC("plan_cache_mode", "force_generic_plan"): + madlib_keras_predict.Predict(schema_madlib, + model_table, + test_table, + id_col, + independent_varname, + output_table, + pred_type, + use_gpus, + mst_key) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -1917,8 +1921,10 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_predict_byom( normalizing_const DOUBLE PRECISION ) RETURNS VOID AS $$ PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict') + from utilities.control import SetGUC with AOControl(False): - madlib_keras_predict.PredictBYOM(**globals()) + with SetGUC("plan_cache_mode", "force_generic_plan"): + madlib_keras_predict.PredictBYOM(**globals()) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -1986,7 +1992,11 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_evaluate( use_gpus BOOLEAN, mst_key INTEGER ) RETURNS VOID AS $$ - PythonFunction(`deep_learning', `madlib_keras', `evaluate') + PythonFunctionBodyOnly(`deep_learning', `madlib_keras') + from utilities.control import SetGUC + with AOControl(False): + with SetGUC("plan_cache_mode", "force_generic_plan"): + madlib_keras.evaluate(**globals()) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in index 1b0e675..df79d86 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_fit_multiple_model.sql_in @@ -1398,8 +1398,10 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit_multiple_model( description VARCHAR ) RETURNS VOID AS $$ PythonFunctionBodyOnly(`deep_learning', `madlib_keras_fit_multiple_model') + from utilities.control import SetGUC with AOControl(False): - fit_obj = madlib_keras_fit_multiple_model.FitMultipleModel(**globals()) + with SetGUC("plan_cache_mode", "force_generic_plan"): + fit_obj = madlib_keras_fit_multiple_model.FitMultipleModel(**globals()) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in index 16ae7b1..412e63b 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in @@ -35,6 +35,7 @@ from utilities.utilities import add_postfix from utilities.utilities import create_cols_from_array_sql_string from utilities.utilities import get_segments_per_host from utilities.utilities import unique_string +from utilities.validate_args import get_expr_type from utilities.validate_args import input_tbl_valid from madlib_keras_wrapper import * @@ -78,7 +79,7 @@ class BasePredict(): intermediate_col = unique_string() class_values = strip_trailing_nulls_from_class_values(self.class_values) - prediction_select_clause = create_cols_from_array_sql_string( + prediction_select_clause, create_table_columns = create_cols_from_array_sql_string( class_values, intermediate_col, pred_col_name, pred_col_type, self.is_response, self.module_name) gp_segment_id_col, seg_ids_test, \ @@ -94,14 +95,20 @@ class BasePredict(): group_by_clause = "GROUP BY {self.test_table}.gp_segment_id".format(self=self) join_cond_on_segmentid = "{self.test_table}.gp_segment_id=min_ctid.gp_segment_id AND".format(self=self) + # Calling CREATE TABLE instead of CTAS, to ensure that the plan_cache_mode + # guc codepath is called when passing in the weights + plpy.execute(""" + CREATE TABLE {self.output_table} + ({self.id_col} {self.id_col_type}, {create_table_columns}) + """.format(self=self, create_table_columns=create_table_columns)) # Passing huge model weights to internal_keras_predict() for each row # resulted in slowness of overall madlib_keras_predict(). # To avoid this, a CASE is added to pass the model weights only for # the very first row(min(ctid)) that is fetched on each segment and NULL # for the other rows. predict_query = plpy.prepare(""" - CREATE TABLE {self.output_table} AS - SELECT {self.id_col}, {prediction_select_clause} + INSERT INTO {self.output_table} + SELECT {self.id_col}::{self.id_col_type}, {prediction_select_clause} FROM ( SELECT {self.test_table}.{self.id_col}, ({self.schema_madlib}.internal_keras_predict @@ -175,6 +182,7 @@ class Predict(BasePredict): self.dependent_varname = param_proc.get_dependent_varname() self.validate() + self.id_col_type = get_expr_type(self.id_col, self.test_table) BasePredict.call_internal_keras(self) if self.is_mult_model: plpy.execute("DROP VIEW IF EXISTS {}".format(self.temp_summary_view)) @@ -230,6 +238,7 @@ class PredictBYOM(BasePredict): self.test_table, self.id_col, self.output_table, self.independent_varname) self.validate_and_set_defaults() + self.id_col_type = get_expr_type(self.id_col, self.test_table) BasePredict.call_internal_keras(self) def validate_and_set_defaults(self): diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in index b884eec..0318516 100644 --- a/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in +++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_utilities.py_in @@ -254,15 +254,17 @@ class UtilitiesTestCase(unittest.TestCase): self.colname = 'estimated_col' self.coltype = 'dummy' self.has_one_ele = True - out_sql = utils.create_cols_from_array_sql_string( + out_sql, out_col = utils.create_cols_from_array_sql_string( self.py_list, self.sql_array_col, self.colname, self.coltype, self.has_one_ele, "dummy_module") self.assertEqual(out_sql, 'sqlcol[1]+1 AS estimated_col') + self.assertEqual(out_col, 'estimated_col dummy') self.has_one_ele = False - out_sql = utils.create_cols_from_array_sql_string( + out_sql, out_col = utils.create_cols_from_array_sql_string( self.py_list, self.sql_array_col, self.colname, self.coltype, self.has_one_ele, "dummy_module") self.assertEqual(out_sql, 'sqlcol AS estimated_col') + self.assertEqual(out_col, 'estimated_col dummy[]') def test_create_cols_from_array_sql_string_one_ele(self): utils = self.subject @@ -271,10 +273,11 @@ class UtilitiesTestCase(unittest.TestCase): self.colname = 'estimated_pred' self.coltype = 'TEXT' self.has_one_ele = True - out_sql = utils.create_cols_from_array_sql_string( + out_sql, out_col = utils.create_cols_from_array_sql_string( self.py_list, self.sql_array_col, self.colname, self.coltype, self.has_one_ele, "dummy_module") self.assertTrue(out_sql, "(ARRAY['cat','dog'])[sqlcol[1]+1]::TEXT AS estimated_pred") + self.assertTrue(out_col, "estimated_pred TEXT") def test_create_cols_from_array_sql_string_one_ele_with_NULL(self): utils = self.subject @@ -283,10 +286,11 @@ class UtilitiesTestCase(unittest.TestCase): self.colname = 'estimated_pred' self.coltype = 'INTEGER' self.has_one_ele = True - out_sql = utils.create_cols_from_array_sql_string( + out_sql, out_col = utils.create_cols_from_array_sql_string( self.py_list, self.sql_array_col, self.colname, self.coltype, self.has_one_ele, "dummy_module") self.assertEqual(out_sql, "(ARRAY[ NULL,1,2 ]::INTEGER[])[sqlcol[1]+1]::INTEGER AS estimated_pred") + self.assertEqual(out_col, "estimated_pred INTEGER") def test_create_cols_from_array_sql_string_one_ele_with_many_NULL(self): utils = self.subject @@ -307,10 +311,11 @@ class UtilitiesTestCase(unittest.TestCase): self.colname = 'prob' self.coltype = 'TEXT' self.has_one_ele = False - out_sql = utils.create_cols_from_array_sql_string( + out_sql, out_col = utils.create_cols_from_array_sql_string( self.py_list, self.sql_array_col, self.colname, self.coltype, self.has_one_ele, "dummy_module") self.assertEqual(out_sql, "CAST(sqlcol[1] AS TEXT) AS \"prob_cat\", CAST(sqlcol[2] AS TEXT) AS \"prob_dog\"") + self.assertEqual(out_col, "\"prob_cat\" TEXT, \"prob_dog\" TEXT") def test_create_cols_from_array_sql_string_many_ele_with_NULL(self): utils = self.subject @@ -319,10 +324,11 @@ class UtilitiesTestCase(unittest.TestCase): self.colname = 'prob' self.coltype = 'TEXT' self.has_one_ele = False - out_sql = utils.create_cols_from_array_sql_string( + out_sql, out_col = utils.create_cols_from_array_sql_string( self.py_list, self.sql_array_col, self.colname, self.coltype, self.has_one_ele, "dummy_module") self.assertEqual(out_sql, "CAST(sqlcol[1] AS TEXT) AS \"prob_NULL\", CAST(sqlcol[2] AS TEXT) AS \"prob_cat\", CAST(sqlcol[3] AS TEXT) AS \"prob_dog\"") + self.assertEqual(out_col, "\"prob_NULL\" TEXT, \"prob_cat\" TEXT, \"prob_dog\" TEXT") def test_create_cols_from_array_sql_string_many_ele_with_many_NULL(self): utils = self.subject diff --git a/src/ports/postgres/modules/utilities/utilities.py_in b/src/ports/postgres/modules/utilities/utilities.py_in index 12b5205..210fbd4 100644 --- a/src/ports/postgres/modules/utilities/utilities.py_in +++ b/src/ports/postgres/modules/utilities/utilities.py_in @@ -431,7 +431,8 @@ def create_cols_from_array_sql_string(py_list, sql_array_col, colname, coltype, has_one_ele, module_name='Input Error'): """ - Create SQL string to convert array of elements into multiple columns. + Create SQL string to convert array of elements into multiple columns and corresponding + SQL string of columns for CREATE TABLE. @args: @param: py_list, python list, if None, return sql_array_col as colname. The py_list can at most have one 'None' element that @@ -454,6 +455,7 @@ def create_cols_from_array_sql_string(py_list, sql_array_col, colname, coltype = TEXT has_one_ele = FALSE Output: + prob_cat TEXT, prob_dog TEXT CAST(sqlcol[1] AS TEXT) AS prob_cat, CAST(sqlcol[2] AS TEXT) AS prob_dog 2) Input: py_list = ['cat', 'dog'] @@ -462,23 +464,25 @@ def create_cols_from_array_sql_string(py_list, sql_array_col, colname, coltype = TEXT has_one_ele = TRUE Output: + estimated_pred TEXT (ARRAY['cat','dog'])[sqlcol[1]+1]::TEXT AS estimated_pred @NOTE: If py_list is [None, 'cat', 'dog', NULL']: then the SQL query string returned would create the following column names: - prob_NULL, prob_cat, 'prob_dog', and 'prob_"NULL'. + prob_NULL, prob_cat, 'prob_dog', and 'prob_"NULL"'. 1. Notice that for None, which represents Postgres' NULL value, the column name will be 'prob_NULL', 2. and to differentiate the column name for a string 'NULL', the - resulting column name will be 'prob_"NULL'. + resulting column name will be 'prob_"NULL"'. The weird quoting in this column name is due to calling strip after quote_ident in the code below. @returns: @param, str, that can be used in a SQL query. + @param, str, that can be used in a SQL query. """ _assert(sql_array_col, "{0}: sql_array_col should be a valid string.". @@ -515,6 +519,8 @@ def create_cols_from_array_sql_string(py_list, sql_array_col, colname, py_list_sql_str = py_list_to_sql_string(py_list, coltype+'[]') select_clause = "({0})[{1}[1]+1]::{2} AS {3}".format( py_list_sql_str, sql_array_col, coltype, colname) + create_columns = "{0} {1}".format( + colname, coltype) else: # Create as many columns as the length of py_list. The # colnames are created based on the elements in py_list, @@ -534,12 +540,21 @@ def create_cols_from_array_sql_string(py_list, sql_array_col, colname, coltype=coltype) for i, suffix in enumerate(py_list) ]) + create_columns = ', '.join( + ['"{final_colname}" {coltype}'. + format(final_colname=quote_ident("{0}_{1}". + format(colname, str(suffix))).strip(' "'), + coltype=coltype) + for i, suffix in enumerate(py_list) + ]) else: if has_one_ele: select_clause = '{0}[1]+1 AS {1}'.format(sql_array_col, colname) + create_columns = '{0} {1}'.format(colname, coltype) else: select_clause = '{0} AS {1}'.format(sql_array_col, colname) - return select_clause + create_columns = '{0} {1}'.format(colname, coltype+'[]') + return select_clause, create_columns # ------------------------------------------------------------------------ def _array_to_string(origin):
