This is an automated email from the ASF dual-hosted git repository. njayaram pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 99929ee56a8926b2dbd60ed154a13773b1017540 Author: Nandish Jayaram <[email protected]> AuthorDate: Wed May 22 15:19:43 2019 -0700 DL: Update content of the model summary table JIRA: MADLIB-1349, MADLIB-1338 Add a new column named `metrics_type` that captures the metric name (if specified) in the compile params. This commit also fixes the model_size reported; it was reporting the wrong size. There were a few comments in MADLIB-1338 after the corresponding PR was merged to master. Some of those comments are addressed in this PR since they are related code, and minor changes. 1. Ensure compile_params cannot be NULL. 2. Update some info messages. 3. Use the @MinWarning decorator to suppress notices. 4. Ensure training_metrics_final and validation_metrics_final are NULL if the compile params did not have any metric specified. Closes #397 Co-authored-by: Jingyi Mei <[email protected]> --- .../modules/deep_learning/madlib_keras.py_in | 90 +++++++++++++--------- .../deep_learning/madlib_keras_predict.py_in | 2 + .../deep_learning/madlib_keras_wrapper.py_in | 13 +++- .../modules/deep_learning/test/madlib_keras.sql_in | 27 +++++++ .../test/unit_tests/test_madlib_keras.py_in | 7 -- 5 files changed, 95 insertions(+), 44 deletions(-) diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in index f628f23..bafc7e0 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in @@ -45,8 +45,10 @@ from madlib_keras_validator import FitInputValidator from madlib_keras_wrapper import * from keras_model_arch_table import Format +from utilities.control import MinWarning from utilities.model_arch_info import get_input_shape from utilities.model_arch_info import get_num_classes +from utilities.utilities import _assert from utilities.utilities import is_platform_pg from utilities.utilities import get_segments_per_host from 
utilities.utilities import madlib_version @@ -54,27 +56,7 @@ from utilities.validate_args import get_col_value_and_type from utilities.validate_args import get_expr_type from utilities.validate_args import quote_ident -def get_source_summary_table_dict(fit_validator): - source_summary = plpy.execute(""" - SELECT - {class_values} AS class_values, - {norm_const} AS norm_const, - {dep_vartype} AS dep_vartype, - {dep_varname} AS dependent_varname_in_source_table, - {indep_varname} AS independent_varname_in_source_table - FROM {tbl} - """.format(class_values=CLASS_VALUES_COLNAME, - norm_const=NORMALIZING_CONST_COLNAME, - dep_vartype=DEPENDENT_VARTYPE_COLNAME, - dep_varname='dependent_varname', - indep_varname='independent_varname', - tbl=fit_validator.source_summary_table))[0] - source_summary['class_values_type'] = get_expr_type( - CLASS_VALUES_COLNAME, fit_validator.source_summary_table) - source_summary['norm_const_type'] = get_expr_type( - NORMALIZING_CONST_COLNAME, fit_validator.source_summary_table) - return source_summary - +@MinWarning("warning") def fit(schema_madlib, source_table, model,model_arch_table, model_arch_id, compile_params, fit_params, num_iterations, gpus_per_host = 0, validation_table=None, @@ -85,11 +67,12 @@ def fit(schema_madlib, source_table, model,model_arch_table, independent_varname = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL model_arch_table = quote_ident(model_arch_table) fit_params = "" if not fit_params else fit_params + _assert(compile_params, "Compile parameters cannot be empty or NULL.") fit_validator = FitInputValidator( source_table, validation_table, model, model_arch_table, - dependent_varname, independent_varname, num_iterations, - metrics_compute_frequency) + dependent_varname, independent_varname, + num_iterations, metrics_compute_frequency) if metrics_compute_frequency is None: metrics_compute_frequency = num_iterations @@ -188,9 +171,8 @@ def fit(schema_madlib, source_table, model,model_arch_table, training_loss, 
training_metrics, aggregate_runtime = [], [], [] metrics_iters = [] - plpy.info("Model architecture size: {}KB".format(len(model_arch)/1024)) - plpy.info("Model state (serialized) size: {}MB".format( - len(model_state)/1024/1024)) + # get the size of serialized model weights string in KB + model_size = sys.getsizeof(model_state)/1024.0 # Run distributed training for specified number of iterations for i in range(1, num_iterations+1): @@ -198,7 +180,7 @@ def fit(schema_madlib, source_table, model,model_arch_table, iteration_result = plpy.execute(run_training_iteration, [model_state])[0]['iteration_result'] end_iteration = time.time() - plpy.info("Time for iteration {0}: {1} sec". + plpy.info("Time for training in iteration {0}: {1} sec". format(i, end_iteration - start_iteration)) model_state = madlib_keras_serializer.deserialize_iteration_state( iteration_result) @@ -238,19 +220,22 @@ def fit(schema_madlib, source_table, model,model_arch_table, independent_varname_in_source_table = src_summary_dict['independent_varname_in_source_table'] # Define some constants to be inserted into the summary table. model_type = "madlib_keras" - model_size = sys.getsizeof(model) + compile_params_dict = convert_string_of_args_to_dict(compile_params) + metrics_list = get_metrics_from_compile_param(compile_params) + is_metrics_specified = True if metrics_list else False + metrics_type = 'ARRAY{0}'.format(metrics_list) if is_metrics_specified else 'NULL' metrics_iters = metrics_iters if metrics_iters else 'NULL' # We always compute the training loss and metrics, at least once. 
- training_metrics_final = training_metrics[-1] training_loss_final = training_loss[-1] - training_metrics = training_metrics if training_metrics else 'NULL' - training_loss = training_loss if training_loss else 'NULL' + training_loss = 'ARRAY{0}'.format(training_loss) if training_loss else 'NULL' + training_metrics_final, training_metrics = get_metrics_sql_string( + training_metrics, is_metrics_specified) # Validation loss and metrics are computed only if validation_table # is provided. if validation_set_provided: - validation_metrics_final = validation_metrics[-1] + validation_metrics_final, validation_metrics = get_metrics_sql_string( + validation_metrics, is_metrics_specified) validation_loss_final = validation_loss[-1] - validation_metrics = 'ARRAY{0}'.format(validation_metrics) validation_loss = 'ARRAY{0}'.format(validation_loss) # Must quote the string before inserting to table. Explicitly # quoting it here since this can also take a NULL value, done @@ -278,7 +263,7 @@ def fit(schema_madlib, source_table, model,model_arch_table, $3 AS name, $4 AS description, '{model_type}'::TEXT AS model_type, - {model_size}::INTEGER AS model_size, + {model_size}::DOUBLE PRECISION AS model_size, '{start_training_time}'::TIMESTAMP AS start_training_time, '{end_training_time}'::TIMESTAMP AS end_training_time, $5 AS time_iter, @@ -287,10 +272,11 @@ def fit(schema_madlib, source_table, model,model_arch_table, $6 AS {class_values_colname}, $MAD${dep_vartype}$MAD$::TEXT AS {dependent_vartype_colname}, {norm_const}::DOUBLE PRECISION AS {normalizing_const_colname}, + {metrics_type}::TEXT[] AS metrics_type, {training_metrics_final}::DOUBLE PRECISION AS training_metrics_final, {training_loss_final}::DOUBLE PRECISION AS training_loss_final, - ARRAY{training_metrics}::DOUBLE PRECISION[] AS training_metrics, - ARRAY{training_loss}::DOUBLE PRECISION[] AS training_loss, + {training_metrics}::DOUBLE PRECISION[] AS training_metrics, + {training_loss}::DOUBLE PRECISION[] AS training_loss, 
{validation_metrics_final}::DOUBLE PRECISION AS validation_metrics_final, {validation_loss_final}::DOUBLE PRECISION AS validation_loss_final, {validation_metrics}::DOUBLE PRECISION[] AS validation_metrics, @@ -319,6 +305,38 @@ def fit(schema_madlib, source_table, model,model_arch_table, #TODO add a unit test for this in a future PR reset_cuda_env(original_cuda_env) +def get_source_summary_table_dict(fit_validator): + source_summary = plpy.execute(""" + SELECT + {class_values} AS class_values, + {norm_const} AS norm_const, + {dep_vartype} AS dep_vartype, + {dep_varname} AS dependent_varname_in_source_table, + {indep_varname} AS independent_varname_in_source_table + FROM {tbl} + """.format(class_values=CLASS_VALUES_COLNAME, + norm_const=NORMALIZING_CONST_COLNAME, + dep_vartype=DEPENDENT_VARTYPE_COLNAME, + dep_varname='dependent_varname', + indep_varname='independent_varname', + tbl=fit_validator.source_summary_table))[0] + source_summary['class_values_type'] = get_expr_type( + CLASS_VALUES_COLNAME, fit_validator.source_summary_table) + source_summary['norm_const_type'] = get_expr_type( + NORMALIZING_CONST_COLNAME, fit_validator.source_summary_table) + return source_summary + +def get_metrics_sql_string(metrics_list, is_metrics_specified): + """ + Return the SQL string to use for creating metrics SQL values. 
+ """ + if is_metrics_specified: + metrics_final = metrics_list[-1] + metrics_all = 'ARRAY{0}'.format(metrics_list) + else: + metrics_final = metrics_all = 'NULL' + return metrics_final, metrics_all + def compute_loss_and_metrics(schema_madlib, table, dependent_varname, independent_varname, compile_params, model_arch, model_state, gpus_per_host, segments_per_host, diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in index eb42bf7..dbf29e9 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in @@ -34,6 +34,7 @@ from madlib_keras_validator import PredictInputValidator from madlib_keras_wrapper import get_device_name_and_set_cuda_env from madlib_keras_wrapper import set_model_weights from predict_input_params import PredictParamsProcessor +from utilities.control import MinWarning from utilities.model_arch_info import get_input_shape from utilities.utilities import add_postfix from utilities.utilities import create_cols_from_array_sql_string @@ -45,6 +46,7 @@ from madlib_keras_wrapper import * MODULE_NAME = 'madlib_keras_predict' +@MinWarning("warning") def predict(schema_madlib, model_table, test_table, id_col, independent_varname, output_table, pred_type, gpus_per_host, **kwargs): if not pred_type: diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in index 8abb196..eef30bf 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in @@ -150,6 +150,18 @@ def convert_string_of_args_to_dict(str_of_args): compile_dict[key_str.strip()]=value_str.strip().strip('\'') return compile_dict +def get_metrics_from_compile_param(str_of_args): + compile_dict = 
convert_string_of_args_to_dict(str_of_args) + metrics = None + ckey = 'metrics' + if ckey in compile_dict: + try: + metrics = ast.literal_eval(compile_dict[ckey]) + except ValueError: + plpy.error(("Invalid input value for parameter {0}, " + "please refer to the documentation").format(ckey)) + return metrics + # Parse the compile parameters and the optimizer. def parse_and_validate_compile_params(str_of_args): """ @@ -160,7 +172,6 @@ def parse_and_validate_compile_params(str_of_args): opt_args: Arguments for the optimizer compile_dict: Dictionary of arguments for keras.compile """ - literal_eval_compile_params = ['metrics', 'loss_weights', 'weighted_metrics', 'sample_weight_mode'] accepted_compile_params = literal_eval_compile_params + ['optimizer', 'loss'] diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in index e2b9d4e..9b577b3 100644 --- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in +++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in @@ -139,6 +139,7 @@ SELECT assert( metrics_compute_frequency = 3 AND num_classes = 2 AND class_values = '{0,1}' AND + metrics_type = '{mae}' AND training_metrics_final >= 0 AND training_loss_final >= 0 AND array_upper(training_metrics, 1) = 1 AND @@ -175,6 +176,7 @@ SELECT assert( metrics_compute_frequency = 4 AND training_metrics_final >= 0 AND training_loss_final >= 0 AND + metrics_type = '{accuracy}' AND array_upper(training_metrics, 1) = 2 AND array_upper(training_loss, 1) = 2 AND array_upper(time_iter, 1) = 2 AND @@ -259,6 +261,7 @@ SELECT assert( madlib_version is NOT NULL AND num_classes = 2 AND class_values = '{0,1}' AND + metrics_type = '{accuracy}' AND normalizing_const = 255.0 AND training_metrics_final is not NULL AND training_loss_final is not NULL AND @@ -284,6 +287,18 @@ $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossent $$ batch_size=2, epochs=1, 
verbose=0 $$::text, 1); +SELECT assert( + metrics_type is NULL AND + training_metrics IS NULL AND + array_upper(training_loss, 1) = 1 AND + array_upper(time_iter, 1) = 1 AND + validation_metrics_final IS NULL AND + validation_loss_final >= 0 AND + validation_metrics IS NULL AND + array_upper(validation_loss, 1) = 1, + 'Keras model output Summary Validation failed. Actual:' || __to_char(summary)) +FROM (SELECT * FROM keras_saved_out_summary) summary; + -- Validate metrics=[] works fine DROP TABLE IF EXISTS keras_saved_out, keras_saved_out_summary; SELECT madlib_keras_fit( @@ -295,6 +310,18 @@ $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), loss='categorical_crossent $$ batch_size=2, epochs=1, verbose=0 $$::text, 1); +SELECT assert( + metrics_type IS NULL AND + training_metrics IS NULL AND + array_upper(training_loss, 1) = 1 AND + array_upper(time_iter, 1) = 1 AND + validation_metrics_final IS NULL AND + validation_loss_final >= 0 AND + validation_metrics IS NULL AND + array_upper(validation_loss, 1) = 1, + 'Keras model output Summary Validation failed. 
Actual:' || __to_char(summary)) +FROM (SELECT * FROM keras_saved_out_summary) summary; + DROP TABLE IF EXISTS cifar10_predict; SELECT madlib_keras_predict( 'keras_saved_out', diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in index 4ef3e9a..edfb69a 100644 --- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in @@ -822,11 +822,6 @@ class MadlibKerasWrapperTestCase(unittest.TestCase): self.subject.parse_and_validate_compile_params(test_str) self.assertIn('invalid literal for float', str(error.exception)) - test_str = "" - with self.assertRaises(plpy.PLPYException) as error: - self.subject.parse_and_validate_compile_params(test_str) - self.assertIn('not accepted', str(error.exception)) - def test_parse_and_validate_fit_params_invalid_optimizer_fail(self): test_str = "optimizer='SGD1', loss='categorical_crossentropy'" with self.assertRaises(plpy.PLPYException) as error: @@ -834,8 +829,6 @@ class MadlibKerasWrapperTestCase(unittest.TestCase): self.assertIn('invalid optimizer', str(error.exception)) - - class MadlibKerasValidatorTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error')
