This is an automated email from the ASF dual-hosted git repository.

njayaram pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit 99929ee56a8926b2dbd60ed154a13773b1017540
Author: Nandish Jayaram <[email protected]>
AuthorDate: Wed May 22 15:19:43 2019 -0700

    DL: Update content of the model summary table
    
    JIRA: MADLIB-1349, MADLIB-1338
    Add a new column named `metrics_type` that captures the metric name
    (if any) specified in the compile params. This commit also fixes the
    reported model_size, which was previously incorrect.
    
    There were a few comments in MADLIB-1338 after the corresponding PR was
    merged to master. Some of those comments are addressed in this PR, since
    they relate to the same code and involve only minor changes.
    1. Ensure compile_params cannot be NULL.
    2. Update some info messages.
    3. Use the @MinWarning decorator to suppress notices.
    4. Ensure training_metrics_final and validation_metrics_final are NULL
    if the compile params did not have any metric specified.
    
    Closes #397
    Co-authored-by: Jingyi Mei <[email protected]>
---
 .../modules/deep_learning/madlib_keras.py_in       | 90 +++++++++++++---------
 .../deep_learning/madlib_keras_predict.py_in       |  2 +
 .../deep_learning/madlib_keras_wrapper.py_in       | 13 +++-
 .../modules/deep_learning/test/madlib_keras.sql_in | 27 +++++++
 .../test/unit_tests/test_madlib_keras.py_in        |  7 --
 5 files changed, 95 insertions(+), 44 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index f628f23..bafc7e0 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -45,8 +45,10 @@ from madlib_keras_validator import FitInputValidator
 from madlib_keras_wrapper import *
 from keras_model_arch_table import Format
 
+from utilities.control import MinWarning
 from utilities.model_arch_info import get_input_shape
 from utilities.model_arch_info import get_num_classes
+from utilities.utilities import _assert
 from utilities.utilities import is_platform_pg
 from utilities.utilities import get_segments_per_host
 from utilities.utilities import madlib_version
@@ -54,27 +56,7 @@ from utilities.validate_args import get_col_value_and_type
 from utilities.validate_args import get_expr_type
 from utilities.validate_args import quote_ident
 
-def get_source_summary_table_dict(fit_validator):
-    source_summary = plpy.execute("""
-            SELECT
-                {class_values} AS class_values,
-                {norm_const} AS norm_const,
-                {dep_vartype} AS dep_vartype,
-                {dep_varname} AS dependent_varname_in_source_table,
-                {indep_varname} AS independent_varname_in_source_table
-            FROM {tbl}
-        """.format(class_values=CLASS_VALUES_COLNAME,
-                   norm_const=NORMALIZING_CONST_COLNAME,
-                   dep_vartype=DEPENDENT_VARTYPE_COLNAME,
-                   dep_varname='dependent_varname',
-                   indep_varname='independent_varname',
-                   tbl=fit_validator.source_summary_table))[0]
-    source_summary['class_values_type'] = get_expr_type(
-        CLASS_VALUES_COLNAME, fit_validator.source_summary_table)
-    source_summary['norm_const_type'] = get_expr_type(
-        NORMALIZING_CONST_COLNAME, fit_validator.source_summary_table)
-    return source_summary
-
+@MinWarning("warning")
 def fit(schema_madlib, source_table, model,model_arch_table,
         model_arch_id, compile_params, fit_params, num_iterations,
         gpus_per_host = 0, validation_table=None,
@@ -85,11 +67,12 @@ def fit(schema_madlib, source_table, model,model_arch_table,
     independent_varname = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL
     model_arch_table = quote_ident(model_arch_table)
     fit_params = "" if not fit_params else fit_params
+    _assert(compile_params, "Compile parameters cannot be empty or NULL.")
 
     fit_validator = FitInputValidator(
         source_table, validation_table, model, model_arch_table,
-        dependent_varname, independent_varname, num_iterations,
-        metrics_compute_frequency)
+        dependent_varname, independent_varname,
+        num_iterations, metrics_compute_frequency)
     if metrics_compute_frequency is None:
         metrics_compute_frequency = num_iterations
 
@@ -188,9 +171,8 @@ def fit(schema_madlib, source_table, model,model_arch_table,
     training_loss, training_metrics, aggregate_runtime = [], [], []
     metrics_iters = []
 
-    plpy.info("Model architecture size: {}KB".format(len(model_arch)/1024))
-    plpy.info("Model state (serialized) size: {}MB".format(
-        len(model_state)/1024/1024))
+    # get the size of serialized model weights string in KB
+    model_size = sys.getsizeof(model_state)/1024.0
 
     # Run distributed training for specified number of iterations
     for i in range(1, num_iterations+1):
@@ -198,7 +180,7 @@ def fit(schema_madlib, source_table, model,model_arch_table,
         iteration_result = plpy.execute(run_training_iteration,
                                         [model_state])[0]['iteration_result']
         end_iteration = time.time()
-        plpy.info("Time for iteration {0}: {1} sec".
+        plpy.info("Time for training in iteration {0}: {1} sec".
                   format(i, end_iteration - start_iteration))
         model_state = madlib_keras_serializer.deserialize_iteration_state(
             iteration_result)
@@ -238,19 +220,22 @@ def fit(schema_madlib, source_table, 
model,model_arch_table,
     independent_varname_in_source_table = 
src_summary_dict['independent_varname_in_source_table']
     # Define some constants to be inserted into the summary table.
     model_type = "madlib_keras"
-    model_size = sys.getsizeof(model)
+    compile_params_dict = convert_string_of_args_to_dict(compile_params)
+    metrics_list = get_metrics_from_compile_param(compile_params)
+    is_metrics_specified = True if metrics_list else False
+    metrics_type = 'ARRAY{0}'.format(metrics_list) if is_metrics_specified 
else 'NULL'
     metrics_iters = metrics_iters if metrics_iters else 'NULL'
     # We always compute the training loss and metrics, at least once.
-    training_metrics_final = training_metrics[-1]
     training_loss_final = training_loss[-1]
-    training_metrics = training_metrics if training_metrics else 'NULL'
-    training_loss = training_loss if training_loss else 'NULL'
+    training_loss = 'ARRAY{0}'.format(training_loss) if training_loss else 
'NULL'
+    training_metrics_final, training_metrics = get_metrics_sql_string(
+        training_metrics, is_metrics_specified)
     # Validation loss and metrics are computed only if validation_table
     # is provided.
     if validation_set_provided:
-        validation_metrics_final = validation_metrics[-1]
+        validation_metrics_final, validation_metrics = get_metrics_sql_string(
+            validation_metrics, is_metrics_specified)
         validation_loss_final = validation_loss[-1]
-        validation_metrics = 'ARRAY{0}'.format(validation_metrics)
         validation_loss = 'ARRAY{0}'.format(validation_loss)
         # Must quote the string before inserting to table. Explicitly
         # quoting it here since this can also take a NULL value, done
@@ -278,7 +263,7 @@ def fit(schema_madlib, source_table, model,model_arch_table,
             $3 AS name,
             $4 AS description,
             '{model_type}'::TEXT AS model_type,
-            {model_size}::INTEGER AS model_size,
+            {model_size}::DOUBLE PRECISION AS model_size,
             '{start_training_time}'::TIMESTAMP AS start_training_time,
             '{end_training_time}'::TIMESTAMP AS end_training_time,
             $5 AS time_iter,
@@ -287,10 +272,11 @@ def fit(schema_madlib, source_table, 
model,model_arch_table,
             $6 AS {class_values_colname},
             $MAD${dep_vartype}$MAD$::TEXT AS {dependent_vartype_colname},
             {norm_const}::DOUBLE PRECISION AS {normalizing_const_colname},
+            {metrics_type}::TEXT[] AS metrics_type,
             {training_metrics_final}::DOUBLE PRECISION AS 
training_metrics_final,
             {training_loss_final}::DOUBLE PRECISION AS training_loss_final,
-            ARRAY{training_metrics}::DOUBLE PRECISION[] AS training_metrics,
-            ARRAY{training_loss}::DOUBLE PRECISION[] AS training_loss,
+            {training_metrics}::DOUBLE PRECISION[] AS training_metrics,
+            {training_loss}::DOUBLE PRECISION[] AS training_loss,
             {validation_metrics_final}::DOUBLE PRECISION AS 
validation_metrics_final,
             {validation_loss_final}::DOUBLE PRECISION AS validation_loss_final,
             {validation_metrics}::DOUBLE PRECISION[] AS validation_metrics,
@@ -319,6 +305,38 @@ def fit(schema_madlib, source_table, 
model,model_arch_table,
     #TODO add a unit test for this in a future PR
     reset_cuda_env(original_cuda_env)
 
+def get_source_summary_table_dict(fit_validator):
+    source_summary = plpy.execute("""
+            SELECT
+                {class_values} AS class_values,
+                {norm_const} AS norm_const,
+                {dep_vartype} AS dep_vartype,
+                {dep_varname} AS dependent_varname_in_source_table,
+                {indep_varname} AS independent_varname_in_source_table
+            FROM {tbl}
+        """.format(class_values=CLASS_VALUES_COLNAME,
+                   norm_const=NORMALIZING_CONST_COLNAME,
+                   dep_vartype=DEPENDENT_VARTYPE_COLNAME,
+                   dep_varname='dependent_varname',
+                   indep_varname='independent_varname',
+                   tbl=fit_validator.source_summary_table))[0]
+    source_summary['class_values_type'] = get_expr_type(
+        CLASS_VALUES_COLNAME, fit_validator.source_summary_table)
+    source_summary['norm_const_type'] = get_expr_type(
+        NORMALIZING_CONST_COLNAME, fit_validator.source_summary_table)
+    return source_summary
+
+def get_metrics_sql_string(metrics_list, is_metrics_specified):
+    """
+        Return the SQL string to use for creating metrics SQL values.
+    """
+    if is_metrics_specified:
+        metrics_final = metrics_list[-1]
+        metrics_all = 'ARRAY{0}'.format(metrics_list)
+    else:
+        metrics_final = metrics_all = 'NULL'
+    return metrics_final, metrics_all
+
 def compute_loss_and_metrics(schema_madlib, table, dependent_varname,
                              independent_varname, compile_params, model_arch,
                              model_state, gpus_per_host, segments_per_host,
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
index eb42bf7..dbf29e9 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
@@ -34,6 +34,7 @@ from madlib_keras_validator import PredictInputValidator
 from madlib_keras_wrapper import get_device_name_and_set_cuda_env
 from madlib_keras_wrapper import set_model_weights
 from predict_input_params import PredictParamsProcessor
+from utilities.control import MinWarning
 from utilities.model_arch_info import get_input_shape
 from utilities.utilities import add_postfix
 from utilities.utilities import create_cols_from_array_sql_string
@@ -45,6 +46,7 @@ from madlib_keras_wrapper import *
 
 MODULE_NAME = 'madlib_keras_predict'
 
+@MinWarning("warning")
 def predict(schema_madlib, model_table, test_table, id_col,
             independent_varname, output_table, pred_type, gpus_per_host, 
**kwargs):
     if not pred_type:
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
index 8abb196..eef30bf 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_wrapper.py_in
@@ -150,6 +150,18 @@ def convert_string_of_args_to_dict(str_of_args):
     compile_dict[key_str.strip()]=value_str.strip().strip('\'')
     return compile_dict
 
+def get_metrics_from_compile_param(str_of_args):
+    compile_dict = convert_string_of_args_to_dict(str_of_args)
+    metrics = None
+    ckey = 'metrics'
+    if ckey in compile_dict:
+        try:
+            metrics = ast.literal_eval(compile_dict[ckey])
+        except ValueError:
+            plpy.error(("Invalid input value for parameter {0}, "
+                        "please refer to the documentation").format(ckey))
+    return metrics
+
 # Parse the compile parameters and the optimizer.
 def parse_and_validate_compile_params(str_of_args):
     """
@@ -160,7 +172,6 @@ def parse_and_validate_compile_params(str_of_args):
         opt_args:               Arguments for the optimizer
         compile_dict:           Dictionary of arguments for keras.compile
     """
-
     literal_eval_compile_params = ['metrics', 'loss_weights',
                                    'weighted_metrics', 'sample_weight_mode']
     accepted_compile_params = literal_eval_compile_params + ['optimizer', 
'loss']
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index e2b9d4e..9b577b3 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -139,6 +139,7 @@ SELECT assert(
         metrics_compute_frequency = 3 AND
         num_classes = 2 AND
         class_values = '{0,1}' AND
+        metrics_type = '{mae}' AND
         training_metrics_final >= 0  AND
         training_loss_final  >= 0  AND
         array_upper(training_metrics, 1) = 1 AND
@@ -175,6 +176,7 @@ SELECT assert(
         metrics_compute_frequency = 4 AND
         training_metrics_final >= 0  AND
         training_loss_final  >= 0  AND
+        metrics_type = '{accuracy}' AND
         array_upper(training_metrics, 1) = 2 AND
         array_upper(training_loss, 1) = 2 AND
         array_upper(time_iter, 1) = 2 AND
@@ -259,6 +261,7 @@ SELECT assert(
     madlib_version is NOT NULL AND
     num_classes = 2 AND
     class_values = '{0,1}' AND
+    metrics_type = '{accuracy}' AND
     normalizing_const = 255.0 AND
     training_metrics_final is not NULL AND
     training_loss_final is not NULL AND
@@ -284,6 +287,18 @@ $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossent
 $$ batch_size=2, epochs=1, verbose=0 $$::text,
 1);
 
+SELECT assert(
+        metrics_type is NULL AND
+        training_metrics IS NULL AND
+        array_upper(training_loss, 1) = 1 AND
+        array_upper(time_iter, 1) = 1 AND
+        validation_metrics_final IS NULL AND
+        validation_loss_final  >= 0  AND
+        validation_metrics IS NULL AND
+        array_upper(validation_loss, 1) = 1,
+        'Keras model output Summary Validation failed. Actual:' || 
__to_char(summary))
+FROM (SELECT * FROM keras_saved_out_summary) summary;
+
 -- Validate metrics=[] works fine
 DROP TABLE IF EXISTS keras_saved_out, keras_saved_out_summary;
 SELECT madlib_keras_fit(
@@ -295,6 +310,18 @@ $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossent
 $$ batch_size=2, epochs=1, verbose=0 $$::text,
 1);
 
+SELECT assert(
+        metrics_type IS NULL AND
+        training_metrics IS NULL AND
+        array_upper(training_loss, 1) = 1 AND
+        array_upper(time_iter, 1) = 1 AND
+        validation_metrics_final IS NULL AND
+        validation_loss_final  >= 0  AND
+        validation_metrics IS NULL AND
+        array_upper(validation_loss, 1) = 1,
+        'Keras model output Summary Validation failed. Actual:' || 
__to_char(summary))
+FROM (SELECT * FROM keras_saved_out_summary) summary;
+
 DROP TABLE IF EXISTS cifar10_predict;
 SELECT madlib_keras_predict(
     'keras_saved_out',
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index 4ef3e9a..edfb69a 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -822,11 +822,6 @@ class MadlibKerasWrapperTestCase(unittest.TestCase):
             self.subject.parse_and_validate_compile_params(test_str)
         self.assertIn('invalid literal for float', str(error.exception))
 
-        test_str = ""
-        with self.assertRaises(plpy.PLPYException) as error:
-            self.subject.parse_and_validate_compile_params(test_str)
-        self.assertIn('not accepted', str(error.exception))
-
     def test_parse_and_validate_fit_params_invalid_optimizer_fail(self):
         test_str = "optimizer='SGD1', loss='categorical_crossentropy'"
         with self.assertRaises(plpy.PLPYException) as error:
@@ -834,8 +829,6 @@ class MadlibKerasWrapperTestCase(unittest.TestCase):
         self.assertIn('invalid optimizer', str(error.exception))
 
 
-
-
 class MadlibKerasValidatorTestCase(unittest.TestCase):
     def setUp(self):
         self.plpy_mock = Mock(spec='error')

Reply via email to