[madlib] 01/03: DL: Add new param metrics_compute_frequency to madlib_keras_fit()

njayaram Tue, 14 May 2019 16:27:11 -0700

This is an automated email from the ASF dual-hosted git repository.

njayaram pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


commit a5692219a361fad8b8b09c4c6789dd9830a04e8f
Author: Ekta Khanna <[email protected]>
AuthorDate: Wed May 1 17:09:12 2019 -0700

    DL: Add new param metrics_compute_frequency to madlib_keras_fit()
    
    JIRA: MADLIB-1335
    This commit adds a new optional parameter `metrics_compute_frequency` to
    `madlib_keras_fit()` to control how often Keras.evaluate() is run on
    training and validation data for computing loss/metrics. We only support
    one metric at the moment.
    
    Closes #388
    Co-authored-by: Nandish Jayaram <[email protected]>
---
 .../modules/deep_learning/madlib_keras.py_in       | 320 +++++++++++++--------
 .../modules/deep_learning/madlib_keras.sql_in      |  40 ++-
 .../deep_learning/madlib_keras_validator.py_in     |  32 ++-
 .../modules/deep_learning/test/madlib_keras.sql_in |  63 ++--
 4 files changed, 291 insertions(+), 164 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index d0dde1d..9cda792 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -51,11 +51,34 @@ from utilities.utilities import is_platform_pg
 from utilities.utilities import get_segments_per_host
 from utilities.utilities import madlib_version
 from utilities.validate_args import get_col_value_and_type
+from utilities.validate_args import get_expr_type
 from utilities.validate_args import quote_ident
 
+def get_source_summary_table_dict(fit_validator):
+    source_summary = plpy.execute("""
+            SELECT
+                {class_values} AS class_values,
+                {norm_const} AS norm_const,
+                {dep_vartype} AS dep_vartype,
+                {dep_varname} AS dependent_varname_in_source_table,
+                {indep_varname} AS independent_varname_in_source_table
+            FROM {tbl}
+        """.format(class_values=CLASS_VALUES_COLNAME,
+                   norm_const=NORMALIZING_CONST_COLNAME,
+                   dep_vartype=DEPENDENT_VARTYPE_COLNAME,
+                   dep_varname='dependent_varname',
+                   indep_varname='independent_varname',
+                   tbl=fit_validator.source_summary_table))[0]
+    source_summary['class_values_type'] = get_expr_type(
+        CLASS_VALUES_COLNAME, fit_validator.source_summary_table)
+    source_summary['norm_const_type'] = get_expr_type(
+        NORMALIZING_CONST_COLNAME, fit_validator.source_summary_table)
+    return source_summary
+
 def fit(schema_madlib, source_table, model,model_arch_table,
         model_arch_id, compile_params, fit_params, num_iterations,
-        gpus_per_host = 0, validation_table=None, name="",
+        gpus_per_host = 0, validation_table=None,
+        metrics_compute_frequency=None, name="",
         description="", **kwargs):
     source_table = quote_ident(source_table)
     dependent_varname = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL
@@ -65,7 +88,10 @@ def fit(schema_madlib, source_table, model,model_arch_table,
 
     fit_validator = FitInputValidator(
         source_table, validation_table, model, model_arch_table,
-        dependent_varname, independent_varname, num_iterations)
+        dependent_varname, independent_varname, num_iterations,
+        metrics_compute_frequency)
+    if metrics_compute_frequency is None:
+        metrics_compute_frequency = num_iterations
 
     start_training_time = datetime.datetime.now()
 
@@ -135,7 +161,7 @@ def fit(schema_madlib, source_table, model,model_arch_table,
 
     # Construct validation dataset if provided
     validation_set_provided = bool(validation_table)
-    validation_aggregate_accuracy = []; validation_aggregate_loss = []
+    validation_metrics = []; validation_loss = []
 
     # Prepare the SQL for running distributed training via UDA
     compile_params_to_pass = "$madlib$" + compile_params + "$madlib$"
@@ -161,143 +187,138 @@ def fit(schema_madlib, source_table, 
model,model_arch_table,
     # Define the state for the model and loss/accuracy storage lists
     model_state = madlib_keras_serializer.serialize_weights(
         0, 0, 0, model_weights)
-    aggregate_loss, aggregate_accuracy, aggregate_runtime = [], [], []
+    training_loss, training_metrics, aggregate_runtime = [], [], []
+    metrics_iters = []
 
     plpy.info("Model architecture size: {}KB".format(len(model_arch)/1024))
     plpy.info("Model state (serialized) size: {}MB".format(
         len(model_state)/1024/1024))
 
     # Run distributed training for specified number of iterations
-    for i in range(num_iterations):
+    for i in range(1, num_iterations+1):
         start_iteration = time.time()
-        iteration_result = plpy.execute(run_training_iteration, 
[model_state])[0]['iteration_result']
+        iteration_result = plpy.execute(run_training_iteration,
+                                        [model_state])[0]['iteration_result']
         end_iteration = time.time()
         plpy.info("Time for iteration {0}: {1} sec".
-                  format(i + 1, end_iteration - start_iteration))
+                  format(i, end_iteration - start_iteration))
         aggregate_runtime.append(datetime.datetime.now())
-        avg_loss, avg_accuracy, model_state = 
madlib_keras_serializer.deserialize_iteration_state(iteration_result)
+        avg_loss, avg_metric, model_state = madlib_keras_serializer.\
+            deserialize_iteration_state(iteration_result)
         plpy.info("Average loss after training iteration {0}: {1}".format(
-            i + 1, avg_loss))
+            i, avg_loss))
         plpy.info("Average accuracy after training iteration {0}: {1}".format(
-            i + 1, avg_accuracy))
-        if validation_set_provided:
-            _, _, _, updated_weights = 
madlib_keras_serializer.deserialize_weights(
-                model_state, model_shapes)
-            master_model.set_weights(updated_weights)
-            start_val = time.time()
-            evaluate_result = get_loss_acc_from_keras_eval(schema_madlib,
-                                                           validation_table,
-                                                           dependent_varname,
-                                                           independent_varname,
-                                                           
compile_params_to_pass,
-                                                           model_arch, 
model_state,
-                                                           gpus_per_host,
-                                                           segments_per_host,
-                                                           seg_ids_val,
-                                                           images_per_seg_val,
-                                                           gp_segment_id_col)
-            end_val = time.time()
-            plpy.info("Time for validation in iteration {0}: {1} sec". 
format(i + 1, end_val - start_val))
-            if len(evaluate_result) < 2:
-                plpy.error('Calling evaluate on validation data returned < 2 '
-                           'metrics. Expected metrics are loss and accuracy')
-            validation_loss = evaluate_result[0]
-            validation_accuracy = evaluate_result[1]
-            plpy.info("Validation set accuracy after iteration {0}: {1}".
-                      format(i + 1, validation_accuracy))
-            validation_aggregate_accuracy.append(validation_accuracy)
-            validation_aggregate_loss.append(validation_loss)
-        aggregate_loss.append(avg_loss)
-        aggregate_accuracy.append(avg_accuracy)
+            i, avg_metric))
+
+        if should_compute_metrics_this_iter(i, metrics_compute_frequency,
+                                            num_iterations):
+            # TODO: Do we need this code?
+            # _, _, _, updated_weights = 
madlib_keras_serializer.deserialize_weights(
+            #     model_state, model_shapes)
+            # master_model.set_weights(updated_weights)
+            # Compute loss/accuracy for training data.
+            # TODO: Uncomment this once JIRA MADLIB-1332 is merged to master
+            # compute_loss_and_metrics(
+            #     schema_madlib, source_table, dependent_varname,
+            #     independent_varname, compile_params_to_pass, model_arch,
+            #     model_state, gpus_per_host, segments_per_host, seg_ids_val,
+            #     images_per_seg_val, gp_segment_id_col,
+            #     training_metrics, training_loss,
+            #     i, "Training")
+            metrics_iters.append(i)
+            if validation_set_provided:
+                # Compute loss/accuracy for validation data.
+                compute_loss_and_metrics(
+                    schema_madlib, validation_table, dependent_varname,
+                    independent_varname, compile_params_to_pass, model_arch,
+                    model_state, gpus_per_host, segments_per_host, seg_ids_val,
+                    images_per_seg_val, gp_segment_id_col,
+                    validation_metrics, validation_loss,
+                    i, "Validation")
+        training_loss.append(avg_loss)
+        training_metrics.append(avg_metric)
 
     end_training_time = datetime.datetime.now()
 
-    final_validation_acc = None
-    if validation_aggregate_accuracy and len(validation_aggregate_accuracy) > 
0:
-        final_validation_acc = validation_aggregate_accuracy[-1]
-
-    final_validation_loss = None
-    if validation_aggregate_loss and len(validation_aggregate_loss) > 0:
-        final_validation_loss = validation_aggregate_loss[-1]
     version = madlib_version(schema_madlib)
-    class_values, class_values_type = get_col_value_and_type(
-        fit_validator.source_summary_table, CLASS_VALUES_COLNAME)
-    norm_const, norm_const_type = get_col_value_and_type(
-        fit_validator.source_summary_table, NORMALIZING_CONST_COLNAME)
-    dep_vartype = plpy.execute("SELECT {0} AS dep FROM {1}".format(
-        DEPENDENT_VARTYPE_COLNAME, 
fit_validator.source_summary_table))[0]['dep']
-    dependent_varname_in_source_table = quote_ident(plpy.execute("SELECT {0} 
FROM {1}".format(
-        'dependent_varname', 
fit_validator.source_summary_table))[0]['dependent_varname'])
-    independent_varname_in_source_table = quote_ident(plpy.execute("SELECT {0} 
FROM {1}".format(
-        'independent_varname', 
fit_validator.source_summary_table))[0]['independent_varname'])
+    src_summary_dict = get_source_summary_table_dict(fit_validator)
+    class_values = src_summary_dict['class_values']
+    class_values_type = src_summary_dict['class_values_type']
+    norm_const = src_summary_dict['norm_const']
+    norm_const_type = src_summary_dict['norm_const_type']
+    dep_vartype = src_summary_dict['dep_vartype']
+    dependent_varname_in_source_table = 
src_summary_dict['dependent_varname_in_source_table']
+    independent_varname_in_source_table = 
src_summary_dict['independent_varname_in_source_table']
+    # Define some constants to be inserted into the summary table.
+    model_type = "madlib_keras"
+    model_size = sys.getsizeof(model)
+    metrics_iters = metrics_iters if metrics_iters else 'NULL'
+    # We always compute the training loss and metrics, at least once.
+    training_metrics_final = training_metrics[-1]
+    training_loss_final = training_loss[-1]
+    training_metrics = training_metrics if training_metrics else 'NULL'
+    training_loss = training_loss if training_loss else 'NULL'
+    # Validation loss and metrics are computed only if validation_table
+    # is provided.
+    if validation_set_provided:
+        validation_metrics_final = validation_metrics[-1]
+        validation_loss_final = validation_loss[-1]
+        validation_metrics = 'ARRAY{0}'.format(validation_metrics)
+        validation_loss = 'ARRAY{0}'.format(validation_loss)
+        # Must quote the string before inserting to table. Explicitly
+        # quoting it here since this can also take a NULL value, done
+        # in the else part.
+        validation_table = "$MAD${0}$MAD$".format(validation_table)
+    else:
+        validation_metrics = validation_loss = 'NULL'
+        validation_metrics_final = validation_loss_final = 'NULL'
+        validation_table = 'NULL'
+
     create_output_summary_table = plpy.prepare("""
-        CREATE TABLE {0}_summary AS
+        CREATE TABLE {output_summary_model_table} AS
         SELECT
-        $1 AS model_arch_table,
-        $2 AS model_arch_id,
-        $3 AS model_type,
-        $4 AS start_training_time,
-        $5 AS end_training_time,
-        $6 AS source_table,
-        $7 AS validation_table,
-        $8 AS model,
-        $9 AS dependent_varname,
-        $10 AS independent_varname,
-        $11 AS name,
-        $12 AS description,
-        $13 AS model_size,
-        $14 AS madlib_version,
-        $15 AS compile_params,
-        $16 AS fit_params,
-        $17 AS num_iterations,
-        $18 AS num_classes,
-        $19 AS accuracy,
-        $20 AS loss,
-        $21 AS accuracy_iter,
-        $22 AS loss_iter,
-        $23 AS time_iter,
-        $24 AS accuracy_validation,
-        $25 AS loss_validation,
-        $26 AS accuracy_iter_validation,
-        $27 AS loss_iter_validation,
-        $28 AS {1},
-        $29 AS {2},
-        $30 AS {3}
-        """.format(model, CLASS_VALUES_COLNAME, DEPENDENT_VARTYPE_COLNAME,
-                   NORMALIZING_CONST_COLNAME),
-                   ["TEXT", "INTEGER", "TEXT", "TIMESTAMP",
-                    "TIMESTAMP", "TEXT", "TEXT","TEXT",
-                    "TEXT", "TEXT", "TEXT", "TEXT", "INTEGER",
-                    "TEXT", "TEXT", "TEXT", "INTEGER",
-                    "INTEGER", "DOUBLE PRECISION",
-                    "DOUBLE PRECISION", "DOUBLE PRECISION[]",
-                    "DOUBLE PRECISION[]", "TIMESTAMP[]",
-                    "DOUBLE PRECISION", "DOUBLE PRECISION",
-                    "DOUBLE PRECISION[]", "DOUBLE PRECISION[]",
-                    class_values_type, "TEXT", norm_const_type])
-    plpy.execute(
-        create_output_summary_table,
-        [
-            model_arch_table, model_arch_id,
-            "madlib_keras",
-            start_training_time, end_training_time,
-            source_table, validation_table,
-            model, dependent_varname_in_source_table,
-            independent_varname_in_source_table, name, description,
-            sys.getsizeof(model), version, compile_params,
-            fit_params, num_iterations, num_classes,
-            aggregate_accuracy[-1],
-            aggregate_loss[-1],
-            aggregate_accuracy, aggregate_loss,
-            aggregate_runtime, final_validation_acc,
-            final_validation_loss,
-            validation_aggregate_accuracy,
-            validation_aggregate_loss,
-            class_values,
-            dep_vartype,
-            norm_const
-        ]
-        )
+            $MAD${source_table}$MAD$::TEXT AS source_table,
+            $MAD${model}$MAD$::TEXT AS model,
+            $MAD${dependent_varname_in_source_table}$MAD$::TEXT AS 
dependent_varname,
+            $MAD${independent_varname_in_source_table}$MAD$::TEXT AS 
independent_varname,
+            $MAD${model_arch_table}$MAD$::TEXT AS model_arch_table,
+            {model_arch_id} AS model_arch_id,
+            $1 AS compile_params,
+            $2 AS fit_params,
+            {num_iterations} AS num_iterations,
+            {validation_table}::TEXT AS validation_table,
+            {metrics_compute_frequency} AS metrics_compute_frequency,
+            $3 AS name,
+            $4 AS description,
+            '{model_type}'::TEXT AS model_type,
+            {model_size} AS model_size,
+            '{start_training_time}'::TIMESTAMP AS start_training_time,
+            '{end_training_time}'::TIMESTAMP AS end_training_time,
+            $5 AS time_iter,
+            '{version}'::TEXT AS madlib_version,
+            {num_classes} AS num_classes,
+            $6 AS {class_values_colname},
+            '{dep_vartype}' AS {dependent_vartype_colname},
+            {norm_const} AS {normalizing_const_colname},
+            {training_metrics_final} AS training_metrics_final,
+            {training_loss_final} AS training_loss_final,
+            ARRAY{training_metrics}::DOUBLE PRECISION[] AS training_metrics,
+            ARRAY{training_loss}::DOUBLE PRECISION[] AS training_loss,
+            {validation_metrics_final} AS validation_metrics_final,
+            {validation_loss_final} AS validation_loss_final,
+            {validation_metrics}::DOUBLE PRECISION[] AS validation_metrics,
+            {validation_loss}::DOUBLE PRECISION[] AS validation_loss,
+            ARRAY{metrics_iters}::INTEGER[] AS metrics_iters
+        
""".format(output_summary_model_table=fit_validator.output_summary_model_table,
+                   class_values_colname=CLASS_VALUES_COLNAME,
+                   dependent_vartype_colname=DEPENDENT_VARTYPE_COLNAME,
+                   normalizing_const_colname=NORMALIZING_CONST_COLNAME,
+                   **locals()),
+                   ["TEXT", "TEXT", "TEXT","TEXT", "TIMESTAMP[]",
+                    class_values_type])
+    plpy.execute(create_output_summary_table,
+                 [compile_params, fit_params, name,
+                  description, aggregate_runtime, class_values])
 
     create_output_table = plpy.prepare("""
         CREATE TABLE {0} AS
@@ -310,6 +331,61 @@ def fit(schema_madlib, source_table, 
model,model_arch_table,
     #TODO add a unit test for this in a future PR
     reset_cuda_env(original_cuda_env)
 
+def compute_loss_and_metrics(schema_madlib, table, dependent_varname,
+                             independent_varname, compile_params, model_arch,
+                             model_state, gpus_per_host, segments_per_host,
+                             seg_ids_val, rows_per_seg_val,
+                             gp_segment_id_col, metrics_list, loss_list,
+                             curr_iter, dataset_name):
+    """
+    Compute the loss and metric using a given model (model_state) on the
+    given dataset (table.)
+    """
+    start_val = time.time()
+    evaluate_result = get_loss_acc_from_keras_eval(schema_madlib,
+                                                   table,
+                                                   dependent_varname,
+                                                   independent_varname,
+                                                   compile_params,
+                                                   model_arch, model_state,
+                                                   gpus_per_host,
+                                                   segments_per_host,
+                                                   seg_ids_val,
+                                                   rows_per_seg_val,
+                                                   gp_segment_id_col)
+    end_val = time.time()
+    plpy.info("Time for evaluation in iteration {0}: {1} sec.". format(
+        curr_iter, end_val - start_val))
+    if len(evaluate_result) < 2:
+        plpy.error('Calling evaluate on table {0} returned < 2 '
+                   'metrics. Expected both loss and a metric.'.format(
+            table))
+    loss = evaluate_result[0]
+    metric = evaluate_result[1]
+    plpy.info("{0} set metric after iteration {1}: {2}.".
+              format(dataset_name, curr_iter, metric))
+    plpy.info("{0} set loss after iteration {1}: {2}.".
+              format(dataset_name, curr_iter, loss))
+    metrics_list.append(metric)
+    loss_list.append(loss)
+
+def should_compute_metrics_this_iter(curr_iter, metrics_compute_frequency,
+                                     num_iterations):
+    """
+    Check if we want to compute loss/accuracy for the current iteration
+    :param curr_iter:
+    :param metrics_compute_frequency:
+    :param num_iterations:
+    :return: Returns a boolean
+            return TRUE, if it is the last iteration, or if 
metrics_compute_frequency
+            iterations have elapsed since the last time it was computed.
+            return FALSE otherwise.
+    """
+    # Compute loss/accuracy every metrics_compute_frequency'th iteration,
+    # and also for the last iteration.
+    return (curr_iter)%metrics_compute_frequency == 0 or \
+           curr_iter == num_iterations
+
 def get_images_per_seg(source_table, dependent_varname):
     """
     Compute total images in each segment, by querying source_table.  For
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index 2d7170b..77222a9 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -38,6 +38,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     num_iterations          INTEGER,
     gpus_per_host           INTEGER,
     validation_table        VARCHAR,
+    metrics_compute_frequency  INTEGER,
     name                    VARCHAR,
     description             VARCHAR
 ) RETURNS VOID AS $$
@@ -47,6 +48,39 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
+    source_table            VARCHAR,
+    model                   VARCHAR,
+    model_arch_table        VARCHAR,
+    model_arch_id           INTEGER,
+    compile_params          VARCHAR,
+    fit_params              VARCHAR,
+    num_iterations          INTEGER,
+    gpus_per_host           INTEGER,
+    validation_table        VARCHAR,
+    metrics_compute_frequency  INTEGER,
+    name                    VARCHAR
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
$10, $11, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
+    source_table            VARCHAR,
+    model                   VARCHAR,
+    model_arch_table        VARCHAR,
+    model_arch_id           INTEGER,
+    compile_params          VARCHAR,
+    fit_params              VARCHAR,
+    num_iterations          INTEGER,
+    gpus_per_host           INTEGER,
+    validation_table        VARCHAR,
+    metrics_compute_frequency  INTEGER
+) RETURNS VOID AS $$
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
$10, NULL, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
+
 
 CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     source_table            VARCHAR,
@@ -59,7 +93,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     gpus_per_host           INTEGER,
     validation_table        VARCHAR
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
NULL, NULL);
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, $9, 
NULL, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
@@ -73,7 +107,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     num_iterations          INTEGER,
     gpus_per_host           INTEGER
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, 
NULL, NULL, NULL);
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, $8, 
NULL, NULL, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
@@ -86,7 +120,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_fit(
     fit_params              VARCHAR,
     num_iterations          INTEGER
 ) RETURNS VOID AS $$
-    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, 0, NULL, 
NULL, NULL);
+    SELECT MADLIB_SCHEMA.madlib_keras_fit($1, $2, $3, $4, $5, $6, $7, 0, NULL, 
NULL, NULL, NULL);
 $$ LANGUAGE sql VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA');
 
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
index d66219c..9210550 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
@@ -34,6 +34,7 @@ from utilities.utilities import is_var_valid
 from utilities.utilities import is_valid_psql_type
 from utilities.utilities import NUMERIC
 from utilities.utilities import ONLY_ARRAY
+from utilities.validate_args import cols_in_tbl_valid
 from utilities.validate_args import columns_exist_in_table
 from utilities.validate_args import get_expr_type
 from utilities.validate_args import input_tbl_valid
@@ -157,13 +158,14 @@ class PredictInputValidator:
 class FitInputValidator:
     def __init__(self, source_table, validation_table, output_model_table,
                  model_arch_table, dependent_varname, independent_varname,
-                 num_iterations):
+                 num_iterations, metrics_compute_frequency):
         self.source_table = source_table
         self.validation_table = validation_table
         self.output_model_table = output_model_table
         self.model_arch_table = model_arch_table
         self.dependent_varname = dependent_varname
         self.independent_varname = independent_varname
+        self.metrics_compute_frequency = metrics_compute_frequency
         self.num_iterations = num_iterations
         self.source_summary_table = None
         if self.source_table:
@@ -172,36 +174,42 @@ class FitInputValidator:
         if self.output_model_table:
             self.output_summary_model_table = add_postfix(
                 self.output_model_table, "_summary")
-        self.module_name = 'model_keras'
+        self.module_name = 'madlib_keras_fit'
         self._validate_input_args()
 
     def _validate_input_table(self, table):
         _assert(is_var_valid(table, self.independent_varname),
-                "model_keras error: invalid independent_varname "
+                "{module_name}: invalid independent_varname "
                 "('{independent_varname}') for table "
                 "({table}).".format(
+                    module_name=self.module_name,
                     independent_varname=self.independent_varname,
                     table=table))
 
         _assert(is_var_valid(table, self.dependent_varname),
-                "model_keras error: invalid dependent_varname "
+                "{module_name}: invalid dependent_varname "
                 "('{dependent_varname}') for table "
                 "({table}).".format(
+                    module_name=self.module_name,
                     dependent_varname=self.dependent_varname,
                     table=table))
 
+    def _is_valid_metrics_compute_frequency(self):
+        return self.metrics_compute_frequency is None or \
+               (self.metrics_compute_frequency >= 1 and \
+               self.metrics_compute_frequency <= self.num_iterations)
+
     def _validate_input_args(self):
         _assert(self.num_iterations > 0,
-            "model_keras error: Number of iterations cannot be < 1.")
+            "{0}: Number of iterations cannot be < 
1.".format(self.module_name))
+        _assert(self._is_valid_metrics_compute_frequency(),
+            "{0}: metrics_compute_frequency must be in the range (1 - 
{1}).".format(
+                self.module_name, self.num_iterations))
         input_tbl_valid(self.source_table, self.module_name)
         input_tbl_valid(self.source_summary_table, self.module_name)
-        _assert(is_var_valid(
-            self.source_summary_table, CLASS_VALUES_COLNAME),
-                "model_keras error: invalid class_values varname "
-                "('{class_values}') for source_summary_table "
-                "({source_summary_table}).".format(
-                    class_values=CLASS_VALUES_COLNAME,
-                    source_summary_table=self.source_summary_table))
+        cols_in_tbl_valid(self.source_summary_table, [CLASS_VALUES_COLNAME,
+            NORMALIZING_CONST_COLNAME, DEPENDENT_VARTYPE_COLNAME,
+            'dependent_varname', 'independent_varname'], self.module_name)
         # Source table and validation tables must have the same schema
         self._validate_input_table(self.source_table)
         validate_dependent_var_for_minibatch(self.source_table,
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index bc0789b..8567637 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -137,15 +137,15 @@ SELECT assert(
         num_iterations = 3 AND
         num_classes = 2 AND
         class_values = '{0,1}' AND
-        accuracy >= 0  AND
-        loss  >= 0  AND
-        array_upper(accuracy_iter, 1) = 3 AND
-        array_upper(loss_iter, 1) = 3 AND
+        training_metrics_final >= 0  AND
+        training_loss_final  >= 0  AND
+        array_upper(training_metrics, 1) = 3 AND
+        array_upper(training_loss, 1) = 3 AND
         array_upper(time_iter, 1) = 3 AND
-        accuracy_validation >= 0 AND
-        loss_validation  >= 0  AND
-        array_upper(accuracy_iter_validation, 1) = 3 AND
-        array_upper(loss_iter_validation, 1) = 3 ,
+        validation_metrics_final >= 0 AND
+        validation_loss_final  >= 0  AND
+        array_upper(validation_metrics, 1) = 1 AND
+        array_upper(validation_loss, 1) = 1 ,
         'Keras model output Summary Validation failed. Actual:' || 
__to_char(summary))
 FROM (SELECT * FROM keras_saved_out_summary) summary;
 
@@ -200,37 +200,42 @@ SELECT madlib_keras_fit(
     2,
     NULL,
     NULL,
+    NULL,
     'model name', 'model desc');
+\x on;
+select * from keras_out_summary;
 SELECT assert(
-    model_arch_table = 'model_arch' AND
-    model_arch_id = 1 AND
-    model_type = 'madlib_keras' AND
-    start_training_time         < now() AND
-    end_training_time > start_training_time AND
     source_table = 'cifar_10_sample_batched' AND
-    validation_table is NULL AND
     model = 'keras_out' AND
     dependent_varname = 'y' AND
-    dependent_vartype = 'smallint' AND
     independent_varname = 'x' AND
+    model_arch_table = 'model_arch' AND
+    model_arch_id = 1 AND
+    compile_params = $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text AND
+    fit_params = $$ batch_size=2, epochs=1, verbose=0 $$::text AND
+    num_iterations = 2 AND
+    validation_table is NULL AND
+    metrics_compute_frequency = 2 AND
     name = 'model name' AND
     description = 'model desc' AND
+    model_type = 'madlib_keras' AND
     model_size > 0 AND
+    start_training_time         < now() AND
+    end_training_time > start_training_time AND
+    array_upper(time_iter, 1) = 2 AND
+    dependent_vartype = 'smallint' AND
     madlib_version is NOT NULL AND
-    compile_params = $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text AND
-    fit_params = $$ batch_size=2, epochs=1, verbose=0 $$::text AND
-    num_iterations = 2 AND
     num_classes = 2 AND
     class_values = '{0,1}' AND
-    accuracy is not NULL AND
-    loss is not NULL AND
-    array_upper(accuracy_iter, 1) = 2 AND
-    array_upper(loss_iter, 1) = 2 AND
-    array_upper(time_iter, 1) = 2 AND
-    accuracy_validation is  NULL AND
-    loss_validation is  NULL AND
-    array_upper(accuracy_iter_validation,1) = 0 AND
-    array_upper(loss_iter_validation,1) = 0 ,
+    normalizing_const = 255.0 AND
+    training_metrics_final is not NULL AND
+    training_loss_final is not NULL AND
+    array_upper(training_metrics, 1) = 2 AND
+    array_upper(training_loss, 1) = 2 AND
+    validation_metrics_final is  NULL AND
+    validation_loss_final is  NULL AND
+    array_upper(validation_metrics, 1) = 0 AND
+    array_upper(validation_loss, 1) = 0 ,
     'Keras model output Summary Validation failed. Actual:' || 
__to_char(summary))
 FROM (SELECT * FROM keras_out_summary) summary;
 
@@ -287,6 +292,7 @@ SELECT madlib_keras_fit(
     1,
     NULL,
     NULL,
+    NULL,
     'model name', 'model desc');
 
 DROP TABLE IF EXISTS keras_out, keras_out_summary;
@@ -300,6 +306,7 @@ SELECT madlib_keras_fit(
     1,
     NULL,
     NULL,
+    NULL,
     'model name', 'model desc');
 
 DROP TABLE IF EXISTS keras_out, keras_out_summary;
@@ -314,6 +321,7 @@ SELECT madlib_keras_fit(
     1,
     0,
     NULL,
+    NULL,
     'model name', 'model desc');
 
 DROP TABLE IF EXISTS keras_out, keras_out_summary;
@@ -327,6 +335,7 @@ SELECT madlib_keras_fit(
     1,
     NULL,
     NULL,
+    NULL,
     'model name', 'model desc');
 
 DROP TABLE IF EXISTS keras_out, keras_out_summary;

[madlib] 01/03: DL: Add new param metrics_compute_frequency to madlib_keras_fit()

Reply via email to