This is an automated email from the ASF dual-hosted git repository. njayaram pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit e41cf6e619e969e2c0b733a25fe41c3f6f7bad45 Author: Domino Valdano <[email protected]> AuthorDate: Wed May 15 18:02:51 2019 -0700 DL: Replace old evaluate1 function with madlib_keras_evaluate() This function can be called by a user on a test image set, to run loss and metric evaluation for a particular model. It handles either 0 or 1 metric in compile params and adds a metric_type column to make the output table more readable. Also: - Fixes bug in final function (shouldn't return image count) - Validates if summary test table exists - Get independent_varname and dependent_varname from summary table. Refactor several functions related to evaluate, so that these parameters are only used to represent the original variable names input by the user. In cases where it refers to the fixed strings 'independent_var' and 'dependent_var' (columns of minibatch output table), they have been removed. As with fit, they have also been removed from the interface, so the user no longer has to pass them in. 
- Suppress extra warning output - Adds devcheck tests for evaluate() Closes #395 Co-authored-by: Orhan Kislal <[email protected]> Co-authored-by: Ekta Khanna <[email protected]> --- .../modules/deep_learning/madlib_keras.py_in | 186 ++++++++++++--------- .../modules/deep_learning/madlib_keras.sql_in | 33 ++-- .../deep_learning/madlib_keras_helper.py_in | 1 + .../deep_learning/madlib_keras_validator.py_in | 80 +++++++-- .../modules/deep_learning/test/madlib_keras.sql_in | 63 ++++++- .../test/unit_tests/test_madlib_keras.py_in | 91 +++++----- 6 files changed, 284 insertions(+), 170 deletions(-) diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in index 6393dd8..f164ce7 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in @@ -38,10 +38,13 @@ from keras.regularizers import * import madlib_keras_serializer from madlib_keras_helper import MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL from madlib_keras_helper import MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL +from madlib_keras_helper import DEPENDENT_VARNAME_COLNAME +from madlib_keras_helper import INDEPENDENT_VARNAME_COLNAME from madlib_keras_helper import CLASS_VALUES_COLNAME from madlib_keras_helper import DEPENDENT_VARTYPE_COLNAME from madlib_keras_helper import NORMALIZING_CONST_COLNAME from madlib_keras_validator import FitInputValidator +from madlib_keras_validator import EvaluateInputValidator from madlib_keras_wrapper import * from keras_model_arch_table import ModelArchSchema @@ -55,23 +58,25 @@ from utilities.utilities import madlib_version from utilities.validate_args import get_col_value_and_type from utilities.validate_args import get_expr_type from utilities.validate_args import quote_ident +from utilities.control import MinWarning @MinWarning("warning") -def fit(schema_madlib, source_table, model,model_arch_table, +def fit(schema_madlib, source_table, 
model, model_arch_table, model_arch_id, compile_params, fit_params, num_iterations, - gpus_per_host = 0, validation_table=None, + gpus_per_host=0, validation_table=None, metrics_compute_frequency=None, name="", description="", **kwargs): source_table = quote_ident(source_table) - dependent_varname = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL - independent_varname = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL model_arch_table = quote_ident(model_arch_table) fit_params = "" if not fit_params else fit_params _assert(compile_params, "Compile parameters cannot be empty or NULL.") + mb_dep_var_col = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL + mb_indep_var_col = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL + fit_validator = FitInputValidator( source_table, validation_table, model, model_arch_table, - dependent_varname, independent_varname, + mb_dep_var_col, mb_indep_var_col, num_iterations, metrics_compute_frequency) if metrics_compute_frequency is None: metrics_compute_frequency = num_iterations @@ -80,13 +85,7 @@ def fit(schema_madlib, source_table, model,model_arch_table, metrics_elapsed_start_time = time.time() start_training_time = datetime.datetime.now() - gpus_per_host = 0 if gpus_per_host is None else gpus_per_host - segments_per_host = get_segments_per_host() - - if 0 < gpus_per_host < segments_per_host: - plpy.warning('The number of gpus per host is less than the number of ' - 'segments per host. The support for this case is ' - 'experimental and it may fail.') + segments_per_host, gpus_per_host = get_segments_and_gpus(gpus_per_host) #TODO add a unit test for this in a future PR # save the original value of the env variable so that we can reset it later. 
@@ -123,10 +122,10 @@ def fit(schema_madlib, source_table, model,model_arch_table, gp_segment_id_col = 'gp_segment_id' # Compute total images on each segment - seg_ids_train, images_per_seg_train = get_images_per_seg(source_table, dependent_varname) + seg_ids_train, images_per_seg_train = get_images_per_seg(source_table) if validation_table: - seg_ids_val, images_per_seg_val = get_images_per_seg(validation_table, dependent_varname) + seg_ids_val, images_per_seg_val = get_images_per_seg(validation_table) # Convert model from json and initialize weights master_model = model_from_json(model_arch) @@ -152,8 +151,8 @@ def fit(schema_madlib, source_table, model,model_arch_table, fit_params_to_pass = "$madlib$" + fit_params + "$madlib$" run_training_iteration = plpy.prepare(""" SELECT {schema_madlib}.fit_step( - {dependent_varname}::SMALLINT[], - {independent_varname}::REAL[], + {mb_dep_var_col}::SMALLINT[], + {mb_indep_var_col}::REAL[], $MAD${model_arch}$MAD$::TEXT, {compile_params_to_pass}::TEXT, {fit_params_to_pass}::TEXT, @@ -190,8 +189,7 @@ def fit(schema_madlib, source_table, model,model_arch_table, num_iterations): # Compute loss/accuracy for training data. compute_loss_and_metrics( - schema_madlib, source_table, dependent_varname, - independent_varname, compile_params_to_pass, model_arch, + schema_madlib, source_table, compile_params_to_pass, model_arch, serialized_weights, gpus_per_host, segments_per_host, seg_ids_train, images_per_seg_train, training_metrics, training_loss, i, "Training") @@ -199,11 +197,10 @@ def fit(schema_madlib, source_table, model,model_arch_table, if validation_set_provided: # Compute loss/accuracy for validation data. 
compute_loss_and_metrics( - schema_madlib, validation_table, dependent_varname, - independent_varname, compile_params_to_pass, model_arch, - serialized_weights, gpus_per_host, segments_per_host, seg_ids_val, - images_per_seg_val, validation_metrics, validation_loss, - i, "Validation") + schema_madlib, validation_table, compile_params_to_pass, + model_arch, serialized_weights, gpus_per_host, segments_per_host, + seg_ids_val, images_per_seg_val, validation_metrics, + validation_loss, i, "Validation") metrics_elapsed_end_time = time.time() metrics_elapsed_time.append( metrics_elapsed_end_time-metrics_elapsed_start_time) @@ -217,8 +214,8 @@ def fit(schema_madlib, source_table, model,model_arch_table, norm_const = src_summary_dict['norm_const'] norm_const_type = src_summary_dict['norm_const_type'] dep_vartype = src_summary_dict['dep_vartype'] - dependent_varname_in_source_table = src_summary_dict['dependent_varname_in_source_table'] - independent_varname_in_source_table = src_summary_dict['independent_varname_in_source_table'] + dependent_varname = src_summary_dict['dependent_varname_in_source_table'] + independent_varname = src_summary_dict['independent_varname_in_source_table'] # Define some constants to be inserted into the summary table. 
model_type = "madlib_keras" compile_params_dict = convert_string_of_args_to_dict(compile_params) @@ -252,8 +249,8 @@ def fit(schema_madlib, source_table, model,model_arch_table, SELECT $MAD${source_table}$MAD$::TEXT AS source_table, $MAD${model}$MAD$::TEXT AS model, - $MAD${dependent_varname_in_source_table}$MAD$::TEXT AS dependent_varname, - $MAD${independent_varname_in_source_table}$MAD$::TEXT AS independent_varname, + $MAD${dependent_varname}$MAD$::TEXT AS dependent_varname, + $MAD${independent_varname}$MAD$::TEXT AS independent_varname, $MAD${model_arch_table}$MAD$::TEXT AS model_arch_table, {model_arch_id}::INTEGER AS model_arch_id, $1 AS compile_params, @@ -288,8 +285,7 @@ def fit(schema_madlib, source_table, model,model_arch_table, dependent_vartype_colname=DEPENDENT_VARTYPE_COLNAME, normalizing_const_colname=NORMALIZING_CONST_COLNAME, **locals()), - ["TEXT", "TEXT", "TEXT","TEXT", "DOUBLE PRECISION[]", - class_values_type]) + ["TEXT", "TEXT", "TEXT", "TEXT", "DOUBLE PRECISION[]", class_values_type]) plpy.execute(create_output_summary_table, [compile_params, fit_params, name, description, metrics_elapsed_time, class_values]) @@ -338,11 +334,9 @@ def get_metrics_sql_string(metrics_list, is_metrics_specified): metrics_final = metrics_all = 'NULL' return metrics_final, metrics_all -def compute_loss_and_metrics(schema_madlib, table, dependent_varname, - independent_varname, compile_params, model_arch, +def compute_loss_and_metrics(schema_madlib, table, compile_params, model_arch, serialized_weights, gpus_per_host, segments_per_host, - seg_ids, images_per_seg_val, - metrics_list, loss_list, + seg_ids, images_per_seg_val, metrics_list, loss_list, curr_iter, dataset_name): """ Compute the loss and metric using a given model (serialized_weights) on the @@ -351,8 +345,6 @@ def compute_loss_and_metrics(schema_madlib, table, dependent_varname, start_val = time.time() evaluate_result = get_loss_metric_from_keras_eval(schema_madlib, table, - dependent_varname, - 
independent_varname, compile_params, model_arch, serialized_weights, @@ -364,9 +356,8 @@ def compute_loss_and_metrics(schema_madlib, table, dependent_varname, plpy.info("Time for evaluation in iteration {0}: {1} sec.". format( curr_iter, end_val - start_val)) if len(evaluate_result) not in [1, 2]: - plpy.error('Calling evaluate on table {0} must return loss ' - 'and at most one metric value.'.format( - table)) + plpy.error('Calling evaluate on table {0} returned < 2 ' + 'metrics. Expected both loss and a metric.'.format(table)) loss = evaluate_result[0] metric = evaluate_result[1] plpy.info("{0} set metric after iteration {1}: {2}.". @@ -393,22 +384,24 @@ def should_compute_metrics_this_iter(curr_iter, metrics_compute_frequency, return (curr_iter)%metrics_compute_frequency == 0 or \ curr_iter == num_iterations -def get_images_per_seg(source_table, dependent_varname): +def get_images_per_seg(source_table): """ Compute total images in each segment, by querying source_table. For postgres, this is just the total number of images in the db. :param source_table: - :param dependent_var: :return: Returns a string and two arrays 1. An array containing all the segment numbers in ascending order 1. An array containing the total images on each of the segments in the segment array. 
""" + + mb_dep_var_col = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL + if is_platform_pg(): res = plpy.execute( """ SELECT SUM(ARRAY_LENGTH({0}, 1)) AS images_per_seg FROM {1} - """.format(dependent_varname, source_table)) + """.format(mb_dep_var_col, source_table)) images_per_seg = [int(res[0]['images_per_seg'])] seg_ids = [0] else: @@ -416,11 +409,11 @@ def get_images_per_seg(source_table, dependent_varname): """ SELECT gp_segment_id, SUM(ARRAY_LENGTH({0}, 1)) AS images_per_seg FROM {1} GROUP BY gp_segment_id - """.format(dependent_varname, source_table)) + """.format(mb_dep_var_col, source_table)) seg_ids = [int(each_segment["gp_segment_id"]) for each_segment in images_per_seg] images_per_seg = [int(each_segment["images_per_seg"]) - for each_segment in images_per_seg] + for each_segment in images_per_seg] return seg_ids, images_per_seg def fit_transition(state, dependent_var, independent_var, model_architecture, @@ -472,12 +465,16 @@ def fit_transition(state, dependent_var, independent_var, model_architecture, total_images = images_per_seg[seg_ids.index(current_seg_id)] if total_images == 0: - plpy.error('Total images is 0 in fit_transition on segment {0}'.format(current_seg_id)) + if is_platform_pg(): + plpy.error('Total images is 0 in fit_transition') + + else: + plpy.error('Total images is 0 in fit_transition on segment {0}'.format(current_seg_id)) # Re-serialize the weights # Update image count, check if we are done if agg_image_count == total_images: - # Once done with all images on a segment, we update weights + # Once done with all images on a segment, we update weights # with the total number of images here instead of the merge function. # The merge function only deals with aggregating them. 
updated_weights = [ total_images * w for w in updated_weights ] @@ -487,7 +484,7 @@ def fit_transition(state, dependent_var, independent_var, model_architecture, clear_keras_session() elif agg_image_count > total_images: plpy.error('Processed {0} images, but there were supposed to be only {1}!' - .format(agg_image_count, total_images)) + .format(agg_image_count, total_images)) new_state = madlib_keras_serializer.serialize_state_with_nd_weights( agg_image_count, updated_weights) @@ -535,46 +532,74 @@ def fit_final(state, **kwargs): return madlib_keras_serializer.serialize_state_with_1d_weights( image_count, weights) -def evaluate1(schema_madlib, model_table, test_table, id_col, model_arch_table, - model_arch_id, dependent_varname, independent_varname, - compile_params, output_table, **kwargs): - # module_name = 'madlib_keras_evaluate' - # input_tbl_valid(test_table, module_name) - # input_tbl_valid(model_arch_table, module_name) - # output_tbl_valid(output_table, module_name) +def get_segments_and_gpus(gpus_per_host): + gpus_per_host = 0 if gpus_per_host is None else gpus_per_host + segments_per_host = get_segments_per_host() - # _validate_input_args(test_table, model_arch_table, output_table) + if 0 < gpus_per_host < segments_per_host: + plpy.warning('The number of gpus per host is less than the number of ' + 'segments per host. 
The support for this case is ' + 'experimental and it may fail.') - model_data_query = "SELECT model_data from {0}".format(model_table) - serialized_weights = plpy.execute(model_data_query)[0]['model_data'] + return segments_per_host, gpus_per_host - model_arch_query = "SELECT model_arch, model_weights FROM {0} " \ - "WHERE id = {1}".format(model_arch_table, model_arch_id) - query_result = plpy.execute(model_arch_query) - if not query_result or len(query_result) == 0: - plpy.error("no model arch found in table {0} with id {1}".format( - model_arch_table, model_arch_id)) - query_result = query_result[0] - model_arch = query_result[ModelArchSchema.MODEL_ARCH] - compile_params = "$madlib$" + compile_params + "$madlib$" +def evaluate(schema_madlib, model_table, test_table, output_table, gpus_per_host, **kwargs): + module_name = 'madlib_keras_evaluate' + input_validator = EvaluateInputValidator(test_table, model_table, output_table, module_name) - loss_metric = get_loss_metric_from_keras_eval( - schema_madlib, test_table, dependent_varname, - independent_varname, compile_params, model_arch, - serialized_weights, False, None) + model_summary_table = input_validator.model_summary_table + test_summary_table = input_validator.test_summary_table + + segments_per_host, gpus_per_host = get_segments_and_gpus(gpus_per_host) + + model_data_query = "SELECT model_data, model_arch from {0}".format(model_table) + res = plpy.execute(model_data_query)[0] + model_data = res['model_data'] + model_arch = res['model_arch'] + + input_shape = get_input_shape(model_arch) + input_validator.validate_input_shape(input_shape) - #TODO remove these infos after adding create table command - plpy.info('len of evaluate result is {}'.format(len(loss_metric))) - plpy.info('evaluate result loss is {}'.format(loss_metric[0])) - plpy.info('evaluate result metric is {}'.format(loss_metric[1])) + compile_params_query = "SELECT compile_params, metrics_type FROM {0}".format(model_summary_table) + res = 
plpy.execute(compile_params_query)[0] + metrics_type = res['metrics_type'] + compile_params = "$madlib$" + res['compile_params'] + "$madlib$" -def get_loss_metric_from_keras_eval(schema_madlib, table, dependent_varname, - independent_varname, compile_params, - model_arch, serialized_weights, gpus_per_host, - segments_per_host, seg_ids, images_per_seg): + seg_ids, images_per_seg = get_images_per_seg(test_table) + + res = plpy.execute(""" + SELECT {dependent_varname_col}, {independent_varname_col} + FROM {test_summary_table} + """.format(dependent_varname_col=DEPENDENT_VARNAME_COLNAME, + independent_varname_col=INDEPENDENT_VARNAME_COLNAME, + test_summary_table=test_summary_table)) + + dependent_varname = res[0][DEPENDENT_VARNAME_COLNAME] + independent_varname = res[0][INDEPENDENT_VARNAME_COLNAME] + + loss, metric =\ + get_loss_metric_from_keras_eval(schema_madlib, test_table, compile_params, model_arch, + model_data, gpus_per_host, segments_per_host, + seg_ids, images_per_seg) + + if not metrics_type: + metrics_type = None + metric = None + + with MinWarning("error"): + create_output_table = plpy.prepare(""" + CREATE TABLE {0} AS + SELECT $1 as loss, $2 as metric, $3 as metrics_type""".format(output_table), ["FLOAT", "FLOAT", "TEXT[]"]) + plpy.execute(create_output_table, [loss, metric, metrics_type]) + +def get_loss_metric_from_keras_eval(schema_madlib, table, compile_params, + model_arch, serialized_weights, gpus_per_host, + segments_per_host, seg_ids, images_per_seg): gp_segment_id_col = '0' if is_platform_pg() else 'gp_segment_id' + mb_dep_var_col = MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL + mb_indep_var_col = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL """ This function will call the internal keras evaluate function to get the loss and accuracy of each tuple which then gets averaged to get the final result. 
@@ -585,8 +610,8 @@ def get_loss_metric_from_keras_eval(schema_madlib, table, dependent_varname, -- SMALLINT to INTEGER, or change the output of minibatch util to produce SMALLINT -- For the first, we should change fit_step also select ({schema_madlib}.internal_keras_evaluate( - {dependent_varname}::SMALLINT[], - {independent_varname}::REAL[], + {mb_dep_var_col}::SMALLINT[], + {mb_indep_var_col}::REAL[], $MAD${model_arch}$MAD$, $1, {compile_params}, @@ -689,5 +714,4 @@ def internal_keras_eval_final(state, **kwargs): loss /= image_count metric /= image_count - state = loss, metric - return state + return loss, metric diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in index f18c63d..6b7b0c0 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in @@ -253,32 +253,25 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_keras_predict( $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_evaluate1( +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.madlib_keras_evaluate( model_table VARCHAR, test_table VARCHAR, - id_col VARCHAR, - model_arch_table VARCHAR, - model_arch_id INTEGER, - dependent_varname VARCHAR, - independent_varname VARCHAR, - compile_params VARCHAR, - output_table VARCHAR + output_table VARCHAR, + gpus_per_host INTEGER ) RETURNS VOID AS $$ - PythonFunctionBodyOnly(`deep_learning', `madlib_keras') - with AOControl(False): - madlib_keras.evaluate1(schema_madlib, - model_table, - test_table, - id_col, - model_arch_table, - model_arch_id, - dependent_varname, - independent_varname, - compile_params, - output_table) + PythonFunction(`deep_learning', `madlib_keras', `evaluate') $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); +CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.madlib_keras_evaluate( + model_table VARCHAR, + test_table VARCHAR, + output_table VARCHAR +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.madlib_keras_evaluate($1, $2, $3, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA'); + CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.internal_keras_eval_transition( state REAL[3], dependent_var SMALLINT[], diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in index c83f2f0..03a8399 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in @@ -72,6 +72,7 @@ INDEPENDENT_VARNAME_COLNAME = "independent_varname" MODEL_ARCH_TABLE_COLNAME = "model_arch_table" MODEL_ARCH_ID_COLNAME = "model_arch_id" MODEL_DATA_COLNAME = "model_data" +METRIC_TYPE_COLNAME = "metrics_type" # Name of independent and dependent colnames in batched table. # These are readonly variables, do not modify. 
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in index 5892308..e344c05 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in @@ -23,10 +23,14 @@ from madlib_keras_helper import CLASS_VALUES_COLNAME from madlib_keras_helper import COMPILE_PARAMS_COLNAME from madlib_keras_helper import DEPENDENT_VARNAME_COLNAME from madlib_keras_helper import DEPENDENT_VARTYPE_COLNAME +from madlib_keras_helper import INDEPENDENT_VARNAME_COLNAME from madlib_keras_helper import MODEL_ARCH_ID_COLNAME from madlib_keras_helper import MODEL_ARCH_TABLE_COLNAME from madlib_keras_helper import MODEL_DATA_COLNAME from madlib_keras_helper import NORMALIZING_CONST_COLNAME +from madlib_keras_helper import MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL +from madlib_keras_helper import MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL +from madlib_keras_helper import METRIC_TYPE_COLNAME from utilities.minibatch_validation import validate_dependent_var_for_minibatch from utilities.utilities import _assert @@ -82,15 +86,13 @@ def _validate_input_shapes(table, independent_varname, input_shape, offset): input_shape, input_shape_from_table, independent_varname, table)) -class PredictInputValidator: - def __init__(self, test_table, model_table, id_col, independent_varname, - output_table, pred_type, module_name): +class InputValidator: + def __init__(self, test_table, model_table, independent_varname, + output_table, module_name): self.test_table = test_table self.model_table = model_table - self.id_col = id_col self.independent_varname = independent_varname self.output_table = output_table - self.pred_type = pred_type if self.model_table: self.model_summary_table = add_postfix( self.model_table, "_summary") @@ -99,14 +101,15 @@ class PredictInputValidator: def _validate_input_args(self): 
input_tbl_valid(self.model_table, self.module_name) - self._validate_model_data_col() + self._validate_model_data_cols() input_tbl_valid(self.model_summary_table, self.module_name) - self._validate_summary_tbl_cols() + self._validate_model_summary_tbl_cols() input_tbl_valid(self.test_table, self.module_name) self._validate_test_tbl_cols() output_tbl_valid(self.output_table, self.module_name) - def _validate_model_data_col(self): + + def _validate_model_data_cols(self): _assert(is_var_valid(self.model_table, MODEL_DATA_COLNAME), "{module_name} error: column '{model_data}' " "does not exist in model table '{table}'.".format( @@ -129,14 +132,7 @@ class PredictInputValidator: independent_varname=self.independent_varname, table=self.test_table)) - _assert(is_var_valid(self.test_table, self.id_col), - "{module_name} error: invalid id column " - "('{id_col}') for test table ({table}).".format( - module_name=self.module_name, - id_col=self.id_col, - table=self.test_table)) - - def _validate_summary_tbl_cols(self): + def _validate_model_summary_tbl_cols(self): cols_to_check_for = [CLASS_VALUES_COLNAME, DEPENDENT_VARNAME_COLNAME, DEPENDENT_VARTYPE_COLNAME, @@ -149,6 +145,49 @@ class PredictInputValidator: "summary table ('{1}'). 
The expected columns are {2}.".format( self.module_name, self.model_summary_table, cols_to_check_for)) +class EvaluateInputValidator(InputValidator): + def __init__(self, test_table, model_table, output_table, module_name): + self.test_summary_table = None + if test_table: + self.test_summary_table = add_postfix(test_table, "_summary") + + self.independent_varname = MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL + InputValidator.__init__(self, test_table, model_table, + self.independent_varname, + output_table, module_name) + + def _validate_input_args(self): + input_tbl_valid(self.test_summary_table, self.module_name) + self._validate_test_summary_tbl_cols() + InputValidator._validate_input_args(self) + validate_dependent_var_for_minibatch(self.test_table, + MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL) + + def _validate_model_summary_tbl_cols(self): + cols_to_check_for = [COMPILE_PARAMS_COLNAME, METRIC_TYPE_COLNAME] + _assert(columns_exist_in_table( + self.model_summary_table, cols_to_check_for), + "{0} error: One or more expected columns missing in model " + "summary table ('{1}'). 
The expected columns are {2}.".format( + self.module_name, self.model_summary_table, cols_to_check_for)) + + def _validate_test_summary_tbl_cols(self): + cols_in_tbl_valid(self.test_summary_table, [CLASS_VALUES_COLNAME, + NORMALIZING_CONST_COLNAME, DEPENDENT_VARTYPE_COLNAME, + DEPENDENT_VARNAME_COLNAME, INDEPENDENT_VARNAME_COLNAME], self.module_name) + + def validate_input_shape(self, input_shape_from_arch): + _validate_input_shapes(self.test_table, self.independent_varname, + input_shape_from_arch, 2) + +class PredictInputValidator(InputValidator): + def __init__(self, test_table, model_table, id_col, independent_varname, + output_table, pred_type, module_name): + self.id_col = id_col + self.pred_type = pred_type + InputValidator.__init__(self, test_table, model_table, independent_varname, + output_table, module_name) + def validate_pred_type(self, class_values): if not self.pred_type in ['prob', 'response']: plpy.error("{0}: Invalid value for pred_type param ({1}). Must be "\ @@ -162,6 +201,15 @@ class PredictInputValidator: _validate_input_shapes(self.test_table, self.independent_varname, input_shape_from_arch, 1) + def _validate_test_tbl_cols(self): + InputValidator._validate_test_tbl_cols(self) + _assert(is_var_valid(self.test_table, self.id_col), + "{module_name} error: invalid id column " + "('{id_col}') for test table ({table}).".format( + module_name=self.module_name, + id_col=self.id_col, + table=self.test_table)) + class FitInputValidator: def __init__(self, source_table, validation_table, output_model_table, model_arch_table, dependent_varname, independent_varname, diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in index 6e6065e..847418f 100644 --- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in +++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in @@ -25,12 +25,7 @@ copy cifar_10_sample from stdin delimiter '|'; 
2|1|'dog'|'0/img2.jpg'|{{{126,118,110},{122,115,108},{126,119,111},{127,119,109},{130,122,111},{130,122,111},{132,124,113},{133,125,114},{130,122,111},{132,124,113},{134,126,115},{131,123,112},{131,123,112},{134,126,115},{133,125,114},{136,128,117},{137,129,118},{137,129,118},{136,128,117},{131,123,112},{130,122,111},{132,124,113},{132,124,113},{132,124,113},{129,122,110},{127,121,109},{127,121,109},{125,119,107},{124,118,106},{124,118,106},{120,114,102},{117,111,99}},{{122,115,107},{119 [...] \. -drop table if exists cifar_10_sample_val; -create table cifar_10_sample_val(independent_var REAL[], dependent_var INTEGER[], buffer_id SMALLINT); -copy cifar_10_sample_val from stdin delimiter '|'; -{{{{0.494118,0.462745,0.431373},{0.478431,0.45098,0.423529},{0.494118,0.466667,0.435294},{0.498039,0.466667,0.427451},{0.509804,0.478431,0.435294},{0.509804,0.478431,0.435294},{0.517647,0.486275,0.443137},{0.521569,0.490196,0.447059},{0.509804,0.478431,0.435294},{0.517647,0.486275,0.443137},{0.52549,0.494118,0.45098},{0.513726,0.482353,0.439216},{0.513726,0.482353,0.439216},{0.52549,0.494118,0.45098},{0.521569,0.490196,0.447059},{0.533333,0.501961,0.458824},{0.537255,0.505882,0.462745},{ [...] -{{{{0.792157,0.8,0.780392},{0.792157,0.8,0.780392},{0.8,0.807843,0.788235},{0.807843,0.815686,0.796079},{0.815686,0.823529,0.803922},{0.819608,0.827451,0.807843},{0.823529,0.831373,0.811765},{0.831373,0.839216,0.823529},{0.835294,0.843137,0.831373},{0.843137,0.85098,0.839216},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.843137},{0.843137,0.85098,0.839216},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.839216},{0.85098,0.858824,0.839216},{0.85098,0.858 [...] -\. + -- normalize the indep variable -- TODO Calling this function makes keras.fit fail with the exception (investigate later) -- NOTICE: Releasing segworker groups to finish aborting the transaction. 
@@ -72,6 +67,17 @@ INSERT INTO cifar_10_sample_batched_summary values ( 1, 255.0); +drop table if exists cifar_10_sample_val; +create table cifar_10_sample_val(independent_var REAL[], dependent_var INTEGER[], buffer_id SMALLINT); +copy cifar_10_sample_val from stdin delimiter '|'; +{{{{0.494118,0.462745,0.431373},{0.478431,0.45098,0.423529},{0.494118,0.466667,0.435294},{0.498039,0.466667,0.427451},{0.509804,0.478431,0.435294},{0.509804,0.478431,0.435294},{0.517647,0.486275,0.443137},{0.521569,0.490196,0.447059},{0.509804,0.478431,0.435294},{0.517647,0.486275,0.443137},{0.52549,0.494118,0.45098},{0.513726,0.482353,0.439216},{0.513726,0.482353,0.439216},{0.52549,0.494118,0.45098},{0.521569,0.490196,0.447059},{0.533333,0.501961,0.458824},{0.537255,0.505882,0.462745},{ [...] +{{{{0.792157,0.8,0.780392},{0.792157,0.8,0.780392},{0.8,0.807843,0.788235},{0.807843,0.815686,0.796079},{0.815686,0.823529,0.803922},{0.819608,0.827451,0.807843},{0.823529,0.831373,0.811765},{0.831373,0.839216,0.823529},{0.835294,0.843137,0.831373},{0.843137,0.85098,0.839216},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.843137},{0.843137,0.85098,0.839216},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.839216},{0.85098,0.858824,0.839216},{0.85098,0.858 [...] +\. + +DROP TABLE IF EXISTS cifar_10_sample_val_summary; +CREATE TABLE cifar_10_sample_val_summary AS + SELECT * FROM cifar_10_sample_batched_summary; + --- NOTE: In order to test fit_merge, we need at least 2 rows in the batched table (1 on each segment). --- ALSO NOTE: As part of supporting Postgres, an issue was reported JIRA MADLIB-1326. --- Once this bug is fixed, we should uncomment these 2 lines, which was used to generate @@ -157,6 +163,29 @@ SELECT assert( model_arch IS NOT NULL, 'Keras model output validation failed. 
Actual:' || __to_char(k)) FROM (SELECT * FROM keras_saved_out) k; +-- Test that evaluate works as expected: +DROP TABLE IF EXISTS evaluate_out; +SELECT madlib_keras_evaluate('keras_saved_out', 'cifar_10_sample_val', 'evaluate_out', 0); + +SELECT assert(loss IS NOT NULL AND + metric IS NOT NULL AND + metrics_type = '{mae}', 'Evaluate output validation failed. Actual:' || __to_char(evaluate_out)) +FROM evaluate_out; + +-- Test that passing NULL / None instead of 0 for gpus_per_host works +DROP TABLE IF EXISTS evaluate_out; +SELECT madlib_keras_evaluate('keras_saved_out', 'cifar_10_sample_val', 'evaluate_out'); +SELECT assert(loss IS NOT NULL AND + metric IS NOT NULL AND + metrics_type = '{mae}', 'Evaluate output validation failed. Actual:' || __to_char(evaluate_out)) +FROM evaluate_out; + +-- Test that evaluate errors out correctly if model_arch field missing from fit output +DROP TABLE IF EXISTS evaluate_out; +ALTER TABLE keras_saved_out DROP COLUMN model_arch; +SELECT assert(trap_error($TRAP$ + SELECT madlib_keras_evaluate('keras_saved_out', 'cifar_10_sample_val', 'evaluate_out'); + $TRAP$) = 1, 'Should error out if model_arch column is missing from model_table'); -- Verify number of iterations for which metrics and loss are computed DROP TABLE IF EXISTS keras_saved_out, keras_saved_out_summary; @@ -276,7 +305,7 @@ FROM (SELECT * FROM keras_out_summary) summary; SELECT assert(model_data IS NOT NULL , 'Keras model output validation failed') FROM (SELECT * FROM keras_out) k; --- Validate metrics=NULL works fine +-- Validate metrics=NULL works with fit DROP TABLE IF EXISTS keras_saved_out, keras_saved_out_summary; SELECT madlib_keras_fit( 'cifar_10_sample_batched', @@ -299,7 +328,16 @@ SELECT assert( 'Keras model output Summary Validation failed. 
Actual:' || __to_char(summary)) FROM (SELECT * FROM keras_saved_out_summary) summary; --- Validate metrics=[] works fine +-- Validate that metrics=NULL works with evaluate +DROP TABLE IF EXISTS evaluate_out; +SELECT madlib_keras_evaluate('keras_saved_out', 'cifar_10_sample_val', 'evaluate_out', 0); + +SELECT assert(loss IS NOT NULL AND + metric IS NULL AND + metrics_type IS NULL, 'Evaluate output validation for NULL metric failed. Actual:' || __to_char(evaluate_out)) +FROM evaluate_out; + +-- Validate metrics=[] works with fit DROP TABLE IF EXISTS keras_saved_out, keras_saved_out_summary; SELECT madlib_keras_fit( 'cifar_10_sample_batched', @@ -322,6 +360,15 @@ SELECT assert( 'Keras model output Summary Validation failed. Actual:' || __to_char(summary)) FROM (SELECT * FROM keras_saved_out_summary) summary; +-- Validate metrics=[] works with evaluate +DROP TABLE IF EXISTS evaluate_out; +SELECT madlib_keras_evaluate('keras_saved_out', 'cifar_10_sample_val', 'evaluate_out', 0); + +SELECT assert(loss IS NOT NULL AND + metric IS NULL AND + metrics_type IS NULL, 'Evaluate output validation for [] metric failed. 
Actual:' || __to_char(evaluate_out)) +FROM evaluate_out; + DROP TABLE IF EXISTS cifar10_predict; SELECT madlib_keras_predict( 'keras_saved_out', diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in index 286798f..20546a1 100644 --- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in @@ -258,7 +258,7 @@ class MadlibKerasFitTestCase(unittest.TestCase): None, self.dependent_var, self.independent_var , self.model.to_json(), self.compile_params, self.fit_params, 0, self.all_seg_ids, total_images_per_seg, 0, 4, previous_state.tostring(), **k) - self.assertIn('0 rows', str(error.exception)) + self.assertIn('Total images is 0', str(error.exception)) def test_fit_transition_too_many_images(self): self.subject.K.set_session = Mock() @@ -819,7 +819,7 @@ class MadlibKerasWrapperTestCase(unittest.TestCase): self.assertIn('invalid optimizer', str(error.exception)) -class MadlibKerasValidatorTestCase(unittest.TestCase): +class MadlibKerasFitInputValidatorTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { @@ -888,6 +888,49 @@ class MadlibKerasValidatorTestCase(unittest.TestCase): 'dep_varname', 'independent_varname', 5, 6) self.assertEqual(False, obj._is_valid_metrics_compute_frequency()) +class PredictInputValidatorTestCases(unittest.TestCase): + def setUp(self): + self.plpy_mock = Mock(spec='error') + patches = { + 'plpy': plpy + } + + self.plpy_mock_execute = MagicMock() + plpy.execute = self.plpy_mock_execute + + self.module_patcher = patch.dict('sys.modules', patches) + self.module_patcher.start() + import madlib_keras_validator + self.module = madlib_keras_validator + self.module.PredictInputValidator._validate_input_args = Mock() + self.subject = self.module.PredictInputValidator( + 
'test_table', 'model_table', 'id_col', 'independent_varname', + 'output_table', 'pred_type', 'module_name') + self.classes = ['train', 'boat', 'car', 'airplane'] + + def tearDown(self): + self.module_patcher.stop() + + def test_validate_pred_type_invalid_pred_type(self): + self.subject.pred_type = 'invalid' + with self.assertRaises(plpy.PLPYException): + self.subject.validate_pred_type(['cat', 'dog']) + + def test_validate_pred_type_valid_pred_type_invalid_num_class_values(self): + self.subject.pred_type = 'prob' + with self.assertRaises(plpy.PLPYException): + self.subject.validate_pred_type(range(1599)) + + def test_validate_pred_type_valid_pred_type_valid_class_values_prob(self): + self.subject.pred_type = 'prob' + self.subject.validate_pred_type(range(1598)) + self.subject.validate_pred_type(None) + + def test_validate_pred_type_valid_pred_type_valid_class_values_response(self): + self.subject.pred_type = 'response' + self.subject.validate_pred_type(range(1598)) + self.subject.validate_pred_type(None) + class MadlibSerializerTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') @@ -967,49 +1010,6 @@ class MadlibSerializerTestCase(unittest.TestCase): self.assertEqual(np.array([0,1,3,4,5], dtype=np.float32).tostring(), res) -class PredictInputPredTypeValidationTestCase(unittest.TestCase): - def setUp(self): - self.plpy_mock = Mock(spec='error') - patches = { - 'plpy': plpy - } - - self.plpy_mock_execute = MagicMock() - plpy.execute = self.plpy_mock_execute - - self.module_patcher = patch.dict('sys.modules', patches) - self.module_patcher.start() - import madlib_keras_validator - self.module = madlib_keras_validator - self.module.PredictInputValidator._validate_input_args = Mock() - self.subject = self.module.PredictInputValidator( - 'test_table', 'model_table', 'id_col', 'independent_varname', - 'output_table', 'pred_type', 'module_name') - self.classes = ['train', 'boat', 'car', 'airplane'] - - def tearDown(self): - 
self.module_patcher.stop() - - def test_validate_pred_type_invalid_pred_type(self): - self.subject.pred_type = 'invalid' - with self.assertRaises(plpy.PLPYException): - self.subject.validate_pred_type(['cat', 'dog']) - - def test_validate_pred_type_valid_pred_type_invalid_num_class_values(self): - self.subject.pred_type = 'prob' - with self.assertRaises(plpy.PLPYException): - self.subject.validate_pred_type(range(1599)) - - def test_validate_pred_type_valid_pred_type_valid_class_values_prob(self): - self.subject.pred_type = 'prob' - self.subject.validate_pred_type(range(1598)) - self.subject.validate_pred_type(None) - - def test_validate_pred_type_valid_pred_type_valid_class_values_response(self): - self.subject.pred_type = 'response' - self.subject.validate_pred_type(range(1598)) - self.subject.validate_pred_type(None) - class MadlibKerasHelperTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') @@ -1255,6 +1255,7 @@ class MadlibKerasEvaluationTestCase(unittest.TestCase): input_state = [image_count*self.loss, image_count*self.accuracy, image_count] output_state = self.subject.internal_keras_eval_final(input_state) + self.assertEqual(len(output_state), 2) agg_loss = output_state[0] agg_accuracy = output_state[1]
