This is an automated email from the ASF dual-hosted git repository.

njayaram pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new ff7dd5a  Deep Learning: Add support for one-hot encoded dep var
ff7dd5a is described below

commit ff7dd5a5bed581ba1b97a5eff0704d6d10c06497
Author: Ekta Khanna <[email protected]>
AuthorDate: Fri Mar 22 17:23:19 2019 -0700

    Deep Learning: Add support for one-hot encoded dep var
    
    JIRA: MADLIB-1313
    The current madlib_keras_fit function for DL assumes the dependent variable is
not
    one-hot encoded. But fit uses data obtained after running
    minibatch_preprocessor_dl that returns a one-hot encoded array for each
dependent
    var (https://issues.apache.org/jira/browse/MADLIB-1303).
    
    This commit updates madlib_keras_fit to expect a one-hot encoded dependent
var to
    train the model. We also create a new column named class_values in the model
    summary table that is passed along from the input summary table (which is
    mini-batched).  Predict then uses class_values to find the exact class label
    for test data. class_values can be NULL in the case where the user had already one-hot
    encoded the data before calling minibatch_preprocessor_dl. In that case, we
    return the index of the expected class label.
    
    Closes #360
    
    Co-authored-by: Nandish Jayaram <[email protected]>
---
 src/ports/postgres/modules/convex/mlp_igd.py_in    |  11 +-
 .../modules/deep_learning/madlib_keras.py_in       | 134 +++++---------------
 .../modules/deep_learning/madlib_keras.sql_in      |   6 +-
 .../deep_learning/madlib_keras_helper.py_in        | 129 +++++++++++++++++--
 .../deep_learning/madlib_keras_predict.py_in       |  62 ++++++++--
 .../modules/deep_learning/test/madlib_keras.sql_in | 136 ++++++++++++---------
 .../test/unit_tests/test_madlib_keras.py_in        |  23 +---
 .../test/unit_tests/test_madlib_keras_helper.py_in |  54 ++++++--
 .../modules/utilities/minibatch_validation.py_in   |  13 +-
 9 files changed, 348 insertions(+), 220 deletions(-)

diff --git a/src/ports/postgres/modules/convex/mlp_igd.py_in 
b/src/ports/postgres/modules/convex/mlp_igd.py_in
index 0126d31..3d93a9a 100644
--- a/src/ports/postgres/modules/convex/mlp_igd.py_in
+++ b/src/ports/postgres/modules/convex/mlp_igd.py_in
@@ -58,7 +58,7 @@ from utilities.validate_args import is_var_valid
 from utilities.validate_args import output_tbl_valid
 from utilities.validate_args import table_exists
 from utilities.validate_args import quote_ident
-from utilities.minibatch_validation import is_var_one_hot_encoded_for_minibatch
+from utilities.minibatch_validation import validate_dependent_var_for_minibatch
 
 
 @MinWarning("error")
@@ -693,14 +693,9 @@ def _validate_dependent_var(source_table, 
dependent_varname,
     classification_types = INTEGER | BOOLEAN | TEXT
 
     if is_minibatch_enabled:
-        # The dependent variable is always a double precision array in
-        # preprocessed data (so check for numeric types)
-        # strip out '[]' from expr_type
-        _assert(is_valid_psql_type(expr_type, NUMERIC | ONLY_ARRAY),
-                "Dependent variable column should be a numeric array.")
-
         if is_classification:
-            
is_var_one_hot_encoded_for_minibatch(source_table,dependent_varname)
+            validate_dependent_var_for_minibatch(
+                source_table, dependent_varname, expr_type)
     else:
         if is_classification:
             _assert((is_valid_psql_type(expr_type, NUMERIC | ONLY_ARRAY)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 32cd921..1c5145c 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -31,102 +31,22 @@ from keras.models import *
 from keras.optimizers import *
 from keras.regularizers import *
 
-from madlib_keras_helper import KerasWeightsSerializer
+from madlib_keras_helper import CLASS_VALUES_COLNAME
+from madlib_keras_helper import FitInputValidator
+from madlib_keras_helper import get_class_values_and_type
 from madlib_keras_helper import get_data_as_np_array
 from madlib_keras_wrapper import *
 
 from utilities.model_arch_info import get_input_shape
-from utilities.validate_args import input_tbl_valid
-from utilities.validate_args import output_tbl_valid
-from utilities.utilities import _assert
-from utilities.utilities import add_postfix
-from utilities.utilities import is_var_valid
 from utilities.utilities import madlib_version
 
-def _validate_input_table(source_table, independent_varname,
-                          dependent_varname):
-    _assert(is_var_valid(source_table, independent_varname),
-            "model_keras error: invalid independent_varname "
-            "('{independent_varname}') for source_table "
-            "({source_table})!".format(
-                independent_varname=independent_varname,
-                source_table=source_table))
-
-    _assert(is_var_valid(source_table, dependent_varname),
-            "model_keras error: invalid dependent_varname "
-            "('{dependent_varname}') for source_table "
-            "({source_table})!".format(
-                dependent_varname=dependent_varname, 
source_table=source_table))
-
-def _validate_input_args(
-    source_table, dependent_varname, independent_varname, model_arch_table,
-    validation_table, output_model_table, num_iterations):
-
-    module_name = 'model_keras'
-    _assert(num_iterations > 0,
-        "model_keras error: Number of iterations cannot be < 1.")
-
-    output_summary_model_table = add_postfix(output_model_table, "_summary")
-    input_tbl_valid(source_table, module_name)
-    # Source table and validation tables must have the same schema
-    _validate_input_table(source_table, independent_varname, dependent_varname)
-    if validation_table and validation_table.strip() != '':
-        input_tbl_valid(validation_table, module_name)
-        _validate_input_table(validation_table, independent_varname,
-                              dependent_varname)
-    # Validate model arch table's schema.
-    input_tbl_valid(model_arch_table, module_name)
-    # Validate output tables
-    output_tbl_valid(output_model_table, module_name)
-    output_tbl_valid(output_summary_model_table, module_name)
-
-def _validate_input_shapes(source_table, independent_varname, input_shape):
-    """
-    Validate if the input shape specified in model architecture is the same
-    as the shape of the image specified in the indepedent var of the input
-    table.
-    """
-    # The weird indexing with 'i+2' and 'i' below has two reasons:
-    # 1) The indexing for array_upper() starts from 1, but indexing in the
-    # input_shape list starts from 0.
-    # 2) Input_shape is only the image's dimension, whereas a row of
-    # independent varname in a table contains buffer size as the first
-    # dimension, followed by the image's dimension. So we must ignore
-    # the first dimension from independent varname.
-    array_upper_query = ", ".join("array_upper({0}, {1}) AS n_{2}".format(
-        independent_varname, i+2, i) for i in range(len(input_shape)))
-    query = """
-        SELECT {0}
-        FROM {1}
-        LIMIT 1
-    """.format(array_upper_query, source_table)
-    # This query will fail if an image in independent var does not have the
-    # same number of dimensions as the input_shape.
-    result = plpy.execute(query)[0]
-    _assert(len(result) == len(input_shape),
-        "model_keras error: The number of dimensions ({0}) of each image in" \
-        " model architecture and {1} in {2} ({3}) do not match.".format(
-            len(input_shape), independent_varname, source_table, len(result)))
-    for i in range(len(input_shape)):
-        key_name = "n_{0}".format(i)
-        if result[key_name] != input_shape[i]:
-            # Construct the shape in independent varname to display meaningful
-            # error msg.
-            input_shape_from_table = [result["n_{0}".format(i)]
-                for i in range(len(input_shape))]
-            plpy.error("model_keras error: Input shape {0} in the model" \
-                " architecture does not match the input shape {1} of column" \
-                " {2} in table {3}.".format(
-                    input_shape, input_shape_from_table, independent_varname,
-                    source_table))
-
 def fit(schema_madlib, source_table, model, dependent_varname,
         independent_varname, model_arch_table, model_arch_id, compile_params,
         fit_params, num_iterations, num_classes, use_gpu = True,
         validation_table=None, name="", description="", **kwargs):
-    _validate_input_args(source_table, dependent_varname, independent_varname,
-                         model_arch_table, validation_table,
-                         model, num_iterations)
+    fit_validator = FitInputValidator(
+        source_table, validation_table, model, model_arch_table,
+        dependent_varname, independent_varname, num_iterations)
 
     start_training_time = datetime.datetime.now()
 
@@ -146,10 +66,9 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
     query_result = query_result[0]
     model_arch = query_result['model_arch']
     input_shape = get_input_shape(model_arch)
-    _validate_input_shapes(source_table, independent_varname, input_shape)
+    fit_validator.validate_input_shapes(source_table, input_shape)
     if validation_table:
-        _validate_input_shapes(
-            validation_table, independent_varname, input_shape)
+        fit_validator.validate_input_shapes(validation_table, input_shape)
     model_weights_serialized = query_result['model_weights']
 
     # Convert model from json and initialize weights
@@ -234,13 +153,15 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
         plpy.info("Time for iteration {0}: {1} sec".
                   format(i + 1, end_iteration - start_iteration))
         aggregate_runtime.append(datetime.datetime.now())
-        avg_loss, avg_accuracy, model_state = 
KerasWeightsSerializer.deserialize_iteration_state(iteration_result)
+        avg_loss, avg_accuracy, model_state = \
+            
KerasWeightsSerializer.deserialize_iteration_state(iteration_result)
         plpy.info("Average loss after training iteration {0}: {1}".format(
             i + 1, avg_loss))
         plpy.info("Average accuracy after training iteration {0}: {1}".format(
             i + 1, avg_accuracy))
         if validation_set_provided:
-            _, _, _, updated_weights = 
KerasWeightsSerializer.deserialize_weights(model_state, model_shapes)
+            _, _, _, updated_weights = \
+                KerasWeightsSerializer.deserialize_weights(model_state, 
model_shapes)
             master_model.set_weights(updated_weights)
             (opt_name,final_args,compile_dict) = 
parse_compile_params(compile_params)
             master_model.compile(optimizer=optimizers[opt_name](**final_args),
@@ -270,6 +191,8 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
     if validation_aggregate_loss and len(validation_aggregate_loss) > 0:
         final_validation_loss = validation_aggregate_loss[-1]
     version = madlib_version(schema_madlib)
+    class_values, class_values_type = get_class_values_and_type(
+        fit_validator.source_summary_table)
     create_output_summary_table = plpy.prepare("""
         CREATE TABLE {0}_summary AS
         SELECT
@@ -299,16 +222,19 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
         $24 AS accuracy_validation,
         $25 AS loss_validation,
         $26 AS accuracy_iter_validation,
-        $27 AS loss_iter_validation
-        """.format(model), ["TEXT", "INTEGER", "TEXT", "TIMESTAMP",
-                                 "TIMESTAMP", "TEXT", "TEXT","TEXT",
-                                 "TEXT", "TEXT", "TEXT", "TEXT", "INTEGER",
-                                 "TEXT", "TEXT", "TEXT", "INTEGER",
-                                 "INTEGER", "DOUBLE PRECISION",
-                                 "DOUBLE PRECISION", "DOUBLE PRECISION[]",
-                                 "DOUBLE PRECISION[]", "TIMESTAMP[]",
-                                 "DOUBLE PRECISION", "DOUBLE PRECISION",
-                                 "DOUBLE PRECISION[]", "DOUBLE PRECISION[]"])
+        $27 AS loss_iter_validation,
+        $28 AS {1}
+        """.format(model, CLASS_VALUES_COLNAME),
+                   ["TEXT", "INTEGER", "TEXT", "TIMESTAMP",
+                    "TIMESTAMP", "TEXT", "TEXT","TEXT",
+                    "TEXT", "TEXT", "TEXT", "TEXT", "INTEGER",
+                    "TEXT", "TEXT", "TEXT", "INTEGER",
+                    "INTEGER", "DOUBLE PRECISION",
+                    "DOUBLE PRECISION", "DOUBLE PRECISION[]",
+                    "DOUBLE PRECISION[]", "TIMESTAMP[]",
+                    "DOUBLE PRECISION", "DOUBLE PRECISION",
+                    "DOUBLE PRECISION[]", "DOUBLE PRECISION[]",
+                    class_values_type])
     plpy.execute(
         create_output_summary_table,
         [
@@ -326,7 +252,8 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
             aggregate_runtime, final_validation_acc,
             final_validation_loss,
             validation_aggregate_accuracy,
-            validation_aggregate_loss
+            validation_aggregate_loss,
+            class_values
         ]
         )
 
@@ -394,7 +321,6 @@ def fit_transition(state, ind_var, dep_var, current_seg_id, 
num_classes,
     x_train = np.array(ind_var, dtype='float64').reshape(
         len(ind_var), *input_shape)
     y_train = np.array(dep_var)
-    y_train = keras_utils.to_categorical(y_train, num_classes)
 
     # Fit segment model on data
     start_fit = time.time()
@@ -585,7 +511,7 @@ def internal_keras_evaluate(x_test, y_test, model_arch, 
model_data, input_shape,
     x_test = np.array(x_test).reshape(len(x_test), input_shape[0], 
input_shape[1],
                                       input_shape[2])
     x_test = x_test.astype('float32')
-    y_test = keras_utils.to_categorical(np.array(y_test), 10)
+    y_test = np.array(y_test)
     with K.tf.device(device_name):
         res = model.evaluate(x_test, y_test)
     plpy.info('evaluate result from internal_keras_evaluate is {}'.format(res))
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index 6b2daf7..aebe270 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -199,7 +199,8 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.internal_keras_predict(
    model_architecture TEXT,
    model_data bytea,
    input_shape integer[],
-   compile_params TEXT
+   compile_params TEXT,
+   class_values TEXT[]
 ) RETURNS DOUBLE PRECISION[] AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
     with AOControl(False):
@@ -208,7 +209,8 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.internal_keras_predict(
                model_architecture,
                model_data,
                input_shape,
-               compile_params)
+               compile_params,
+               class_values)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
index a198b7a..bd3963e 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
@@ -24,6 +24,14 @@ import plpy
 # Import needed for get_data_as_np_array()
 from keras import utils as keras_utils
 
+from utilities.minibatch_validation import validate_dependent_var_for_minibatch
+from utilities.utilities import _assert
+from utilities.utilities import add_postfix
+from utilities.utilities import is_var_valid
+from utilities.validate_args import get_expr_type
+from utilities.validate_args import input_tbl_valid
+from utilities.validate_args import output_tbl_valid
+
 #######################################################################
 ########### Helper functions to serialize and deserialize weights #####
 #######################################################################
@@ -174,21 +182,128 @@ def get_data_as_np_array(table_name, y, x, input_shape, 
num_classes):
     indep_len = len(val_data[0][x])
     pixels_per_image = int(input_shape[0] * input_shape[1] * input_shape[2])
     x_validation = np.ndarray((0,indep_len, pixels_per_image))
-    y_validation = np.ndarray((0,indep_len))
+    y_validation = np.ndarray((0,indep_len, num_classes))
     for i in range(len(val_data)):
         x_test = np.asarray((val_data[i][x],))
         x_test = x_test.reshape(1, indep_len, pixels_per_image)
         y_test = np.asarray((val_data[i][y],))
-        y_test = y_test.reshape(1, indep_len)
         x_validation=np.concatenate((x_validation, x_test))
         y_validation=np.concatenate((y_validation, y_test))
     num_test_examples = x_validation.shape[0]
     x_validation = x_validation.reshape(indep_len * num_test_examples, 
*input_shape)
     x_validation = x_validation.astype('float64')
-    y_validation = y_validation.reshape(indep_len * num_test_examples)
-
-    x_validation = x_validation.astype('float64')
-    #x_validation /= 255.0
-    y_validation = keras_utils.to_categorical(y_validation, num_classes)
+    y_validation = y_validation.reshape(indep_len * num_test_examples, 
num_classes)
 
     return x_validation, y_validation
+
+CLASS_VALUES_COLNAME = "class_values"
+class FitInputValidator:
+    def __init__(self, source_table, validation_table, output_model_table,
+                 model_arch_table, dependent_varname, independent_varname,
+                 num_iterations):
+        self.source_table = source_table
+        self.validation_table = validation_table
+        self.output_model_table = output_model_table
+        self.model_arch_table = model_arch_table
+        self.dependent_varname = dependent_varname
+        self.independent_varname = independent_varname
+        self.num_iterations = num_iterations
+        self.source_summary_table = None
+        if self.source_table:
+            self.source_summary_table = add_postfix(
+                self.source_table, "_summary")
+        if self.output_model_table:
+            self.output_summary_model_table = add_postfix(
+                self.output_model_table, "_summary")
+        self.module_name = 'model_keras'
+        self._validate_input_args()
+
+    def _validate_input_table(self, table):
+        _assert(is_var_valid(table, self.independent_varname),
+                "model_keras error: invalid independent_varname "
+                "('{independent_varname}') for table "
+                "({table}).".format(
+                    independent_varname=self.independent_varname,
+                    table=table))
+
+        _assert(is_var_valid(table, self.dependent_varname),
+                "model_keras error: invalid dependent_varname "
+                "('{dependent_varname}') for table "
+                "({table}).".format(
+                    dependent_varname=self.dependent_varname,
+                    table=table))
+
+    def _validate_input_args(self):
+        _assert(self.num_iterations > 0,
+            "model_keras error: Number of iterations cannot be < 1.")
+        input_tbl_valid(self.source_table, self.module_name)
+        input_tbl_valid(self.source_summary_table, self.module_name)
+        _assert(is_var_valid(
+            self.source_summary_table, CLASS_VALUES_COLNAME),
+                "model_keras error: invalid class_values varname "
+                "('{class_values}') for source_summary_table "
+                "({source_summary_table}).".format(
+                    class_values=CLASS_VALUES_COLNAME,
+                    source_summary_table=self.source_summary_table))
+        # Source table and validation tables must have the same schema
+        self._validate_input_table(self.source_table)
+        validate_dependent_var_for_minibatch(self.source_table,
+                                             self.dependent_varname)
+        if self.validation_table and self.validation_table.strip() != '':
+            input_tbl_valid(self.validation_table, self.module_name)
+            self._validate_input_table(self.validation_table)
+            validate_dependent_var_for_minibatch(self.validation_table,
+                                                 self.dependent_varname)
+        # Validate model arch table's schema.
+        input_tbl_valid(self.model_arch_table, self.module_name)
+        # Validate output tables
+        output_tbl_valid(self.output_model_table, self.module_name)
+        output_tbl_valid(self.output_summary_model_table, self.module_name)
+
+    def validate_input_shapes(self, table, input_shape):
+        """
+        Validate if the input shape specified in model architecture is the same
+        as the shape of the image specified in the indepedent var of the input
+        table.
+        """
+        # The weird indexing with 'i+2' and 'i' below has two reasons:
+        # 1) The indexing for array_upper() starts from 1, but indexing in the
+        # input_shape list starts from 0.
+        # 2) Input_shape is only the image's dimension, whereas a row of
+        # independent varname in a table contains buffer size as the first
+        # dimension, followed by the image's dimension. So we must ignore
+        # the first dimension from independent varname.
+        array_upper_query = ", ".join("array_upper({0}, {1}) AS n_{2}".format(
+            self.independent_varname, i+2, i) for i in range(len(input_shape)))
+        query = """
+            SELECT {0}
+            FROM {1}
+            LIMIT 1
+        """.format(array_upper_query, table)
+        # This query will fail if an image in independent var does not have the
+        # same number of dimensions as the input_shape.
+        result = plpy.execute(query)[0]
+        _assert(len(result) == len(input_shape),
+            "model_keras error: The number of dimensions ({0}) of each image" \
+            " in model architecture and {1} in {2} ({3}) do not match.".format(
+                len(input_shape), self.independent_varname, table, 
len(result)))
+        for i in range(len(input_shape)):
+            key_name = "n_{0}".format(i)
+            if result[key_name] != input_shape[i]:
+                # Construct the shape in independent varname to display
+                # meaningful error msg.
+                input_shape_from_table = [result["n_{0}".format(i)]
+                    for i in range(len(input_shape))]
+                plpy.error("model_keras error: Input shape {0} in the model" \
+                    " architecture does not match the input shape {1} of 
column" \
+                    " {2} in table {3}.".format(
+                        input_shape, input_shape_from_table,
+                        self.independent_varname, table))
+
+def get_class_values_and_type(source_summary_table):
+    class_values = plpy.execute("SELECT {0} AS class_values FROM {1}".
+        format(CLASS_VALUES_COLNAME, source_summary_table)
+        )[0]['class_values']
+    class_values_type = get_expr_type(CLASS_VALUES_COLNAME,
+                                      source_summary_table)
+    return class_values, class_values_type
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
index e2ed883..5e4e62b 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
@@ -28,10 +28,13 @@ from keras.optimizers import *
 import numpy as np
 
 from utilities.model_arch_info import get_input_shape
+from utilities.utilities import add_postfix
 from utilities.validate_args import input_tbl_valid
 from utilities.validate_args import output_tbl_valid
 
-from madlib_keras_helper import convert_string_of_args_to_dict
+from madlib_keras_wrapper import convert_string_of_args_to_dict
+from madlib_keras_helper import get_class_values_and_type
+from madlib_keras_helper import KerasWeightsSerializer
 
 def predict(schema_madlib, model_table, test_table, id_col, model_arch_table,
             model_arch_id, independent_varname, compile_params, output_table,
@@ -55,15 +58,23 @@ def predict(schema_madlib, model_table, test_table, id_col, 
model_arch_table,
     model_arch = query_result['model_arch']
     input_shape = get_input_shape(model_arch)
     compile_params = "$madlib$" + compile_params + "$madlib$"
-    predict_query = plpy.prepare("""create table {output_table} as
-        select {id_col}, (madlib.internal_keras_predict({independent_varname},
-                                             $MAD${model_arch}$MAD$,
-                                             $1,ARRAY{input_shape},
-                                             {compile_params}))[1] as 
prediction
+    model_summary_table = add_postfix(model_table, "_summary")
+    class_values, _ = get_class_values_and_type(model_summary_table)
+    predict_query = plpy.prepare("""
+        CREATE TABLE {output_table} AS
+        SELECT {id_col},
+            ({schema_madlib}.internal_keras_predict
+                ({independent_varname},
+                 $MAD${model_arch}$MAD$,
+                 $1,ARRAY{input_shape},
+                 {compile_params},
+                 ARRAY{class_values}::TEXT[])
+            )[1] as prediction
         from {test_table}""".format(**locals()), ["bytea"])
     plpy.execute(predict_query, [model_data])
 
-def internal_keras_predict(x_test, model_arch, model_data, input_shape, 
compile_params):
+def internal_keras_predict(x_test, model_arch, model_data, input_shape,
+                           compile_params, class_values):
     model = model_from_json(model_arch)
     compile_params = convert_string_of_args_to_dict(compile_params)
     device_name = '/cpu:0'
@@ -75,9 +86,40 @@ def internal_keras_predict(x_test, model_arch, model_data, 
input_shape, compile_
     model_shapes = []
     for weight_arr in model.get_weights():
         model_shapes.append(weight_arr.shape)
-    _,_,_, model_weights = deserialize_weights(model_data, model_shapes)
+    _,_,_, model_weights = KerasWeightsSerializer.deserialize_weights(
+        model_data, model_shapes)
     model.set_weights(model_weights)
     x_test = np.array(x_test).reshape(1, *input_shape)
     x_test /= 255
-    res = model.predict_classes(x_test)
-    return res
+    proba_argmax = model.predict_classes(x_test)
+    # proba_argmax is a list with exactly one element in it. That element
+    # refers to the index containing the largest probability value in the
+    # output of Keras' predict function.
+    return _get_class_label(class_values, proba_argmax[0])
+
+def _get_class_label(class_values, class_index):
+    """
+    Returns back the class label associated with the index returned by Keras'
+    predict_classes function. Keras' predict_classes function returns back
+    the index of the 1-hot encoded output that has the highest probability
+    value. We should infer the exact class label corresponding to the index
+    by looking at the class_values list (which is obtained from the
+    class_values column of the model summary table). If class_values is None,
+    we return the index as is.
+    Args:
+        @param class_values: list of class labels.
+        @param class_index: integer representing the index with max
+                            probability value.
+    Returns:
+        scalar. If class_values is None, returns class_index, else returns
+        class_values[class_index].
+    """
+    if class_values:
+        if class_index < len(class_values):
+            return class_values[class_index]
+        else:
+            plpy.error("Invalid class index {0} returned from Keras predict. "\
+                "Index value must be less than {1}".format(
+                    class_index, len(class_values)))
+    else:
+        return class_index
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index fbf6a81..f393db3 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -28,7 +28,7 @@ create table cifar_10_sample(
         );
 copy cifar_10_sample from stdin delimiter '|';
 
1|{{{202,204,199},{202,204,199},{204,206,201},{206,208,203},{208,210,205},{209,211,206},{210,212,207},{212,214,210},{213,215,212},{215,217,214},{216,218,215},{216,218,215},{215,217,214},{216,218,215},{216,218,215},{216,218,214},{217,219,214},{217,219,214},{218,220,215},{218,219,214},{216,217,212},{217,218,213},{218,219,214},{214,215,209},{213,214,207},{212,213,206},{211,212,205},{209,210,203},{208,209,202},{207,208,200},{205,206,199},{203,204,198}},{{206,208,203},{206,208,203},{207,209,2
 [...]
-2|{{{126,118,110},{122,115,108},{126,119,111},{127,119,109},{130,122,111},{130,122,111},{132,124,113},{133,125,114},{130,122,111},{132,124,113},{134,126,115},{131,123,112},{131,123,112},{134,126,115},{133,125,114},{136,128,117},{137,129,118},{137,129,118},{136,128,117},{131,123,112},{130,122,111},{132,124,113},{132,124,113},{132,124,113},{129,122,110},{127,121,109},{127,121,109},{125,119,107},{124,118,106},{124,118,106},{120,114,102},{117,111,99}},{{122,115,107},{119,112,104},{121,114,10
 [...]
+2|{{{126,118,110},{122,115,108},{126,119,111},{127,119,109},{130,122,111},{130,122,111},{132,124,113},{133,125,114},{130,122,111},{132,124,113},{134,126,115},{131,123,112},{131,123,112},{134,126,115},{133,125,114},{136,128,117},{137,129,118},{137,129,118},{136,128,117},{131,123,112},{130,122,111},{132,124,113},{132,124,113},{132,124,113},{129,122,110},{127,121,109},{127,121,109},{125,119,107},{124,118,106},{124,118,106},{120,114,102},{117,111,99}},{{122,115,107},{119,112,104},{121,114,10
 [...]
 \.
 
 DROP TABLE IF EXISTS cifar_10_sample_batched;
@@ -54,7 +54,7 @@ SELECT load_keras_model('model_arch',
        {"class_name": "Dropout", "config": {"rate": 0.25, "noise_shape": null, 
"trainable": true, "seed": null, "name": "dropout_1"}},
        {"class_name": "Flatten", "config": {"trainable": true, "name": 
"flatten_1", "data_format": "channels_last"}},
        {"class_name": "Dense", "config": {"kernel_initializer": {"class_name": 
"VarianceScaling", "config": {"distribution": "uniform", "scale": 1.0, "seed": 
null, "mode": "fan_avg"}}, "name": "dense_1", "kernel_constraint": null, 
"bias_regularizer": null, "bias_constraint": null, "activation": "softmax", 
"trainable": true, "kernel_regularizer": null, "bias_initializer":
-       {"class_name": "Zeros", "config": {}}, "units": 10, "use_bias": true, 
"activity_regularizer": null}
+       {"class_name": "Zeros", "config": {}}, "units": 2, "use_bias": true, 
"activity_regularizer": null}
        }], "backend": "tensorflow"}$$);
 ALTER TABLE model_arch RENAME model_id TO id;
 
@@ -62,18 +62,19 @@ ALTER TABLE model_arch RENAME model_id TO id;
 -- It might break the assertion
 
 DROP TABLE IF EXISTS keras_out, keras_out_summary;
-SELECT madlib_keras_fit('cifar_10_sample_batched',
-              'keras_out',
-              'dependent_var',
-              'independent_var',
-              'model_arch',
-              1,
-              $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
-              $$ batch_size=2, epochs=1, verbose=0 $$::text,
-              3,
-              10,
-              FALSE,
-              'cifar_10_sample_batched');
+SELECT madlib_keras_fit(
+    'cifar_10_sample_batched',
+    'keras_out',
+    'dependent_var',
+    'independent_var',
+    'model_arch',
+    1,
+    $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
+    $$ batch_size=2, epochs=1, verbose=0 $$::text,
+    3,
+    2,
+    FALSE,
+    'cifar_10_sample_batched');
 SELECT assert(
         model_arch_table = 'model_arch' AND
         model_arch_id = 1 AND
@@ -92,7 +93,8 @@ SELECT assert(
         compile_params = $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text AND
         fit_params = $$ batch_size=2, epochs=1, verbose=0 $$::text AND
         num_iterations = 3 AND
-        num_classes = 10 AND
+        num_classes = 2 AND
+        class_values = '{0,1}' AND
         accuracy is not NULL AND
         loss is not NULL AND
         array_upper(accuracy_iter, 1) = 3 AND
@@ -103,57 +105,71 @@ SELECT assert(
         array_upper(accuracy_iter_validation, 1) = 3 AND
         array_upper(loss_iter_validation, 1) = 3 ,
         'Keras model output Summary Validation failed. Actual:' || 
__to_char(summary))
-from (select * from keras_out_summary) summary;
+FROM (SELECT * FROM keras_out_summary) summary;
 
-SELECT assert(model_data is not NULL , 'Keras model output validation failed') 
from (select * from keras_out) k;
+SELECT assert(model_data IS NOT NULL , 'Keras model output validation failed') 
FROM (SELECT * FROM keras_out) k;
 
 
 -- Test for
   -- Non null name and description columns
        -- Null validation table
 DROP TABLE IF EXISTS keras_out, keras_out_summary;
-SELECT madlib_keras_fit('cifar_10_sample_batched',
-                                               'keras_out',
-                                               'dependent_var',
-                                               'independent_var',
-                                               'model_arch',
-                                               1,
-                        $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
-                        $$ batch_size=2, epochs=1, verbose=0 $$::text,
-                                               1,
-                                               10,
-                                               FALSE,
-                                               NULL,
-                                               'model name', 'model desc');
+SELECT madlib_keras_fit(
+    'cifar_10_sample_batched',
+    'keras_out',
+    'dependent_var',
+    'independent_var',
+    'model_arch',
+    1,
+    $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
+    $$ batch_size=2, epochs=1, verbose=0 $$::text,
+    1,
+    2,
+    FALSE,
+    NULL,
+    'model name', 'model desc');
 SELECT assert(
-        model_arch_table = 'model_arch' AND
-        model_arch_id = 1 AND
-        model_type = 'madlib_keras' AND
-        start_training_time         < now() AND
-        end_training_time > start_training_time AND
-        source_table = 'cifar_10_sample_batched' AND
-        validation_table = 'cifar_10_sample_batched' AND
-        model = 'keras_out' AND
-        dependent_varname = 'dependent_var' AND
-        independent_varname = 'independent_var' AND
-        name = 'model name' AND
-        description = 'model desc' AND
-        model_size > 0 AND
-        madlib_version is NOT NULL AND
-        compile_params = $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text AND
-        fit_params = $$ batch_size=2, epochs=1, verbose=0 $$::text AND
-        num_iterations = 1 AND
-        num_classes = 10 AND
-        accuracy is not NULL AND
-        loss is not NULL AND
-        array_upper(accuracy_iter, 1) = 1 AND
-        array_upper(loss_iter, 1) = 1 AND
-        array_upper(time_iter, 1) = 1 AND
-        accuracy_validation is  NULL AND
-        loss_validation is  NULL AND
-        array_upper(accuracy_iter_validation,1) = 0 AND
-        array_upper(loss_iter_validation,1) = 0 ,
-        'Keras model output Summary Validation failed. Actual:' || 
__to_char(summary))
-from (select * from keras_out_summary) summary;
+    model_arch_table = 'model_arch' AND
+    model_arch_id = 1 AND
+    model_type = 'madlib_keras' AND
+    start_training_time         < now() AND
+    end_training_time > start_training_time AND
+    source_table = 'cifar_10_sample_batched' AND
+    validation_table = 'cifar_10_sample_batched' AND
+    model = 'keras_out' AND
+    dependent_varname = 'dependent_var' AND
+    independent_varname = 'independent_var' AND
+    name = 'model name' AND
+    description = 'model desc' AND
+    model_size > 0 AND
+    madlib_version is NOT NULL AND
+    compile_params = $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text AND
+    fit_params = $$ batch_size=2, epochs=1, verbose=0 $$::text AND
+    num_iterations = 1 AND
+    num_classes = 2 AND
+    class_values = '{0,1}' AND
+    accuracy is not NULL AND
+    loss is not NULL AND
+    array_upper(accuracy_iter, 1) = 1 AND
+    array_upper(loss_iter, 1) = 1 AND
+    array_upper(time_iter, 1) = 1 AND
+    accuracy_validation is  NULL AND
+    loss_validation is  NULL AND
+    array_upper(accuracy_iter_validation,1) = 0 AND
+    array_upper(loss_iter_validation,1) = 0 ,
+    'Keras model output Summary Validation failed. Actual:' || 
__to_char(summary))
+FROM (SELECT * FROM keras_out_summary) summary;
+
+SELECT assert(model_data IS NOT NULL , 'Keras model output validation failed') 
FROM (SELECT * FROM keras_out) k;
 
-SELECT assert(model_data is not NULL , 'Keras model output validation failed') 
from (select * from keras_out) k;
+-- Temporary predict test, to be updated as part of another jira
+DROP TABLE IF EXISTS cifar10_predict;
+SELECT madlib_keras_predict(
+    'keras_out',
+    'cifar_10_sample',
+    'id',
+    'model_arch',
+    1,
+    'x',
+    '''optimizer''=SGD(lr=0.01, decay=1e-6, nesterov=True), 
''loss''=''categorical_crossentropy'', ''metrics''=[''accuracy'']'::text,
+    'cifar10_predict');
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index d9613ca..a66a292 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -82,7 +82,7 @@ class MadlibKerasFitTestCase(unittest.TestCase):
 
         k = {'SD': {'buffer_count': buffer_count}}
         new_model_state = self.subject.fit_transition(
-            None, [[0.5]] , [0], 1, 2, self.all_seg_ids, 
self.total_buffers_per_seg,
+            None, [[0.5]] , [[0,1]], 1, 2, self.all_seg_ids, 
self.total_buffers_per_seg,
             self.model.to_json(), self.compile_params, self.fit_params, False,
             previous_state.tostring(), **k)
         buffer_count = np.fromstring(new_model_state, dtype=np.float32)[2]
@@ -113,7 +113,7 @@ class MadlibKerasFitTestCase(unittest.TestCase):
                    'model_shapes': self.model_shapes}}
         k['SD']['segment_model'] = self.model
         new_model_state = self.subject.fit_transition(
-            state.tostring(), [[0.5]] , [0], 1, 2, self.all_seg_ids, 
self.total_buffers_per_seg,
+            state.tostring(), [[0.5]] , [[1,0]], 1, 2, self.all_seg_ids, 
self.total_buffers_per_seg,
             self.model.to_json(), None, self.fit_params, False, 
'dummy_previous_state', **k)
 
         buffer_count = np.fromstring(new_model_state, dtype=np.float32)[2]
@@ -142,7 +142,7 @@ class MadlibKerasFitTestCase(unittest.TestCase):
                    'model_shapes': self.model_shapes}}
         k['SD']['segment_model'] = self.model
         new_model_state = self.subject.fit_transition(
-            state.tostring(), [[0.5]] , [0], 1, 2, self.all_seg_ids, 
self.total_buffers_per_seg,
+            state.tostring(), [[0.5]] , [[0,1]], 1, 2, self.all_seg_ids, 
self.total_buffers_per_seg,
             self.model.to_json(), None, self.fit_params, False, 
'dummy_previous_state', **k)
 
         buffer_count = np.fromstring(new_model_state, dtype=np.float32)[2]
@@ -177,23 +177,6 @@ class MadlibKerasFitTestCase(unittest.TestCase):
             [0,1,2], [3,3,3], 'dummy_model_json', "foo", "bar", False,
             'dummy_prev_state', **k))
 
-    def test_validate_input_shapes_shapes_do_not_match(self):
-        self.plpy_mock_execute.return_value = [{'n_0': 32, 'n_1': 32}]
-        with self.assertRaises(plpy.PLPYException):
-            self.subject._validate_input_shapes('foo', 'bar', [32,32,3])
-
-        self.plpy_mock_execute.return_value = [{'n_0': 3, 'n_1': 32, 'n_2': 
32}]
-        with self.assertRaises(plpy.PLPYException):
-            self.subject._validate_input_shapes('foo', 'bar', [32,32,3])
-
-        self.plpy_mock_execute.return_value = [{'n_0': 3, 'n_1': None, 'n_2': 
None}]
-        with self.assertRaises(plpy.PLPYException):
-            self.subject._validate_input_shapes('foo', 'bar', [3,32])
-
-    def test_validate_input_shapes_shapes_match(self):
-        self.plpy_mock_execute.return_value = [{'n_0': 32, 'n_1': 32, 'n_2': 
3}]
-        self.subject._validate_input_shapes('foo', 'bar', [32,32,3])
-
 if __name__ == '__main__':
     unittest.main()
 # ---------------------------------------------------------------------
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_helper.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_helper.py_in
index b9feef8..61439f1 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_helper.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_helper.py_in
@@ -25,6 +25,8 @@ 
sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath
 
sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))
 
 from keras import utils as keras_utils
+from keras.models import *
+from keras.layers import *
 
 import unittest
 from mock import *
@@ -139,8 +141,8 @@ class MadlibKerasHelperTestCase(unittest.TestCase):
                          res)
 
     def test_get_data_as_np_array_one_image_per_row(self):
-        self.plpy_mock_execute.return_value = [{'x': [[1,2]], 'y': 0},
-                                               {'x': [[5,6]], 'y': 1}]
+        self.plpy_mock_execute.return_value = [{'x': [[1,2]], 'y': [[1, 0, 
0]]},
+                                               {'x': [[5,6]], 'y': [[0, 1, 
0]]}]
         x_res, y_res = self.subject.get_data_as_np_array('foo','y','x', 
[1,1,2],
                                                          3)
         self.assertEqual(np.array([[[[1, 2]]], [[[5, 6]]]]).tolist(),
@@ -149,8 +151,8 @@ class MadlibKerasHelperTestCase(unittest.TestCase):
                          y_res.tolist())
 
     def test_get_data_as_np_array_multiple_images_per_row(self):
-        self.plpy_mock_execute.return_value = [{'x': [[1,2], [3,4]], 'y': 
[0,2]},
-                                               {'x': [[5,6], [7,8]], 'y': 
[1,0]}]
+        self.plpy_mock_execute.return_value = [{'x': [[1,2], [3,4]], 'y': 
[[1,0,0],[0,0,1]]},
+                                               {'x': [[5,6], [7,8]], 'y': 
[[0,1,0],[1,0,0]]}]
         x_res, y_res = self.subject.get_data_as_np_array('foo','y','x', 
[1,1,2],
                                                          3)
         self.assertEqual(np.array([[[[1,2]]], [[[3,4]]],
@@ -161,8 +163,8 @@ class MadlibKerasHelperTestCase(unittest.TestCase):
                          y_res.tolist())
 
     def test_get_data_as_np_array_float_input_shape(self):
-        self.plpy_mock_execute.return_value = [{'x': [[1,2]], 'y': 0},
-                                               {'x': [[5,6]], 'y': 1}]
+        self.plpy_mock_execute.return_value = [{'x': [[1,2]], 'y': [[1, 0, 
0]]},
+                                               {'x': [[5,6]], 'y': [[0, 1, 
0]]}]
         x_res, y_res = self.subject.get_data_as_np_array('foo','y','x',
                                                          [1.5,1.9,2.3], 3)
         self.assertEqual(np.array([[[[1, 2]]], [[[5, 6]]]]).tolist(),
@@ -171,9 +173,45 @@ class MadlibKerasHelperTestCase(unittest.TestCase):
                          y_res.tolist())
 
     def test_get_data_as_np_array_invalid_input_shape(self):
-        self.plpy_mock_execute.return_value = [{'x': [[1,2]], 'y': 0},
-                                               {'x': [[5,6]], 'y': 1}]
+        self.plpy_mock_execute.return_value = [{'x': [[1,2]], 'y': [[1, 0, 
0]]},
+                                               {'x': [[5,6]], 'y': [[0, 1, 
0]]}]
         # we expect keras failure(ValueError) because we cannot reshape
         # the input which is of size 2 to input shape of 1,1,3
         with self.assertRaises(ValueError):
             self.subject.get_data_as_np_array('foo','y','x', [1,1,3], 3)
+
+    def test_validate_input_shapes_shapes_do_not_match(self):
+        self.plpy_mock_execute.return_value = [{'n_0': 32, 'n_1': 32}]
+        self.subject.FitInputValidator._validate_input_args = Mock()
+        input_validator_obj = self.subject.FitInputValidator('foo',
+                                                             'foo_valid',
+                                                             'model',
+                                                             
'model_arch_table',
+                                                             
'dependent_varname',
+                                                             
'independent_varname',
+                                                             1)
+        with self.assertRaises(plpy.PLPYException):
+            input_validator_obj.validate_input_shapes('dummy_tbl', [32,32,3])
+
+        self.plpy_mock_execute.return_value = [{'n_0': 3, 'n_1': 32, 'n_2': 
32}]
+        with self.assertRaises(plpy.PLPYException):
+            input_validator_obj.validate_input_shapes('dummy_tbl', [32,32,3])
+
+        self.plpy_mock_execute.return_value = [{'n_0': 3, 'n_1': None, 'n_2': 
None}]
+        with self.assertRaises(plpy.PLPYException):
+            input_validator_obj.validate_input_shapes('dummy_tbl', [3,32])
+
+    def test_validate_input_shapes_shapes_match(self):
+        self.plpy_mock_execute.return_value = [{'n_0': 32, 'n_1': 32, 'n_2': 
3}]
+        self.subject.FitInputValidator._validate_input_args = Mock()
+        input_validator_obj = self.subject.FitInputValidator('foo',
+                                                             'foo_valid',
+                                                             'model',
+                                                             
'model_arch_table',
+                                                             
'dependent_varname',
+                                                             
'independent_varname',
+                                                             1)
+        input_validator_obj.validate_input_shapes('dummy_tbl', [32,32,3])
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/ports/postgres/modules/utilities/minibatch_validation.py_in 
b/src/ports/postgres/modules/utilities/minibatch_validation.py_in
index 16b11a9..5c77c2c 100644
--- a/src/ports/postgres/modules/utilities/minibatch_validation.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_validation.py_in
@@ -18,8 +18,19 @@
 # under the License.
 
 import plpy
+from utilities import _assert
+from utilities import is_valid_psql_type
+from utilities import NUMERIC, ONLY_ARRAY
+from validate_args import get_expr_type
+
+def validate_dependent_var_for_minibatch(table_name, var_name, expr_type=None):
+    # The dependent variable is always a double precision array in
+    # preprocessed data (so check for numeric types)
+    if not expr_type:
+        expr_type = get_expr_type(var_name, table_name)
+    _assert(is_valid_psql_type(expr_type, NUMERIC | ONLY_ARRAY),
+            "Dependent variable column should be a numeric array.")
 
-def is_var_one_hot_encoded_for_minibatch(table_name, var_name):
     query = """SELECT array_upper({var_name}, 2) > 1 AS is_encoded FROM
               {table_name} LIMIT 1;""".format(**locals())
     result = plpy.execute(query)

Reply via email to