This is an automated email from the ASF dual-hosted git repository.

njayaram pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git

commit f43765944c92e82eb2bfcf1449b68da75df4c582
Author: Nandish Jayaram <[email protected]>
AuthorDate: Tue Apr 16 16:53:03 2019 -0700

    DL: Remove reshaping and hard-coded normalizing_const from predict
    
    This commit makes the following changes:
    1. Predict was still reshaping the input, but that's not necessary anymore
    since we assume the input data is already shaped correctly.
    2. Remove hard-coded normalizing constant for test data in predict,
    and instead use the normalizing_const from model summary table which is
    essentially the value that was used to normalize training data.
    3. Add dev-check and unit tests for the changes.
    
    Co-authored-by: Ekta Khanna <[email protected]>
---
 .../modules/deep_learning/madlib_keras.py_in       | 32 +++++-----
 .../modules/deep_learning/madlib_keras.sql_in      |  6 +-
 .../deep_learning/madlib_keras_helper.py_in        | 44 ++++++++------
 .../deep_learning/madlib_keras_predict.py_in       | 21 ++++---
 .../deep_learning/madlib_keras_validator.py_in     |  6 +-
 .../modules/deep_learning/test/madlib_keras.sql_in | 69 ++++++++++++++++++++++
 .../test/unit_tests/test_madlib_keras.py_in        | 24 ++++++++
 7 files changed, 154 insertions(+), 48 deletions(-)

diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index afe2187..bbbcfb4 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -36,9 +36,10 @@ from keras.models import *
 from keras.optimizers import *
 from keras.regularizers import *
 import madlib_keras_serializer
-from madlib_keras_helper import CLASS_VALUES_CNAME
-from madlib_keras_helper import DEPENDENT_VARTYPE_CNAME
-from madlib_keras_helper import NORMALIZING_CONST_CNAME
+from madlib_keras_helper import CLASS_VALUES_COLNAME
+from madlib_keras_helper import DEPENDENT_VARTYPE_COLNAME
+from madlib_keras_helper import expand_input_dims
+from madlib_keras_helper import NORMALIZING_CONST_COLNAME
 from madlib_keras_validator import FitInputValidator
 from madlib_keras_wrapper import *
 from keras_model_arch_table import Format
@@ -196,11 +197,11 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
         final_validation_loss = validation_aggregate_loss[-1]
     version = madlib_version(schema_madlib)
     class_values, class_values_type = get_col_value_and_type(
-        fit_validator.source_summary_table, CLASS_VALUES_CNAME)
+        fit_validator.source_summary_table, CLASS_VALUES_COLNAME)
     norm_const, norm_const_type = get_col_value_and_type(
-        fit_validator.source_summary_table, NORMALIZING_CONST_CNAME)
+        fit_validator.source_summary_table, NORMALIZING_CONST_COLNAME)
     dep_vartype = plpy.execute("SELECT {0} AS dep FROM {1}".format(
-        DEPENDENT_VARTYPE_CNAME, fit_validator.source_summary_table))[0]['dep']
+        DEPENDENT_VARTYPE_COLNAME, 
fit_validator.source_summary_table))[0]['dep']
     create_output_summary_table = plpy.prepare("""
         CREATE TABLE {0}_summary AS
         SELECT
@@ -234,8 +235,8 @@ def fit(schema_madlib, source_table, model, 
dependent_varname,
         $28 AS {1},
         $29 AS {2},
         $30 AS {3}
-        """.format(model, CLASS_VALUES_CNAME, DEPENDENT_VARTYPE_CNAME,
-                   NORMALIZING_CONST_CNAME),
+        """.format(model, CLASS_VALUES_COLNAME, DEPENDENT_VARTYPE_COLNAME,
+                   NORMALIZING_CONST_COLNAME),
                    ["TEXT", "INTEGER", "TEXT", "TIMESTAMP",
                     "TIMESTAMP", "TEXT", "TEXT","TEXT",
                     "TEXT", "TEXT", "TEXT", "TEXT", "INTEGER",
@@ -479,15 +480,12 @@ def internal_keras_evaluate(dependent_var, 
independent_var, model_architecture,
     with K.tf.device(device_name):
         compile_model(model, compile_params)
 
-    # Since the training data is batched but the validation data isn't, we have
-    # to make sure that the validation data np array has the same no of 
dimensions
-    # as training data. So we prepend 1 to both x and y np arrays using 
expand_dims.
-    independent_var = np.array(independent_var)
-    independent_var = np.expand_dims(independent_var, axis=0)
-    independent_var = independent_var.astype('float32')
-
-    dependent_var = np.array(dependent_var)
-    dependent_var = np.expand_dims(dependent_var, axis=0)
+    # Since the training data is batched but the validation data isn't,
+    # we have to make sure that the validation data np array has the same
+    # number of dimensions as training data. So we prepend a dimension to
+    # both x and y np arrays using expand_dims.
+    independent_var = expand_input_dims(independent_var, target_type='float32')
+    dependent_var = expand_input_dims(dependent_var)
 
     with K.tf.device(device_name):
         res = model.evaluate(independent_var, dependent_var)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
index 8ba24c7..34bf2c2 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.sql_in
@@ -204,7 +204,8 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.internal_keras_predict(
    model_data bytea,
    input_shape integer[],
    compile_params TEXT,
-   is_response BOOLEAN
+   is_response BOOLEAN,
+   normalizing_const DOUBLE PRECISION
 ) RETURNS DOUBLE PRECISION[] AS $$
     PythonFunctionBodyOnly(`deep_learning', `madlib_keras_predict')
     with AOControl(False):
@@ -214,7 +215,8 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.internal_keras_predict(
                model_data,
                input_shape,
                compile_params,
-               is_response)
+               is_response,
+               normalizing_const)
 $$ LANGUAGE plpythonu VOLATILE
 m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
 
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
index bc5e703..d56a0e3 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
@@ -17,21 +17,29 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import numpy as np
 import plpy
 from keras_model_arch_table import Format
 from utilities.utilities import add_postfix
 from utilities.validate_args import input_tbl_valid
 
+# Prepend 1 to np arrays using expand_dims.
+def expand_input_dims(input_data, target_type=None):
+    input_data = np.array(input_data)
+    input_data = np.expand_dims(input_data, axis=0)
+    if target_type:
+        input_data = input_data.astype(target_type)
+    return input_data
+
 # Name of columns in model summary table.
-CLASS_VALUES_CNAME = "class_values"
-NORMALIZING_CONST_CNAME = "normalizing_const"
-DEPENDENT_VARTYPE_CNAME = "dependent_vartype"
-COMPILE_PARAMS_CNAME = "compile_params"
-DEPENDENT_VARNAME_CNAME = "dependent_varname"
-DEPENDENT_VARTYPE_CNAME = "dependent_vartype"
-MODEL_ARCH_TABLE_CNAME = "model_arch_table"
-MODEL_ARCH_ID_CNAME = "model_arch_id"
-MODEL_DATA_CNAME = "model_data"
+CLASS_VALUES_COLNAME = "class_values"
+NORMALIZING_CONST_COLNAME = "normalizing_const"
+COMPILE_PARAMS_COLNAME = "compile_params"
+DEPENDENT_VARNAME_COLNAME = "dependent_varname"
+DEPENDENT_VARTYPE_COLNAME = "dependent_vartype"
+MODEL_ARCH_TABLE_COLNAME = "model_arch_table"
+MODEL_ARCH_ID_COLNAME = "model_arch_id"
+MODEL_DATA_COLNAME = "model_data"
 
 class PredictParamsProcessor:
     def __init__(self, model_table, module_name):
@@ -47,8 +55,8 @@ class PredictParamsProcessor:
             self.model_summary_table))[0]
 
     def _get_model_arch_dict(self):
-        model_arch_table = self.model_summary_dict[MODEL_ARCH_TABLE_CNAME]
-        model_arch_id = self.model_summary_dict[MODEL_ARCH_ID_CNAME]
+        model_arch_table = self.model_summary_dict[MODEL_ARCH_TABLE_COLNAME]
+        model_arch_id = self.model_summary_dict[MODEL_ARCH_ID_COLNAME]
         input_tbl_valid(model_arch_table, self.module_name)
         model_arch_query = """
             SELECT {0}
@@ -63,16 +71,16 @@ class PredictParamsProcessor:
         return query_result[0]
 
     def get_class_values(self):
-        return self.model_summary_dict[CLASS_VALUES_CNAME]
+        return self.model_summary_dict[CLASS_VALUES_COLNAME]
 
     def get_compile_params(self):
-        return self.model_summary_dict[COMPILE_PARAMS_CNAME]
+        return self.model_summary_dict[COMPILE_PARAMS_COLNAME]
 
     def get_dependent_varname(self):
-        return self.model_summary_dict[DEPENDENT_VARNAME_CNAME]
+        return self.model_summary_dict[DEPENDENT_VARNAME_COLNAME]
 
     def get_dependent_vartype(self):
-        return self.model_summary_dict[DEPENDENT_VARTYPE_CNAME]
+        return self.model_summary_dict[DEPENDENT_VARTYPE_COLNAME]
 
     def get_model_arch(self):
         return self.model_arch_dict[Format.MODEL_ARCH]
@@ -80,8 +88,8 @@ class PredictParamsProcessor:
     def get_model_data(self):
         return plpy.execute("""
                 SELECT {0} FROM {1}
-            """.format(MODEL_DATA_CNAME, self.model_table)
-                            )[0][MODEL_DATA_CNAME]
+            """.format(MODEL_DATA_COLNAME, self.model_table)
+                            )[0][MODEL_DATA_COLNAME]
 
     def get_normalizing_const(self):
-        return self.model_summary_dict[NORMALIZING_CONST_CNAME]
+        return self.model_summary_dict[NORMALIZING_CONST_COLNAME]
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
index 34b26c3..3108be5 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
@@ -27,8 +27,9 @@ from keras.models import *
 from keras.optimizers import *
 import numpy as np
 
+from madlib_keras_helper import expand_input_dims
 from madlib_keras_helper import PredictParamsProcessor
-from madlib_keras_helper import MODEL_DATA_CNAME
+from madlib_keras_helper import MODEL_DATA_COLNAME
 from madlib_keras_wrapper import compile_and_set_weights
 from utilities.model_arch_info import get_input_shape
 from utilities.utilities import add_postfix
@@ -64,7 +65,8 @@ def predict(schema_madlib, model_table, test_table, id_col,
     dependent_vartype = param_proc.get_dependent_vartype()
     model_data = param_proc.get_model_data()
     model_arch = param_proc.get_model_arch()
-
+    normalizing_const = param_proc.get_normalizing_const()
+    # TODO: Validate input shape as part of MADLIB-1312
     input_shape = get_input_shape(model_arch)
     compile_params = "$madlib$" + compile_params + "$madlib$"
 
@@ -102,23 +104,26 @@ def predict(schema_madlib, model_table, test_table, 
id_col,
                         {0},
                         ARRAY{input_shape},
                         {compile_params},
-                        {is_response})
+                        {is_response},
+                        {normalizing_const})
                    ) AS {intermediate_col}
         FROM {test_table}, {model_table}
         ) q
-        """.format(MODEL_DATA_CNAME, **locals()))
+        """.format(MODEL_DATA_COLNAME, **locals()))
 
 def internal_keras_predict(x_test, model_arch, model_data, input_shape,
-                           compile_params, is_response):
+                           compile_params, is_response, normalizing_const):
     model = model_from_json(model_arch)
     device_name = '/cpu:0'
     os.environ["CUDA_VISIBLE_DEVICES"] = '-1'
     model_shapes = madlib_keras_serializer.get_model_shapes(model)
     compile_and_set_weights(model, compile_params, device_name,
                             model_data, model_shapes)
-
-    x_test = np.array(x_test).reshape(1, *input_shape)
-    x_test /= 255
+    # Since the test data isn't mini-batched,
+    # we have to make sure that the test data np array has the same
+    # number of dimensions as input_shape. So we add a dimension to x.
+    x_test = expand_input_dims(x_test, target_type='float32')
+    x_test /= normalizing_const
     if is_response:
         proba_argmax = model.predict_classes(x_test)
         # proba_argmax is a list with exactly one element in it. That element
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
index 481fb1b..606461d 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in
@@ -17,7 +17,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-from madlib_keras_helper import CLASS_VALUES_CNAME
+from madlib_keras_helper import CLASS_VALUES_COLNAME
 from utilities.minibatch_validation import validate_dependent_var_for_minibatch
 from utilities.utilities import _assert
 from utilities.utilities import add_postfix
@@ -72,11 +72,11 @@ class FitInputValidator:
         input_tbl_valid(self.source_table, self.module_name)
         input_tbl_valid(self.source_summary_table, self.module_name)
         _assert(is_var_valid(
-            self.source_summary_table, CLASS_VALUES_CNAME),
+            self.source_summary_table, CLASS_VALUES_COLNAME),
                 "model_keras error: invalid class_values varname "
                 "('{class_values}') for source_summary_table "
                 "({source_summary_table}).".format(
-                    class_values=CLASS_VALUES_CNAME,
+                    class_values=CLASS_VALUES_COLNAME,
                     source_summary_table=self.source_summary_table))
         # Source table and validation tables must have the same schema
         self._validate_input_table(self.source_table)
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index a259630..08ac9cb 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -494,3 +494,72 @@ SELECT assert(UPPER(atttypid::regtype::TEXT) =
     'SMALLINT', 'prediction column should be smallint type')
 FROM pg_attribute
 WHERE attrelid = 'cifar10_predict'::regclass AND attname = 
'estimated_dependent_var';
+
+-- Test case with a different input shape (3, 32, 32) instead of (32, 32, 3).
+-- Create a new table with image shape 3, 32, 32
+drop table if exists cifar_10_sample_test_shape;
+create table cifar_10_sample_test_shape(id INTEGER, y SMALLINT, x  REAL[] );
+copy cifar_10_sample_test_shape from stdin delimiter '|';
+1|0|{{{248,248,250,245,245,246,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245},{247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245},{245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247},{248,248,250,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247,245,245,247
 [...]
+\.
+
+DROP TABLE IF EXISTS cifar_10_sample_test_shape_batched;
+DROP TABLE IF EXISTS cifar_10_sample_test_shape_batched_summary;
+SELECT 
minibatch_preprocessor_dl('cifar_10_sample_test_shape','cifar_10_sample_test_shape_batched','y','x',
 NULL, 255, 3);
+
+-- Change model_arch to reflect channels_first
+DROP TABLE IF EXISTS model_arch;
+SELECT load_keras_model('model_arch',
+  $${
+  "class_name": "Sequential",
+  "keras_version": "2.1.6",
+  "config": [{
+    "class_name": "Conv2D", "config": {"kernel_initializer": {"class_name": 
"VarianceScaling", "config": {"distribution": "uniform", "scale": 1.0, "seed": 
null, "mode": "fan_avg"}},
+    "name": "conv2d_1",
+    "kernel_constraint": null, "bias_regularizer": null, "bias_constraint": 
null,
+    "dtype": "float32", "activation": "relu", "trainable": true,
+    "data_format": "channels_first", "filters": 32, "padding": "valid",
+    "strides": [1, 1], "dilation_rate": [1, 1], "kernel_regularizer": null,
+    "bias_initializer": {"class_name": "Zeros", "config": {}},
+    "batch_input_shape": [null, 3, 32, 32], "use_bias": true,
+    "activity_regularizer": null, "kernel_size": [3, 3]}},
+    {"class_name": "MaxPooling2D", "config": {"name": "max_pooling2d_1", 
"trainable": true, "data_format": "channels_first", "pool_size": [2, 2], 
"padding": "valid", "strides": [2, 2]}},
+    {"class_name": "Dropout", "config": {"rate": 0.25, "noise_shape": null, 
"trainable": true, "seed": null, "name": "dropout_1"}},
+    {"class_name": "Flatten", "config": {"trainable": true, "name": 
"flatten_1", "data_format": "channels_first"}},
+    {"class_name": "Dense", "config": {"kernel_initializer": {"class_name": 
"VarianceScaling", "config": {"distribution": "uniform", "scale": 1.0, "seed": 
null, "mode": "fan_avg"}}, "name": "dense_1", "kernel_constraint": null, 
"bias_regularizer": null, "bias_constraint": null, "activation": "softmax", 
"trainable": true, "kernel_regularizer": null, "bias_initializer":
+    {"class_name": "Zeros", "config": {}}, "units": 3, "use_bias": true, 
"activity_regularizer": null}
+    }], "backend": "tensorflow"}$$);
+
+DROP TABLE IF EXISTS keras_saved_out, keras_saved_out_summary;
+SELECT madlib_keras_fit(
+    'cifar_10_sample_test_shape_batched',
+    'keras_saved_out',
+    'dependent_var',
+    'independent_var',
+    'model_arch',
+    1,
+    $$ optimizer=SGD(lr=0.01, decay=1e-6, nesterov=True), 
loss='categorical_crossentropy', metrics=['accuracy']$$::text,
+    $$ batch_size=2, epochs=1, verbose=0 $$::text,
+    3,
+    FALSE);
+
+-- Predict with correctly shaped data, must go thru.
+DROP TABLE IF EXISTS cifar10_predict;
+SELECT madlib_keras_predict(
+    'keras_saved_out',
+    'cifar_10_sample_test_shape',
+    'id',
+    'x',
+    'cifar10_predict',
+    'prob');
+
+-- Prediction with incorrectly shaped data must error out.
+DROP TABLE IF EXISTS cifar10_predict;
+SELECT assert(trap_error($TRAP$madlib_keras_predict(
+        'keras_saved_out',
+        'cifar_10_sample',
+        'id',
+        'x',
+        'cifar10_predict',
+        'prob');$TRAP$) = 1,
+    'Input shape is (32, 32, 3) but model was trained with (3, 32, 32). Should 
have failed.');
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
index f2375ba..533e347 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in
@@ -402,6 +402,30 @@ class MadlibKerasPredictTestCase(unittest.TestCase):
         self.subject.validate_pred_type('response', range(1598))
         self.subject.validate_pred_type('response', None)
 
+class MadlibKerasHelperTestCase(unittest.TestCase):
+    def setUp(self):
+        self.plpy_mock = Mock(spec='error')
+        patches = {
+            'plpy': plpy
+        }
+
+        self.plpy_mock_execute = MagicMock()
+        plpy.execute = self.plpy_mock_execute
+
+        self.module_patcher = patch.dict('sys.modules', patches)
+        self.module_patcher.start()
+        import madlib_keras_helper
+        self.subject = madlib_keras_helper
+        self.input_data = [32, 32, 3]
+
+    def tearDown(self):
+        self.module_patcher.stop()
+
+    def test_expand_input_dims(self):
+        self.assertEqual(np.array(self.input_data).shape, (3,))
+        res = self.subject.expand_input_dims(self.input_data)
+        self.assertEqual(res.shape, (1, 3))
+
 if __name__ == '__main__':
     unittest.main()
 # ---------------------------------------------------------------------

Reply via email to