This is an automated email from the ASF dual-hosted git repository.

nkak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git


The following commit(s) were added to refs/heads/master by this push:
     new 15cbe27  DL: Standardize the data types used for keras and sql
15cbe27 is described below

commit 15cbe27e0030fc6f46cc0dc3ce0d397c3a2402ff
Author: Jingyi Mei <[email protected]>
AuthorDate: Fri Jun 7 12:13:55 2019 -0700

    DL: Standardize the data types used for keras and sql
    
    JIRA: MADLIB-1358
    
    Under deep_learning module (this includes preprocessor, keras model arch
    table, fit, evaluate and predict), we made the following changes:
    
    1. independent_var and normalizing_const to be float32 (REAL).
    2. dependent_var to be a 16-bit integer (SMALLINT).
    
    We were previously converting independent_var to float64, which is not
    necessary at all. Instead, we might be able to save resources by just
    using float32 for the independent_var and int16 for the one-hot encoded
    dependent_var.
    
    Co-authored-by: Nikhil Kak <[email protected]>
---
 .../deep_learning/input_data_preprocessor.py_in    | 35 ++++++-------
 .../deep_learning/input_data_preprocessor.sql_in   |  6 +--
 .../modules/deep_learning/madlib_keras.py_in       | 26 ++++------
 .../deep_learning/madlib_keras_helper.py_in        | 59 +++++++++++++---------
 .../deep_learning/madlib_keras_predict.py_in       |  2 +-
 .../deep_learning/madlib_keras_serializer.py_in    |  2 +-
 .../test/input_data_preprocessor.sql_in            | 16 ++++--
 .../modules/deep_learning/test/madlib_keras.sql_in |  7 +--
 .../unit_tests/test_input_data_preprocessor.py_in  |  9 ++--
 9 files changed, 88 insertions(+), 74 deletions(-)

diff --git 
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in 
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
index 71e5feb..5cdd638 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in
@@ -44,14 +44,7 @@ from utilities.utilities import validate_module_input_params
 from utilities.validate_args import input_tbl_valid
 from utilities.validate_args import get_expr_type
 
-from madlib_keras_helper import CLASS_VALUES_COLNAME
-from madlib_keras_helper import DEPENDENT_VARNAME_COLNAME
-from madlib_keras_helper import DEPENDENT_VARTYPE_COLNAME
-from madlib_keras_helper import INDEPENDENT_VARNAME_COLNAME
-from madlib_keras_helper import MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL
-from madlib_keras_helper import MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL
-from madlib_keras_helper import NORMALIZING_CONST_COLNAME
-from madlib_keras_helper import strip_trailing_nulls_from_class_values
+from madlib_keras_helper import *
 
 NUM_CLASSES_COLNAME = "num_classes"
 
@@ -127,7 +120,7 @@ class InputDataPreprocessorDL(object):
         # Assuming the input NUMERIC[] is already one_hot_encoded,
         # so casting to INTEGER[]
         if is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY):
-            return "{0}::INTEGER[]".format(self.dependent_varname)
+            return "{0}::{1}[]".format(self.dependent_varname, 
SMALLINT_SQL_TYPE)
 
         # For DL use case, we want to allow NULL as a valid class value,
         # so the query must have 'IS NOT DISTINCT FROM' instead of '='
@@ -141,8 +134,10 @@ class InputDataPreprocessorDL(object):
         if self.num_classes:
             one_hot_encoded_expr.extend(['false'
                 for i in range(self.padding_size)])
-        return 'ARRAY[{0}]::INTEGER[]'.format(
-            ', '.join(one_hot_encoded_expr))
+        # In psql, we can't directly convert boolean to smallint, so we firstly
+        # convert it to integer and then cast to smallint
+        return 'ARRAY[{0}]::INTEGER[]::{1}[]'.format(
+            ', '.join(one_hot_encoded_expr), SMALLINT_SQL_TYPE)
 
     def input_preprocessor_dl(self, order_by_random=True):
         """
@@ -169,11 +164,12 @@ class InputDataPreprocessorDL(object):
         scalar_mult_sql = """
             CREATE TEMP TABLE {norm_tbl} AS
             SELECT {self.schema_madlib}.array_scalar_mult(
-                {self.independent_varname}::REAL[], 
(1/{self.normalizing_const})::REAL) AS x_norm,
+                {self.independent_varname}::{FLOAT32_SQL_TYPE}[],
+                (1/{self.normalizing_const})::{FLOAT32_SQL_TYPE}) AS x_norm,
                 {one_hot_dep_var_array_expr} AS y,
                 row_number() over() AS row_id
             FROM {self.source_table} {order_by_clause}
-            """.format(**locals())
+            """.format(FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE, **locals())
         plpy.execute(scalar_mult_sql)
         # Create the mini-batched output table
         if is_platform_pg():
@@ -185,7 +181,7 @@ class InputDataPreprocessorDL(object):
             SELECT * FROM
             (
                 SELECT {self.schema_madlib}.agg_array_concat(
-                            ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x},
+                            ARRAY[{norm_tbl}.x_norm::{FLOAT32_SQL_TYPE}[]]) AS 
{x},
                        {self.schema_madlib}.agg_array_concat(
                             ARRAY[{norm_tbl}.y]) AS {y},
                        ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS 
buffer_id
@@ -195,6 +191,7 @@ class InputDataPreprocessorDL(object):
             {distributed_by_clause}
             """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL,
                        y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL,
+                       FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE,
                        **locals())
         plpy.execute(sql)
         plpy.execute("DROP TABLE IF EXISTS {0}".format(norm_tbl))
@@ -226,7 +223,7 @@ class InputDataPreprocessorDL(object):
                 $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS 
{dependent_vartype_colname},
                 {class_level_str} AS {class_values_colname},
                 {self.buffer_size} AS buffer_size,
-                {self.normalizing_const} AS {normalizing_const_colname},
+                {self.normalizing_const}::{FLOAT32_SQL_TYPE} AS 
{normalizing_const_colname},
                 {self.num_classes} AS {num_classes_colname}
             """.format(self=self, class_level_str=class_level_str,
                        dependent_varname_colname=DEPENDENT_VARNAME_COLNAME,
@@ -234,7 +231,8 @@ class InputDataPreprocessorDL(object):
                        dependent_vartype_colname=DEPENDENT_VARTYPE_COLNAME,
                        class_values_colname=CLASS_VALUES_COLNAME,
                        normalizing_const_colname=NORMALIZING_CONST_COLNAME,
-                       num_classes_colname=NUM_CLASSES_COLNAME)
+                       num_classes_colname=NUM_CLASSES_COLNAME,
+                       FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE)
         plpy.execute(query)
 
     def _validate_args(self):
@@ -562,9 +560,8 @@ class InputDataPreprocessorDocumentation:
                                       column.
             buffer_size            -- INTEGER. Default computed automatically.
                                       Number of source input rows to pack into 
a buffer.
-            normalizing_const      -- DOUBLE PRECISON. Default 1.0. The
-                                      normalizing constant to use for
-                                      standardizing arrays in 
independent_varname.
+            normalizing_const      -- REAL. Default 1.0. The normalizing 
constant to
+                                      use for standardizing arrays in 
independent_varname.
             num_classes            -- INTEGER. Default NULL. Number of class 
labels
                                       to be considered for 1-hot encoding. If 
NULL,
                                       the 1-hot encoded array length will be 
equal to
diff --git 
a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in 
b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
index 01936a3..987b557 100644
--- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
+++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in
@@ -124,7 +124,7 @@ training_preprocessor_dl(source_table,
   </dd>
 
   <dt>normalizing_const (optional)</dt>
-  <dd>DOUBLE PRECISION, default: 1.0. The normalizing constant to divide
+  <dd>REAL, default: 1.0. The normalizing constant to divide
   each value in the 'independent_varname' array by.  For example,
   you would use 255 for this value if the image data is in the form 0-255.
   </dd>
@@ -768,7 +768,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.training_preprocessor_dl(
     dependent_varname           VARCHAR,
     independent_varname         VARCHAR,
     buffer_size                 INTEGER,
-    normalizing_const           DOUBLE PRECISION,
+    normalizing_const           REAL,
     num_classes                 INTEGER
 ) RETURNS VOID AS $$
     PythonFunctionBodyOnly(deep_learning, input_data_preprocessor)
@@ -786,7 +786,7 @@ CREATE OR REPLACE FUNCTION 
MADLIB_SCHEMA.training_preprocessor_dl(
     dependent_varname       VARCHAR,
     independent_varname     VARCHAR,
     buffer_size             INTEGER,
-    normalizing_const       DOUBLE PRECISION
+    normalizing_const       REAL
 ) RETURNS VOID AS $$
   SELECT MADLIB_SCHEMA.training_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL);
 $$ LANGUAGE sql VOLATILE
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
index 9d00619..04f00af 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras.py_in
@@ -18,7 +18,6 @@
 # under the License.
 
 import datetime
-import numpy as np
 import os
 import plpy
 import sys
@@ -122,8 +121,8 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
     fit_params_to_pass = "$madlib$" + fit_params + "$madlib$"
     run_training_iteration = plpy.prepare("""
         SELECT {schema_madlib}.fit_step(
-            {mb_dep_var_col}::SMALLINT[],
-            {mb_indep_var_col}::REAL[],
+            {mb_dep_var_col},
+            {mb_indep_var_col},
             $MAD${model_arch}$MAD$::TEXT,
             {compile_params_to_pass}::TEXT,
             {fit_params_to_pass}::TEXT,
@@ -242,7 +241,7 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
             {num_classes}::INTEGER AS num_classes,
             $6 AS {class_values_colname},
             $MAD${dep_vartype}$MAD$::TEXT AS {dependent_vartype_colname},
-            {norm_const}::DOUBLE PRECISION AS {normalizing_const_colname},
+            {norm_const}::{FLOAT32_SQL_TYPE} AS {normalizing_const_colname},
             {metrics_type}::TEXT[] AS metrics_type,
             {training_metrics_final}::DOUBLE PRECISION AS 
training_metrics_final,
             {training_loss_final}::DOUBLE PRECISION AS training_loss_final,
@@ -257,6 +256,7 @@ def fit(schema_madlib, source_table, model, 
model_arch_table,
                    class_values_colname=CLASS_VALUES_COLNAME,
                    dependent_vartype_colname=DEPENDENT_VARTYPE_COLNAME,
                    normalizing_const_colname=NORMALIZING_CONST_COLNAME,
+                   FLOAT32_SQL_TYPE = FLOAT32_SQL_TYPE,
                    **locals()),
                    ["TEXT", "TEXT", "TEXT", "TEXT", "DOUBLE PRECISION[]", 
class_values_type])
     plpy.execute(create_output_summary_table,
@@ -415,8 +415,8 @@ def fit_transition(state, dependent_var, independent_var, 
model_architecture,
         agg_image_count = 
madlib_keras_serializer.get_image_count_from_state(state)
 
     # Prepare the data
-    x_train = np.array(independent_var, dtype='float64')
-    y_train = np.array(dependent_var)
+    x_train = np_array_float32(independent_var)
+    y_train = np_array_int16(dependent_var)
 
     # Fit segment model on data
     start_fit = time.time()
@@ -557,13 +557,9 @@ def get_loss_metric_from_keras_eval(schema_madlib, table, 
compile_params,
     and accuracy of each tuple which then gets averaged to get the final 
result.
     """
     evaluate_query = plpy.prepare("""
-    -- TODO:  really, we should not be casting integers and big integers to 
smallint's
-    --  The right solution is either to change the datatype of the agg 
function from
-    --  SMALLINT to INTEGER, or change the output of minibatch util to produce 
SMALLINT
-    --  For the first, we should change fit_step also
-    select ({schema_madlib}.internal_keras_evaluate(
-                                            {mb_dep_var_col}::SMALLINT[],
-                                            {mb_indep_var_col}::REAL[],
+        select ({schema_madlib}.internal_keras_evaluate(
+                                            {mb_dep_var_col},
+                                            {mb_indep_var_col},
                                             $MAD${model_arch}$MAD$,
                                             $1,
                                             {compile_params},
@@ -602,8 +598,8 @@ def internal_keras_eval_transition(state, dependent_var, 
independent_var,
         # Same model every time, no need to re-compile or update weights
         model = SD['segment_model']
 
-    x_val = np.array(independent_var)
-    y_val = np.array(dependent_var)
+    x_val = np_array_float32(independent_var)
+    y_val = np_array_int16(dependent_var)
 
     with K.tf.device(device_name):
         res = model.evaluate(x_val, y_val)
diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
index 948f2ad..17bdda4 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in
@@ -21,14 +21,44 @@ import numpy as np
 from utilities.utilities import is_platform_pg
 import plpy
 
+
+############### Constants used in other deep learning files #########
+# Name of columns in model summary table.
+CLASS_VALUES_COLNAME = "class_values"
+NORMALIZING_CONST_COLNAME = "normalizing_const"
+COMPILE_PARAMS_COLNAME = "compile_params"
+DEPENDENT_VARNAME_COLNAME = "dependent_varname"
+DEPENDENT_VARTYPE_COLNAME = "dependent_vartype"
+INDEPENDENT_VARNAME_COLNAME = "independent_varname"
+MODEL_ARCH_TABLE_COLNAME = "model_arch_table"
+MODEL_ARCH_ID_COLNAME = "model_arch_id"
+MODEL_DATA_COLNAME = "model_data"
+METRIC_TYPE_COLNAME = "metrics_type"
+
+# Name of independent and dependent colnames in batched table.
+# These are readonly variables, do not modify.
+# MADLIB-1300 Adding these variables for DL only at this time.
+MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL = "dependent_var"
+MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL = "independent_var"
+
+## sql variable types
+FLOAT32_SQL_TYPE = 'REAL'
+SMALLINT_SQL_TYPE = 'SMALLINT'
+
+#####################################################################
+
 # Prepend a dimension to np arrays using expand_dims.
-def expand_input_dims(input_data, target_type=None):
-    input_data = np.array(input_data)
+def expand_input_dims(input_data):
+    input_data = np_array_float32(input_data)
     input_data = np.expand_dims(input_data, axis=0)
-    if target_type:
-        input_data = input_data.astype(target_type)
     return input_data
 
+def np_array_float32(var):
+    return np.array(var, dtype=np.float32)
+
+def np_array_int16(var):
+    return np.array(var, dtype=np.int16)
+
 def strip_trailing_nulls_from_class_values(class_values):
     """
         class_values is a list of unique class levels in training data. This
@@ -68,7 +98,7 @@ def get_image_count_per_seg_from_array(current_seg_id, 
seg_ids, images_per_seg):
     """
     Get the image count from the array containing all the images
     per segment. Based on the platform, we find the index of the current 
segment.
-    This function is only called from inside the transition function. 
+    This function is only called from inside the transition function.
     """
     if is_platform_pg():
         total_images = images_per_seg[0]
@@ -79,7 +109,7 @@ def get_image_count_per_seg_from_array(current_seg_id, 
seg_ids, images_per_seg):
 def get_image_count_per_seg_for_minibatched_data_from_db(table_name):
     """
     Query the given minibatch formatted table and return the total rows per 
segment.
-    Since we cannot pass a dictionary to the keras fit step function we create 
+    Since we cannot pass a dictionary to the keras fit step function we create
     arrays out of the segment numbers and the rows per segment values.
     This function assumes that the table is not empty.
     :param table_name:
@@ -143,20 +173,3 @@ def 
get_image_count_per_seg_for_non_minibatched_data_from_db(table_name):
     return gp_segment_id_col, seg_ids, images_per_seg
 
 
-# Name of columns in model summary table.
-CLASS_VALUES_COLNAME = "class_values"
-NORMALIZING_CONST_COLNAME = "normalizing_const"
-COMPILE_PARAMS_COLNAME = "compile_params"
-DEPENDENT_VARNAME_COLNAME = "dependent_varname"
-DEPENDENT_VARTYPE_COLNAME = "dependent_vartype"
-INDEPENDENT_VARNAME_COLNAME = "independent_varname"
-MODEL_ARCH_TABLE_COLNAME = "model_arch_table"
-MODEL_ARCH_ID_COLNAME = "model_arch_id"
-MODEL_DATA_COLNAME = "model_data"
-METRIC_TYPE_COLNAME = "metrics_type"
-
-# Name of independent and dependent colnames in batched table.
-# These are readonly variables, do not modify.
-# MADLIB-1300 Adding these variables for DL only at this time.
-MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL = "dependent_var"
-MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL = "independent_var"
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
index b5f35dc..ca7a9ad 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in
@@ -128,7 +128,7 @@ def internal_keras_predict(independent_var, 
model_architecture, model_data,
         # Since the test data isn't mini-batched,
         # we have to make sure that the test data np array has the same
         # number of dimensions as input_shape. So we add a dimension to x.
-        independent_var = expand_input_dims(independent_var, 
target_type='float32')
+        independent_var = expand_input_dims(independent_var)
         independent_var /= normalizing_const
 
         if is_response:
diff --git 
a/src/ports/postgres/modules/deep_learning/madlib_keras_serializer.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_serializer.py_in
index ba6672e..b2ccce4 100644
--- a/src/ports/postgres/modules/deep_learning/madlib_keras_serializer.py_in
+++ b/src/ports/postgres/modules/deep_learning/madlib_keras_serializer.py_in
@@ -43,7 +43,7 @@ that will be passed on to model.set_weights()
 the model weights into a bytestring that will be passed on to the fit merge 
function.
 5. In fit merge, deserialize the state as image and 1d np arrays. Do some 
averaging
 operations and serialize them again into a state which contains the image
-and the 1d state. same for fit final 
+and the 1d state. same for fit final
 6. Return the final state from fit final to fit which will then be deserialized
 as 1d weights to be passed on to the evaluate function
 """
diff --git 
a/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in 
b/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in
index ff0452e..8626ecd 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in
@@ -138,10 +138,15 @@ SELECT assert
         class_values        = '{-6,-3,-1,0,2,3,4,5,6,7,8,9,10,12,NULL,NULL}' 
AND
         buffer_size         = 4 AND  -- we sort the class values in python
         normalizing_const   = 5 AND
+        pg_typeof(normalizing_const) = 'real'::regtype AND
         num_classes         = 16,
         'Summary Validation failed. Actual:' || __to_char(summary)
         ) from (select * from data_preprocessor_input_batch_summary) summary;
 
+--- Test output data type
+SELECT assert(pg_typeof(independent_var) = 'real[]'::regtype, 'Wrong 
independent_var type') FROM data_preprocessor_input_batch WHERE buffer_id = 0;
+SELECT assert(pg_typeof(dependent_var) = 'smallint[]'::regtype, 'Wrong 
dependent_var type') FROM data_preprocessor_input_batch WHERE buffer_id = 0;
+
 -- Test for validation data where the input table has only a subset of
 -- the classes compared to the original training data
 -- (data_preprocessor_input_batch). The one hot encoding must be based
@@ -177,6 +182,7 @@ SELECT assert
         class_values        = '{-6,-3,-1,0,2,3,4,5,6,7,8,9,10,12,NULL,NULL}' 
AND
         buffer_size         = 1 AND  -- we sort the class values in python
         normalizing_const   = 5 AND
+        pg_typeof(normalizing_const) = 'real'::regtype AND
         num_classes         = 16,
         'Summary Validation failed. Actual:' || __to_char(summary)
         ) from (select * from validation_out_summary) summary;
@@ -191,7 +197,7 @@ SELECT training_preprocessor_dl(
   'x',
   4,
   5);
-SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode 
doesn''t convert into integer array format') FROM data_preprocessor_input_batch 
WHERE buffer_id = 0;
+SELECT assert(pg_typeof(dependent_var) = 'smallint[]'::regtype, 'One-hot 
encode doesn''t convert into integer array format') FROM 
data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(array_upper(dependent_var, 2) = 2, 'Incorrect one-hot encode 
dimension') FROM
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT 
buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) 
a WHERE buffer_id = 0;
@@ -229,7 +235,7 @@ SELECT training_preprocessor_dl(
   'x',
   4,
   5);
-SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode 
doesn''t convert into integer array format') FROM data_preprocessor_input_batch 
WHERE buffer_id = 0;
+SELECT assert(pg_typeof(dependent_var) = 'smallint[]'::regtype, 'One-hot 
encode doesn''t convert into integer array format') FROM 
data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(array_upper(dependent_var, 2) = 3, 'Incorrect one-hot encode 
dimension') FROM
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT 
buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) 
a WHERE buffer_id = 0;
@@ -260,7 +266,7 @@ SELECT training_preprocessor_dl(
   'x',
   4,
   5);
-SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode 
doesn''t convert into integer array format') FROM data_preprocessor_input_batch 
WHERE buffer_id = 0;
+SELECT assert(pg_typeof(dependent_var) = 'smallint[]'::regtype, 'One-hot 
encode doesn''t convert into integer array format') FROM 
data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(array_upper(dependent_var, 2) = 3, 'Incorrect one-hot encode 
dimension') FROM
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT 
buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) 
a WHERE buffer_id = 0;
@@ -279,7 +285,7 @@ SELECT training_preprocessor_dl(
   'x',
   4,
   5);
-SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode 
doesn''t convert into integer array format') FROM data_preprocessor_input_batch 
WHERE buffer_id = 0;
+SELECT assert(pg_typeof(dependent_var) = 'smallint[]'::regtype, 'One-hot 
encode doesn''t convert into integer array format') FROM 
data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(array_upper(dependent_var, 2) = 2, 'Incorrect one-hot encode 
dimension') FROM
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(relative_error(SUM(y), SUM(y4)) < 0.000001, 'Incorrect one-hot 
encode value') FROM (SELECT UNNEST(dependent_var) AS y FROM 
data_preprocessor_input_batch) a, (SELECT UNNEST(y4) as y4 FROM 
data_preprocessor_input) b;
@@ -309,7 +315,7 @@ SELECT training_preprocessor_dl(
   'x',
   4,
   5);
-SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode 
doesn''t convert into integer array format') FROM data_preprocessor_input_batch 
WHERE buffer_id = 0;
+SELECT assert(pg_typeof(dependent_var) = 'smallint[]'::regtype, 'One-hot 
encode doesn''t convert into integer array format') FROM 
data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(array_upper(dependent_var, 2) = 2, 'Incorrect one-hot encode 
dimension') FROM
   data_preprocessor_input_batch WHERE buffer_id = 0;
 SELECT assert(relative_error(SUM(y), SUM(y5)) < 0.000001, 'Incorrect one-hot 
encode value') FROM (SELECT UNNEST(dependent_var) AS y FROM 
data_preprocessor_input_batch) a, (SELECT UNNEST(y5) as y5 FROM 
data_preprocessor_input) b;
diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in 
b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
index abd2b54..dacf236 100644
--- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
+++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in
@@ -39,8 +39,8 @@ copy cifar_10_sample from stdin delimiter '|';
 DROP TABLE IF EXISTS cifar_10_sample_batched;
 CREATE TABLE cifar_10_sample_batched(
     buffer_id smallint,
-    dependent_var integer[],
-    dependent_var_text_with_null integer[],
+    dependent_var smallint[],
+    dependent_var_text_with_null smallint[],
     independent_var real[]);
 copy cifar_10_sample_batched from stdin delimiter '|';
 
0|{{0,1}}|{{0,0,1,0,0}}|{{{{0.494118,0.462745,0.431373},{0.478431,0.45098,0.423529},{0.494118,0.466667,0.435294},{0.498039,0.466667,0.427451},{0.509804,0.478431,0.435294},{0.509804,0.478431,0.435294},{0.517647,0.486275,0.443137},{0.521569,0.490196,0.447059},{0.509804,0.478431,0.435294},{0.517647,0.486275,0.443137},{0.52549,0.494118,0.45098},{0.513726,0.482353,0.439216},{0.513726,0.482353,0.439216},{0.52549,0.494118,0.45098},{0.521569,0.490196,0.447059},{0.533333,0.501961,0.458824},{0.537
 [...]
@@ -68,7 +68,7 @@ INSERT INTO cifar_10_sample_batched_summary values (
     255.0);
 
 drop table if exists cifar_10_sample_val;
-create table cifar_10_sample_val(independent_var REAL[], dependent_var 
INTEGER[], buffer_id SMALLINT);
+create table cifar_10_sample_val(independent_var REAL[], dependent_var 
SMALLINT[], buffer_id SMALLINT);
 copy cifar_10_sample_val from stdin delimiter '|';
 
{{{{0.494118,0.462745,0.431373},{0.478431,0.45098,0.423529},{0.494118,0.466667,0.435294},{0.498039,0.466667,0.427451},{0.509804,0.478431,0.435294},{0.509804,0.478431,0.435294},{0.517647,0.486275,0.443137},{0.521569,0.490196,0.447059},{0.509804,0.478431,0.435294},{0.517647,0.486275,0.443137},{0.52549,0.494118,0.45098},{0.513726,0.482353,0.439216},{0.513726,0.482353,0.439216},{0.52549,0.494118,0.45098},{0.521569,0.490196,0.447059},{0.533333,0.501961,0.458824},{0.537255,0.505882,0.462745},{
 [...]
 
{{{{0.792157,0.8,0.780392},{0.792157,0.8,0.780392},{0.8,0.807843,0.788235},{0.807843,0.815686,0.796079},{0.815686,0.823529,0.803922},{0.819608,0.827451,0.807843},{0.823529,0.831373,0.811765},{0.831373,0.839216,0.823529},{0.835294,0.843137,0.831373},{0.843137,0.85098,0.839216},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.843137},{0.843137,0.85098,0.839216},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.843137},{0.847059,0.854902,0.839216},{0.85098,0.858824,0.839216},{0.85098,0.858
 [...]
@@ -135,6 +135,7 @@ SELECT assert(
         dependent_vartype = 'smallint' AND
         independent_varname = 'x' AND
         normalizing_const = 255.0 AND
+        pg_typeof(normalizing_const) = 'real'::regtype AND
         name is NULL AND
         description is NULL AND
         model_size > 0 AND
diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in
 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in
index 4eb835c..8358697 100644
--- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in
+++ 
b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in
@@ -204,7 +204,7 @@ class InputPreProcessorDLTestCase(unittest.TestCase):
                                                 self.default_module_name)
 
     def test_get_one_hot_encoded_dep_var_expr_null_val(self):
-        self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text'])
+        self.module.get_expr_type = Mock(side_effect = ['smallint[]', 'text'])
         self.module.get_distinct_col_levels = Mock(return_value = ["NULL", 
"'a'"])
         obj = self.module.InputDataPreprocessorDL(
             self.default_schema_madlib,
@@ -218,12 +218,13 @@ class InputPreProcessorDLTestCase(unittest.TestCase):
             self.default_module_name)
         obj.dependent_levels = ["NULL", "'a'"]
         dep_var_array_expr = obj.get_one_hot_encoded_dep_var_expr()
-        self.assertEqual("array[({0}) is not distinct from null, ({0}) is not 
distinct from 'a']::integer[]".
+        self.assertEqual("array[({0}) is not distinct from null, " \
+            "({0}) is not distinct from 'a']::integer[]::smallint[]".
                      format(self.default_dep_var),
                      dep_var_array_expr.lower())
 
     def test_get_one_hot_encoded_dep_var_expr_numeric_array_val(self):
-        self.module.get_expr_type = Mock(side_effect = ['integer[]', 
'integer[]'])
+        self.module.get_expr_type = Mock(side_effect = ['smallint[]', 
'integer[]'])
         obj = self.module.InputDataPreprocessorDL(
             self.default_schema_madlib,
             self.default_source_table,
@@ -235,7 +236,7 @@ class InputPreProcessorDLTestCase(unittest.TestCase):
             self.default_num_classes,
             self.default_module_name)
         dep_var_array_expr = obj.get_one_hot_encoded_dep_var_expr()
-        self.assertEqual("{0}::integer[]".
+        self.assertEqual("{0}::smallint[]".
                      format(self.default_dep_var),
                      dep_var_array_expr.lower())
 

Reply via email to