This is an automated email from the ASF dual-hosted git repository. njayaram pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 8eaf563a8e36817dcfb171d06f3c756c2f1d1215 Author: Nandish Jayaram <[email protected]> AuthorDate: Fri Apr 26 17:20:41 2019 -0700 DL: Add input preprocessor for validation data JIRA: MADLIB-1333 This commit makes the following changes: 1) Rename minibatch_preprocessor_dl to training_preprocessor_dl. 2) Code refactoring to move it from utilities module to deep_learning. 3) Add a new function named validation_preprocessor_dl that can be used to preprocess validation data. The independent var in validation data is minibatched and normalized while the dependent var is 1-hot encoded. We do not distribute the minibatched data randomly, since this data is only used for evaluation and not training. 4) Necessary changes to get existing tests to green. Co-authored-by: Ekta Khanna <[email protected]> --- doc/mainpage.dox.in | 2 +- .../deep_learning/input_data_preprocessor.py_in | 494 +++++++++++++++++++++ .../input_data_preprocessor.sql_in} | 100 +++-- .../deep_learning/madlib_keras_helper.py_in | 36 ++ .../deep_learning/madlib_keras_predict.py_in | 38 +- .../test/input_data_preprocessor.sql_in} | 176 ++++---- .../modules/deep_learning/test/madlib_keras.sql_in | 8 +- .../unit_tests/test_input_data_preprocessor.py_in | 294 ++++++++++++ .../test/unit_tests/test_madlib_keras.py_in | 46 +- .../utilities/minibatch_preprocessing.py_in | 304 ------------- .../unit_tests/test_minibatch_preprocessing.py_in | 248 +---------- 11 files changed, 1009 insertions(+), 737 deletions(-) diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in index e221319..6e0ac48 100644 --- a/doc/mainpage.dox.in +++ b/doc/mainpage.dox.in @@ -291,7 +291,7 @@ Interface and implementation are subject to change. @details A collection of modules for deep learning. 
@{ @defgroup grp_keras_model_arch Load Model Architecture - @defgroup grp_minibatch_preprocessing_dl Mini-Batch Preprocessor for Images + @defgroup grp_input_preprocessor_dl Input Preprocessor for Images @} @defgroup grp_bayes Naive Bayes Classification @defgroup grp_sample Random Sampling diff --git a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in new file mode 100644 index 0000000..e22ab59 --- /dev/null +++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in @@ -0,0 +1,494 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +""" +@file input_data_preprocessor.py_in + +""" +from math import ceil +import plpy + +from internal.db_utils import get_distinct_col_levels +from internal.db_utils import quote_literal +from utilities.minibatch_preprocessing import MiniBatchBufferSizeCalculator +from utilities.utilities import _assert +from utilities.utilities import add_postfix +from utilities.utilities import is_platform_pg +from utilities.utilities import is_psql_char_type +from utilities.utilities import is_valid_psql_type +from utilities.utilities import BOOLEAN, NUMERIC, ONLY_ARRAY, TEXT +from utilities.utilities import py_list_to_sql_string +from utilities.utilities import split_quoted_delimited_str +from utilities.utilities import strip_end_quotes +from utilities.utilities import unique_string +from utilities.utilities import validate_module_input_params + +from utilities.validate_args import _tbl_dimension_rownum +from utilities.validate_args import input_tbl_valid +from utilities.validate_args import get_expr_type + +from madlib_keras_helper import CLASS_VALUES_COLNAME +from madlib_keras_helper import DEPENDENT_VARNAME_COLNAME +from madlib_keras_helper import DEPENDENT_VARTYPE_COLNAME +from madlib_keras_helper import INDEPENDENT_VARNAME_COLNAME +from madlib_keras_helper import NORMALIZING_CONST_COLNAME +from madlib_keras_helper import strip_trailing_nulls_from_class_values + +# These are readonly variables, do not modify +# MADLIB-1300 Adding these variables for DL only at this time. 
+MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL = "dependent_var" +MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL = "independent_var" +NUM_CLASSES_COLNAME = "num_classes" + +class InputDataPreprocessorDL(object): + def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, + normalizing_const, num_classes, module_name): + self.schema_madlib = schema_madlib + self.source_table = source_table + self.output_table = output_table + self.dependent_varname = dependent_varname + self.independent_varname = independent_varname + self.buffer_size = buffer_size + self.normalizing_const = normalizing_const if normalizing_const is not None else 1.0 + self.num_classes = num_classes + self.module_name = module_name + self.output_summary_table = None + self.dependent_vartype = None + self.independent_vartype = None + if self.output_table: + self.output_summary_table = add_postfix(self.output_table, "_summary") + + ## Validating input args prior to using them in _set_validate_vartypes() + self._validate_args() + self._set_validate_vartypes() + self.num_of_buffers = self._get_num_buffers() + self.dependent_levels = None + # The number of padded zeros to include in 1-hot vector + self.padding_size = 0 + + def _set_one_hot_encoding_variables(self): + if self.dependent_levels: + # if any class level was NULL in sql, that would show up as + # None in self.dependent_levels. Replace all None with NULL + # in the list. + self.dependent_levels = ['NULL' if level is None else level + for level in self.dependent_levels] + self._validate_num_classes() + # Try computing padding_size after running all necessary validations. + if self.num_classes: + self.padding_size = self.num_classes - len(self.dependent_levels) + + def _validate_num_classes(self): + if self.num_classes is not None and \ + self.num_classes < len(self.dependent_levels): + plpy.error("{0}: Invalid num_classes value specified. 
It must "\ + "be equal to or greater than distinct class values found "\ + "in table ({1}).".format( + self.module_name, len(self.dependent_levels))) + + def get_one_hot_encoded_dep_var_expr(self): + """ + :param dependent_varname: Name of the dependent variable + :param num_classes: Number of class values to consider in 1-hot + :return: + This function returns a tuple of + 1. A string with transformed dependent varname depending on it's type + 2. All the distinct dependent class levels encoded as a string + + If dep_type == numeric[] , do not encode + 1. dependent_varname = rings + transformed_value = ARRAY[rings] + 2. dependent_varname = ARRAY[a, b, c] + transformed_value = ARRAY[a, b, c] + else if dep_type in ("text", "boolean"), encode: + 3. dependent_varname = rings (encoding) + transformed_value = ARRAY[rings=1, rings=2, rings=3] + """ + # Assuming the input NUMERIC[] is already one_hot_encoded, + # so casting to INTEGER[] + if is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY): + return "{0}::INTEGER[]".format(self.dependent_varname) + + # For DL use case, we want to allow NULL as a valid class value, + # so the query must have 'IS NOT DISTINCT FROM' instead of '=' + # like in the generic get_one_hot_encoded_expr() defined in + # db_utils.py_in. We also have this optional 'num_classes' param + # that affects the logic of 1-hot encoding. Since this is very + # specific to input_preprocessor_dl for now, let's keep + # it here instead of refactoring it out to a generic helper function. + one_hot_encoded_expr = ["({0}) IS NOT DISTINCT FROM {1}".format( + self.dependent_varname, c) for c in self.dependent_levels] + if self.num_classes: + one_hot_encoded_expr.extend(['false' + for i in range(self.padding_size)]) + return 'ARRAY[{0}]::INTEGER[]'.format( + ', '.join(one_hot_encoded_expr)) + + def input_preprocessor_dl(self, order_by_random=True): + self._set_one_hot_encoding_variables() + # Create a temp table that has independent var normalized. 
+ norm_tbl = unique_string(desp='normalized') + + # Always one-hot encode the dependent var. For now, we are assuming + # that input_preprocessor_dl will be used only for deep + # learning and mostly for classification. So make a strong + # assumption that it is only for classification, so one-hot + # encode the dep var, unless it's already a numeric array in + # which case we assume it's already one-hot encoded. + one_hot_dep_var_array_expr = \ + self.get_one_hot_encoded_dep_var_expr() + order_by_clause = " ORDER BY RANDOM() " if order_by_random else "" + scalar_mult_sql = """ + CREATE TEMP TABLE {norm_tbl} AS + SELECT {self.schema_madlib}.array_scalar_mult( + {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm, + {one_hot_dep_var_array_expr} AS y, + row_number() over() AS row_id + FROM {self.source_table} {order_by_clause} + """.format(**locals()) + plpy.execute(scalar_mult_sql) + # Create the mini-batched output table + if is_platform_pg(): + distributed_by_clause = '' + else: + distributed_by_clause= ' DISTRIBUTED BY (buffer_id) ' + sql = """ + CREATE TABLE {self.output_table} AS + SELECT * FROM + ( + SELECT {self.schema_madlib}.agg_array_concat( + ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x}, + {self.schema_madlib}.agg_array_concat( + ARRAY[{norm_tbl}.y]) AS {y}, + ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS buffer_id + FROM {norm_tbl} + GROUP BY buffer_id + ) b + {distributed_by_clause} + """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL, + y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL, + **locals()) + plpy.execute(sql) + plpy.execute("DROP TABLE IF EXISTS {0}".format(norm_tbl)) + # Create summary table + self._create_output_summary_table() + + def _create_output_summary_table(self): + class_level_str='NULL::TEXT' + if self.dependent_levels: + # Update dependent_levels to include NULL when + # num_classes > len(self.dependent_levels) + if self.num_classes: + self.dependent_levels.extend(['NULL' + for i in 
range(self.padding_size)]) + else: + self.num_classes = len(self.dependent_levels) + class_level_str=py_list_to_sql_string( + self.dependent_levels, array_type=self.dependent_vartype, + long_format=True) + if self.num_classes is None: + self.num_classes = 'NULL' + query = """ + CREATE TABLE {self.output_summary_table} AS + SELECT + $__madlib__${self.source_table}$__madlib__$::TEXT AS source_table, + $__madlib__${self.output_table}$__madlib__$::TEXT AS output_table, + $__madlib__${self.dependent_varname}$__madlib__$::TEXT AS {dependent_varname_colname}, + $__madlib__${self.independent_varname}$__madlib__$::TEXT AS {independent_varname_colname}, + $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS {dependent_vartype_colname}, + {class_level_str} AS {class_values_colname}, + {self.buffer_size} AS buffer_size, + {self.normalizing_const} AS {normalizing_const_colname}, + {self.num_classes} AS {num_classes_colname} + """.format(self=self, class_level_str=class_level_str, + dependent_varname_colname=DEPENDENT_VARNAME_COLNAME, + independent_varname_colname=INDEPENDENT_VARNAME_COLNAME, + dependent_vartype_colname=DEPENDENT_VARTYPE_COLNAME, + class_values_colname=CLASS_VALUES_COLNAME, + normalizing_const_colname=NORMALIZING_CONST_COLNAME, + num_classes_colname=NUM_CLASSES_COLNAME) + plpy.execute(query) + + def _validate_args(self): + validate_module_input_params( + self.source_table, self.output_table, self.independent_varname, + self.dependent_varname, self.module_name, None, + [self.output_summary_table]) + if self.buffer_size is not None: + _assert(self.buffer_size > 0, + "{0}: The buffer size has to be a " + "positive integer or NULL.".format(self.module_name)) + _assert(self.normalizing_const > 0, + "{0}: The normalizing constant has to be a " + "positive integer or NULL.".format(self.module_name)) + + def _set_validate_vartypes(self): + self.independent_vartype = get_expr_type(self.independent_varname, + self.source_table) + self.dependent_vartype = 
get_expr_type(self.dependent_varname, + self.source_table) + num_of_independent_cols = split_quoted_delimited_str(self.independent_varname) + _assert(len(num_of_independent_cols) == 1, + "Invalid independent_varname: only one column name is allowed " + "as input.") + _assert(is_valid_psql_type(self.independent_vartype, + NUMERIC | ONLY_ARRAY), + "Invalid independent variable type, should be an array of " + "one of {0}".format(','.join(NUMERIC))) + # The dependent variable needs to be either: + # 1. NUMERIC, TEXT OR BOOLEAN, which we always one-hot encode + # 2. NUMERIC ARRAY, which we assume it is already one-hot encoded, and we + # just cast it the INTEGER ARRAY + num_of_dependent_cols = split_quoted_delimited_str(self.dependent_varname) + _assert(len(num_of_dependent_cols) == 1, + "Invalid dependent_varname: only one column name is allowed " + "as input.") + _assert((is_valid_psql_type(self.dependent_vartype, NUMERIC | TEXT | BOOLEAN) or + is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY)), + """Invalid dependent variable type, should be one of the type in this list: + numeric, text, boolean, or numeric array""") + + def get_distinct_dependent_levels(self, table, dependent_varname, + dependent_vartype): + # Refactoring this out into the parent class to ensure include_nulls + # is passed in as true for both training and validation tables + return get_distinct_col_levels(table, dependent_varname, + dependent_vartype, include_nulls=True) + + def _get_num_buffers(self): + num_rows_in_tbl = plpy.execute(""" + SELECT count(*) AS cnt FROM {0} + """.format(self.source_table))[0]['cnt'] + buffer_size_calculator = MiniBatchBufferSizeCalculator() + indepdent_var_dim = _tbl_dimension_rownum( + self.schema_madlib, self.source_table, + self.independent_varname, skip_row_count=True) + self.buffer_size = buffer_size_calculator.calculate_default_buffer_size( + self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0]) + return ceil((1.0 * num_rows_in_tbl) / 
self.buffer_size) + +class ValidationDataPreprocessorDL(InputDataPreprocessorDL): + def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, + training_preprocessor_table, buffer_size, **kwargs): + """ + This prepares the variables that are required by + InputDataPreprocessorDL. + """ + self.module_name = "validation_preprocessor_dl" + self.training_preprocessor_table = training_preprocessor_table + summary_table = self._validate_and_process_training_preprocessor_table() + num_classes = summary_table[NUM_CLASSES_COLNAME] + InputDataPreprocessorDL.__init__( + self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, + summary_table[NORMALIZING_CONST_COLNAME], num_classes, + self.module_name) + # Update value of dependent_levels from training batch summary table. + self.dependent_levels = self._get_dependent_levels( + summary_table[CLASS_VALUES_COLNAME], + summary_table[DEPENDENT_VARTYPE_COLNAME]) + + def _get_dependent_levels(self, training_dependent_levels, + training_dependent_vartype): + # Validate that dep var type is exactly the same as what was in + # trainig_preprocessor_table's input. + _assert(self.dependent_vartype == training_dependent_vartype, + "{0}: the dependent variable's type in {1} must be {2}.".format( + self.module_name, self.source_table, + training_dependent_vartype)) + # training_dependent_levels is the class_values column from the + # training batch summary table. This already has the padding with + # NULLs in it based on num_classes that was provided to + # training_preprocessor_dl(). We have to work our way backwards + # to strip out those trailing NULLs from class_values, since + # they will anyway get added later in + # InputDataPreprocessorDL._set_one_hot_encoding_variables. 
+ dependent_levels = strip_trailing_nulls_from_class_values( + training_dependent_levels) + if training_dependent_levels: + dependent_levels_val_data = self.get_distinct_dependent_levels( + self.source_table, self.dependent_varname, + self.dependent_vartype) + unquoted_dependent_levels_val_data = [strip_end_quotes(level, "'") + for level in dependent_levels_val_data] + # Assert to check if the class values in validation data is a subset + # of the class values in training data. + _assert(set(unquoted_dependent_levels_val_data).issubset(set(dependent_levels)), + "{0}: the class values in {1} ({2}) should be a " + "subset of class values in {3} ({4})".format( + self.module_name, self.source_table, + unquoted_dependent_levels_val_data, + self.training_preprocessor_table, dependent_levels)) + if is_psql_char_type(self.dependent_vartype): + dependent_levels = [quote_literal(level) if level is not None else level + for level in dependent_levels] + return dependent_levels + + def _validate_and_process_training_preprocessor_table(self): + input_tbl_valid(self.training_preprocessor_table, self.module_name) + training_summary_table = add_postfix( + self.training_preprocessor_table, "_summary") + input_tbl_valid(training_summary_table, self.module_name) + summary_table = plpy.execute("SELECT * FROM {0} LIMIT 1".format( + training_summary_table))[0] + _assert(NORMALIZING_CONST_COLNAME in summary_table, + "{0}: Expected column {1} in {2}.".format( + self.module_name, NORMALIZING_CONST_COLNAME, + training_summary_table)) + _assert(CLASS_VALUES_COLNAME in summary_table, + "{0}: Expected column {1} in {2}.".format( + self.module_name, CLASS_VALUES_COLNAME, + training_summary_table)) + _assert(NUM_CLASSES_COLNAME in summary_table, + "{0}: Expected column {1} in {2}.".format( + self.module_name, NUM_CLASSES_COLNAME, + training_summary_table)) + _assert(DEPENDENT_VARTYPE_COLNAME in summary_table, + "{0}: Expected column {1} in {2}.".format( + self.module_name, 
DEPENDENT_VARTYPE_COLNAME, + training_summary_table)) + return summary_table + + def validation_preprocessor_dl(self): + self.input_preprocessor_dl(order_by_random=False) + +class TrainingDataPreprocessorDL(InputDataPreprocessorDL): + def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, + normalizing_const, num_classes, **kwargs): + InputDataPreprocessorDL.__init__( + self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, + normalizing_const, num_classes, "training_preprocessor_dl") + # Update default value of dependent_levels in superclass + self.dependent_levels = self._get_dependent_levels() + + def _get_dependent_levels(self): + if is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY): + dependent_levels = None + else: + dependent_levels = get_distinct_col_levels( + self.source_table, self.dependent_varname, + self.dependent_vartype, include_nulls=True) + return dependent_levels + + def training_preprocessor_dl(self): + self.input_preprocessor_dl(order_by_random=True) + +class InputDataPreprocessorDocumentation: + @staticmethod + def validation_preprocessor_dl_help(schema_madlib, message): + return "TODO: Fix me" + + @staticmethod + def training_preprocessor_dl_help(schema_madlib, message): + method = "training_preprocessor_dl" + summary = """ + ---------------------------------------------------------------- + SUMMARY + ---------------------------------------------------------------- + For Deep Learning based techniques such as Convolutional Neural Nets, + the input data is mostly images. These images can be represented as an + array of numbers where each element represents a pixel/color intensity. + It is standard practice to normalize the image data before use. 
+ minibatch_preprocessor() is for general use-cases, but for deep learning + based use-cases we provide training_preprocessor_dl() that is + light-weight and is specific to image datasets. + + The normalizing constant is parameterized, and can be specified based + on the kind of image data used. + + An optional param named num_classes can be used to specify the length + of the one-hot encoded array for the dependent variable. This value if + specified must be greater than equal to the total number of distinct + class values found in the input table. + + For more details on function usage: + SELECT {schema_madlib}.{method}('usage') + """.format(**locals()) + + usage = """ + --------------------------------------------------------------------------- + USAGE + --------------------------------------------------------------------------- + SELECT {schema_madlib}.{method}( + source_table, -- TEXT. Name of the table containing input + data. Can also be a view + output_table, -- TEXT. Name of the output table for + mini-batching + dependent_varname, -- TEXT. Name of the dependent variable column + independent_varname, -- TEXT. Name of the independent variable + column + buffer_size -- INTEGER. Default computed automatically. + Number of source input rows to pack into a buffer + normalizing_const -- DOUBLE PRECISON. Default 1.0. The + normalizing constant to use for + standardizing arrays in independent_varname. + num_classes -- INTEGER. Default NULL. Number of class labels + to be considered for 1-hot encoding. If NULL, + the 1-hot encoded array length will be equal to + the number of distinct class values found in the + input table. + ); + + + --------------------------------------------------------------------------- + OUTPUT + --------------------------------------------------------------------------- + The output table produced by MiniBatch Preprocessor contains the + following columns: + + buffer_id -- INTEGER. Unique id for packed table. 
+ dependent_varname -- ANYARRAY[]. Packed array of dependent variables. + independent_varname -- REAL[]. Packed array of independent + variables. + + --------------------------------------------------------------------------- + The algorithm also creates a summary table named <output_table>_summary + that has the following columns: + + source_table -- Source table name. + output_table -- Output table name from preprocessor. + dependent_varname -- Dependent variable values from the original table + (encoded by one_hot_encode, if specified). + independent_varname -- Independent variable values from the original + table. + dependent_vartype -- Type of the dependent variable from the + original table. + class_values -- Class values of the dependent variable + (‘NULL’(as TEXT type) for non + categorical vars). + buffer_size -- Buffer size used in preprocessing step. + normalizing_const -- Normalizing constant used for standardizing + arrays in independent_varname. + + --------------------------------------------------------------------------- + """.format(**locals()) + + if not message: + return summary + elif message.lower() in ('usage', 'help', '?'): + return usage + return """ + No such option. Use "SELECT {schema_madlib}.training_preprocessor_dl()" + for help. + """.format(**locals()) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in similarity index 86% rename from src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in rename to src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in index 3fe17d0..9c9cd53 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.sql_in @@ -17,7 +17,7 @@ * specific language governing permissions and limitations * under the License. 
* - * @file minibatch_preprocessing_dl.sql_in + * @file input_preprocessor_dl.sql_in * @brief TODO * @date December 2018 * @@ -27,13 +27,13 @@ m4_include(`SQLCommon.m4') /** -@addtogroup grp_minibatch_preprocessing_dl +@addtogroup grp_input_preprocessor_dl @brief Utility that prepares input image data for use by models that support mini-batch as an optimization option. <div class="toc"><b>Contents</b><ul> -<li class="level1"><a href="#minibatch_preprocessor_dl">Mini-Batch Preprocessor for Image Data</a></li> +<li class="level1"><a href="#input_preprocessor_dl">Input Preprocessor for Image Data</a></li> <li class="level1"><a href="#example">Examples</a></li> <li class="level1"><a href="#related">Related Topics</a></li> </ul></div> @@ -50,7 +50,7 @@ for image data. A separate more general minibatch_preprocessor() is also available for other MADlib modules using non-image input data. <pre class="syntax"> -minibatch_preprocessor_dl( source_table, +training_preprocessor_dl( source_table, output_table, dependent_varname, independent_varname, @@ -95,7 +95,7 @@ minibatch_preprocessor_dl( source_table, output table. The default value is computed considering size of the source table, number of independent variables, and number of segments in the database cluster. - @note minibatch_preprocessor_dl tries to pack data and distribute it + @note input_preprocessor_dl tries to pack data and distribute it evenly based on the number of input rows. Sometimes you don't necessarily get the exact same number of rows in one pack as you specified in buffer_size. @@ -128,7 +128,7 @@ minibatch_preprocessor_dl( source_table, <td>ANYARRAY[]. Packed array of dependent variables. The dependent variable is always one-hot encoded as an INTEGER[] array. For now, we are assuming that - minibatch_preprocessor_dl will be used + input_preprocessor_dl will be used only for classification problems using deep learning. 
So the dependent variable is one-hot encoded, unless it's already a numeric array in which case we assume it's already one-hot @@ -259,7 +259,7 @@ SELECT * FROM image_data; -# Run the preprocessor for image data: <pre class="example"> DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary; -SELECT madlib.minibatch_preprocessor_dl('image_data', -- Source table +SELECT madlib.input_preprocessor_dl('image_data', -- Source table 'image_data_packed', -- Output table 'species', -- Dependent variable 'rgb', -- Independent variable @@ -397,7 +397,7 @@ SELECT * FROM image_data; -# Run the preprocessor for image data: <pre class="example"> DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary; -SELECT madlib.minibatch_preprocessor_dl('image_data', -- Source table +SELECT madlib.input_preprocessor_dl('image_data', -- Source table 'image_data_packed', -- Output table 'species', -- Dependent variable 'rgb', -- Independent variable @@ -429,7 +429,7 @@ buffer_id | 2 but if you have occasion to change it: <pre class="example"> DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary; -SELECT madlib.minibatch_preprocessor_dl('image_data', -- Source table +SELECT madlib.input_preprocessor_dl('image_data', -- Source table 'image_data_packed', -- Output table 'species', -- Dependent variable 'rgb', -- Independent variable @@ -462,7 +462,7 @@ buffer_size | 10 -# Run the preprocessor for image data with num_classes greater than 3 (distinct class values found in table): <pre class="example"> DROP TABLE IF EXISTS image_data_packed, image_data_packed_summary; -SELECT madlib.minibatch_preprocessor_dl('image_data', -- Source table +SELECT madlib.input_preprocessor_dl('image_data', -- Source table 'image_data_packed', -- Output table 'species', -- Dependent variable 'rgb', -- Independent variable @@ -510,13 +510,59 @@ normalizing_const | 255.0 @anchor related @par Related Topics -minibatch_preprocessing_dl.sql_in +input_preprocessor_dl.sql_in 
minibatch_preprocessing.sql_in */ -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.validation_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + training_preprocessor_table VARCHAR, + buffer_size INTEGER +) RETURNS VOID AS $$ + PythonFunctionBodyOnly(deep_learning, input_data_preprocessor) + from utilities.control import MinWarning + with AOControl(False): + with MinWarning('error'): + validation_preprocessor_obj = input_data_preprocessor.ValidationDataPreprocessorDL(**globals()) + validation_preprocessor_obj.validation_preprocessor_dl() +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.validation_preprocessor_dl( + source_table VARCHAR, + output_table VARCHAR, + dependent_varname VARCHAR, + independent_varname VARCHAR, + training_preprocessor_table VARCHAR +) RETURNS VOID AS $$ + SELECT MADLIB_SCHEMA.validation_preprocessor_dl($1, $2, $3, $4, $5, NULL); +$$ LANGUAGE sql VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.validation_preprocessor_dl( + message VARCHAR +) RETURNS VARCHAR AS $$ + PythonFunctionBodyOnly(deep_learning, input_data_preprocessor) + return input_data_preprocessor.InputDataPreprocessorDocumentation.validation_preprocessor_dl_help(schema_madlib, message) +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.validation_preprocessor_dl() +RETURNS VARCHAR AS $$ + PythonFunctionBodyOnly(deep_learning, input_data_preprocessor) + return input_data_preprocessor.InputDataPreprocessorDocumentation.validation_preprocessor_dl_help(schema_madlib, '') +$$ LANGUAGE plpythonu VOLATILE +m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); + 
+------------------------------------------------------------------------------- + + +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl( source_table VARCHAR, output_table VARCHAR, dependent_varname VARCHAR, @@ -525,16 +571,16 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( normalizing_const DOUBLE PRECISION, num_classes INTEGER ) RETURNS VOID AS $$ - PythonFunctionBodyOnly(utilities, minibatch_preprocessing) + PythonFunctionBodyOnly(deep_learning, input_data_preprocessor) from utilities.control import MinWarning with AOControl(False): with MinWarning('error'): - minibatch_preprocessor_obj = minibatch_preprocessing.MiniBatchPreProcessorDL(**globals()) - minibatch_preprocessor_obj.minibatch_preprocessor_dl() + training_preprocessor_obj = input_data_preprocessor.TrainingDataPreprocessorDL(**globals()) + training_preprocessor_obj.training_preprocessor_dl() $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl( source_table VARCHAR, output_table VARCHAR, dependent_varname VARCHAR, @@ -542,43 +588,43 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( buffer_size INTEGER, normalizing_const DOUBLE PRECISION ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL); + SELECT MADLIB_SCHEMA.training_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl( source_table VARCHAR, output_table VARCHAR, dependent_varname VARCHAR, independent_varname VARCHAR, buffer_size INTEGER ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 1.0, NULL); + SELECT 
MADLIB_SCHEMA.training_preprocessor_dl($1, $2, $3, $4, $5, 1.0, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl( source_table VARCHAR, output_table VARCHAR, dependent_varname VARCHAR, independent_varname VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 1.0, NULL); + SELECT MADLIB_SCHEMA.training_preprocessor_dl($1, $2, $3, $4, NULL, 1.0, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl( message VARCHAR ) RETURNS VARCHAR AS $$ - PythonFunctionBodyOnly(utilities, minibatch_preprocessing) - return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, message) + PythonFunctionBodyOnly(deep_learning, input_data_preprocessor) + return input_data_preprocessor.InputDataPreprocessorDocumentation.training_preprocessor_dl_help(schema_madlib, message) $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); -CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl() +CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.training_preprocessor_dl() RETURNS VARCHAR AS $$ - PythonFunctionBodyOnly(utilities, minibatch_preprocessing) - return minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib, '') + PythonFunctionBodyOnly(deep_learning, input_data_preprocessor) + return input_data_preprocessor.InputDataPreprocessorDocumentation.training_preprocessor_dl_help(schema_madlib, '') $$ LANGUAGE plpythonu VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in 
b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in index 445b5b9..171f5bd 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in @@ -27,12 +27,48 @@ def expand_input_dims(input_data, target_type=None): input_data = input_data.astype(target_type) return input_data +def strip_trailing_nulls_from_class_values(class_values): + """ + class_values is a list of unique class levels in training data. This + could have multiple Nones in it, and this function strips out all the + Nones that occur after the first element in the list. + Examples: + 1) input class_values = ['cat', 'dog'] + output class_values = ['cat', 'dog'] + + 2) input class_values = [None, 'cat', 'dog'] + output class_values = [None, 'cat', 'dog'] + + 3) input class_values = [None, 'cat', 'dog', None, None] + output class_values = [None, 'cat', 'dog'] + + 4) input class_values = ['cat', 'dog', None, None] + output class_values = ['cat', 'dog'] + + 5) input class_values = [None, None] + output class_values = [None] + @args: + @param: class_values, list + @returns: + updated class_values list + """ + num_of_valid_class_values = 0 + if class_values is not None: + for ele in class_values: + if ele is None and num_of_valid_class_values > 0: + break + num_of_valid_class_values += 1 + # Pass only the valid class_values for creating columns + class_values = class_values[:num_of_valid_class_values] + return class_values + # Name of columns in model summary table. 
CLASS_VALUES_COLNAME = "class_values" NORMALIZING_CONST_COLNAME = "normalizing_const" COMPILE_PARAMS_COLNAME = "compile_params" DEPENDENT_VARNAME_COLNAME = "dependent_varname" DEPENDENT_VARTYPE_COLNAME = "dependent_vartype" +INDEPENDENT_VARNAME_COLNAME = "independent_varname" MODEL_ARCH_TABLE_COLNAME = "model_arch_table" MODEL_ARCH_ID_COLNAME = "model_arch_id" MODEL_DATA_COLNAME = "model_data" diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in index 3ad3bbf..75295bf 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_predict.py_in @@ -29,6 +29,7 @@ import numpy as np from madlib_keras_helper import expand_input_dims from madlib_keras_helper import MODEL_DATA_COLNAME +from madlib_keras_helper import strip_trailing_nulls_from_class_values from madlib_keras_validator import PredictInputValidator from madlib_keras_wrapper import get_device_name_and_set_cuda_env from madlib_keras_wrapper import set_model_weights @@ -43,41 +44,6 @@ import madlib_keras_serializer MODULE_NAME = 'madlib_keras_predict' -def _strip_trailing_nulls_from_class_values(class_values): - """ - class_values is a list of unique class levels in training data. This - could have multiple Nones in it, and this function strips out all the - Nones that occur after the first element in the list. 
- Examples: - 1) input class_values = ['cat', 'dog'] - output class_values = ['cat', 'dog'] - - 2) input class_values = [None, 'cat', 'dog'] - output class_values = [None, 'cat', 'dog'] - - 3) input class_values = [None, 'cat', 'dog', None, None] - output class_values = [None, 'cat', 'dog'] - - 4) input class_values = ['cat', 'dog', None, None] - output class_values = ['cat', 'dog'] - - 5) input class_values = [None, None] - output class_values = [None] - @args: - @param: class_values, list - @returns: - updated class_values list - """ - num_of_valid_class_values = 0 - if class_values is not None: - for ele in class_values: - if ele is None and num_of_valid_class_values > 0: - break - num_of_valid_class_values += 1 - # Pass only the valid class_values for creating columns - class_values = class_values[:num_of_valid_class_values] - return class_values - def predict(schema_madlib, model_table, test_table, id_col, independent_varname, output_table, pred_type, gpus_per_host, **kwargs): if not pred_type: @@ -106,7 +72,7 @@ def predict(schema_madlib, model_table, test_table, id_col, pred_col_name = "prob" pred_col_type = 'double precision' - class_values = _strip_trailing_nulls_from_class_values(class_values) + class_values = strip_trailing_nulls_from_class_values(class_values) prediction_select_clause = create_cols_from_array_sql_string( class_values, intermediate_col, pred_col_name, diff --git a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in similarity index 61% rename from src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in rename to src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in index 0e64c07..62014d9 100644 --- a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/deep_learning/test/input_data_preprocessor.sql_in @@ -19,9 +19,9 @@ * *//* 
----------------------------------------------------------------------- */ -DROP TABLE IF EXISTS minibatch_preprocessor_dl_input; -CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], label TEXT); -INSERT INTO minibatch_preprocessor_dl_input(x, label) VALUES +DROP TABLE IF EXISTS data_preprocessor_input; +CREATE TABLE data_preprocessor_input(id serial, x double precision[], label TEXT); +INSERT INTO data_preprocessor_input(x, label) VALUES (ARRAY[1,2,3,4,5,6], 'a'), (ARRAY[11,2,3,4,5,6], 'a'), (ARRAY[11,22,33,4,5,6], 'a'), @@ -40,36 +40,36 @@ INSERT INTO minibatch_preprocessor_dl_input(x, label) VALUES (ARRAY[11,22,33,44,65,56], 'b'), (ARRAY[11,22,33,44,65,56], 'b'); -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input', + 'data_preprocessor_input_batch', 'id', 'x', 5); -SELECT assert(count(*)=4, 'Incorrect number of buffers in minibatch_preprocessor_dl_batch.') -FROM minibatch_preprocessor_dl_batch; +SELECT assert(count(*)=4, 'Incorrect number of buffers in data_preprocessor_input_batch.') +FROM data_preprocessor_input_batch; SELECT assert(array_upper(independent_var, 2)=6, 'Incorrect buffer size.') -FROM minibatch_preprocessor_dl_batch WHERE buffer_id=0; +FROM data_preprocessor_input_batch WHERE buffer_id=0; SELECT assert(array_upper(independent_var, 1)=5, 'Incorrect buffer size.') -FROM minibatch_preprocessor_dl_batch WHERE buffer_id=1; +FROM data_preprocessor_input_batch WHERE buffer_id=1; SELECT assert(array_upper(independent_var, 1)=4, 'Incorrect buffer size.') -FROM minibatch_preprocessor_dl_batch WHERE buffer_id=3; +FROM data_preprocessor_input_batch WHERE buffer_id=3; -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, 
minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input', + 'data_preprocessor_input_batch', 'label', 'x'); -DROP TABLE IF EXISTS minibatch_preprocessor_dl_input; -CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[], y INTEGER, y1 BOOLEAN, y2 TEXT, y3 DOUBLE PRECISION, y4 DOUBLE PRECISION[], y5 INTEGER[]); -INSERT INTO minibatch_preprocessor_dl_input(x, y, y1, y2, y3, y4, y5) VALUES +DROP TABLE IF EXISTS data_preprocessor_input; +CREATE TABLE data_preprocessor_input(id serial, x double precision[], y INTEGER, y1 BOOLEAN, y2 TEXT, y3 DOUBLE PRECISION, y4 DOUBLE PRECISION[], y5 INTEGER[]); +INSERT INTO data_preprocessor_input(x, y, y1, y2, y3, y4, y5) VALUES (ARRAY[1,2,3,4,5,6], 4, TRUE, 'a', 4.0, ARRAY[1.0, 0.0], ARRAY[1,0]), (ARRAY[11,2,3,4,5,6], 3, TRUE, 'c', 4.2, ARRAY[0.0, 1.0], ARRAY[1,0]), (ARRAY[11,22,33,4,5,6], 8, TRUE, 'a', 4.0, ARRAY[0.0, 1.0], ARRAY[1,0]), @@ -88,10 +88,10 @@ INSERT INTO minibatch_preprocessor_dl_input(x, y, y1, y2, y3, y4, y5) VALUES (ARRAY[11,22,33,44,65,56], -3, FALSE, 'a', 4.2, ARRAY[1.0, 0.0], ARRAY[1,0]), (ARRAY[11,22,33,44,65,56], -1, TRUE, 'b', 4.0, ARRAY[0.0, 1.0], ARRAY[0,1]); -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input', + 'data_preprocessor_input_batch', 'y', 'x', 4, @@ -100,17 +100,17 @@ SELECT minibatch_preprocessor_dl( ); -- Test that independent vars get divided by 5, by verifying min value goes from 1 to 0.2, and max value from 233 to 46.6
-SELECT assert(relative_error(MIN(x),0.2) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_var) as x FROM minibatch_preprocessor_dl_batch) a; -SELECT assert(relative_error(MAX(x),46.6) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_var) as x FROM minibatch_preprocessor_dl_batch) a; +SELECT assert(relative_error(MIN(x),0.2) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_var) as x FROM data_preprocessor_input_batch) a; +SELECT assert(relative_error(MAX(x),46.6) < 0.00001, 'Independent var not normalized properly!') FROM (SELECT UNNEST(independent_var) as x FROM data_preprocessor_input_batch) a; -- Test that 1-hot encoded array is of length 16 (num_classes) SELECT assert(array_upper(dependent_var, 2) = 16, 'Incorrect one-hot encode dimension with num_classes') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; + data_preprocessor_input_batch WHERE buffer_id = 0; -- Test summary table SELECT assert ( - source_table = 'minibatch_preprocessor_dl_input' AND - output_table = 'minibatch_preprocessor_dl_batch' AND + source_table = 'data_preprocessor_input' AND + output_table = 'data_preprocessor_input_batch' AND dependent_varname = 'y' AND independent_varname = 'x' AND dependent_vartype = 'integer' AND @@ -118,103 +118,103 @@ SELECT assert buffer_size = 4 AND -- we sort the class values in python normalizing_const = 5, 'Summary Validation failed. 
Actual:' || __to_char(summary) - ) from (select * from minibatch_preprocessor_dl_batch_summary) summary; + ) from (select * from data_preprocessor_input_batch_summary) summary; -- Test one-hot encoding for dependent_var -- test boolean type -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input', + 'data_preprocessor_input_batch', 'y1', 'x', 4, 5); -SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM minibatch_preprocessor_dl_batch WHERE buffer_id = 0; +SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; SELECT assert(array_upper(dependent_var, 2) = 2, 'Incorrect one-hot encode dimension') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; -SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST(dependent_var[1:1]) as y FROM minibatch_preprocessor_dl_batch) a WHERE buffer_id = 0; + data_preprocessor_input_batch WHERE buffer_id = 0; +SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) a WHERE buffer_id = 0; SELECT assert (dependent_vartype = 'boolean' AND class_values = '{f,t}', 'Summary Validation failed. 
Actual:' || __to_char(summary) - ) from (select * from minibatch_preprocessor_dl_batch_summary) summary; + ) from (select * from data_preprocessor_input_batch_summary) summary; -- test text type -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input', + 'data_preprocessor_input_batch', 'y2', 'x', 4, 5); -SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM minibatch_preprocessor_dl_batch WHERE buffer_id = 0; +SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; SELECT assert(array_upper(dependent_var, 2) = 3, 'Incorrect one-hot encode dimension') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; -SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST(dependent_var[1:1]) as y FROM minibatch_preprocessor_dl_batch) a WHERE buffer_id = 0; + data_preprocessor_input_batch WHERE buffer_id = 0; +SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) a WHERE buffer_id = 0; SELECT assert (dependent_vartype = 'text' AND class_values = '{a,b,c}', 'Summary Validation failed. 
Actual:' || __to_char(summary) - ) from (select * from minibatch_preprocessor_dl_batch_summary) summary; + ) from (select * from data_preprocessor_input_batch_summary) summary; -- test double precision type -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input', + 'data_preprocessor_input_batch', 'y3', 'x', 4, 5); -SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM minibatch_preprocessor_dl_batch WHERE buffer_id = 0; +SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; SELECT assert(array_upper(dependent_var, 2) = 3, 'Incorrect one-hot encode dimension') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; -SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST(dependent_var[1:1]) as y FROM minibatch_preprocessor_dl_batch) a WHERE buffer_id = 0; + data_preprocessor_input_batch WHERE buffer_id = 0; +SELECT assert(SUM(y) = 1, 'Incorrect one-hot encode format') FROM (SELECT buffer_id, UNNEST(dependent_var[1:1]) as y FROM data_preprocessor_input_batch) a WHERE buffer_id = 0; SELECT assert (dependent_vartype = 'double precision' AND class_values = '{4.0,4.2,5.0}', 'Summary Validation failed. 
Actual:' || __to_char(summary) - ) from (select * from minibatch_preprocessor_dl_batch_summary) summary; + ) from (select * from data_preprocessor_input_batch_summary) summary; -- test double precision array type -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input', + 'data_preprocessor_input_batch', 'y4', 'x', 4, 5); -SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM minibatch_preprocessor_dl_batch WHERE buffer_id = 0; +SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; SELECT assert(array_upper(dependent_var, 2) = 2, 'Incorrect one-hot encode dimension') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; -SELECT assert(relative_error(SUM(y), SUM(y4)) < 0.000001, 'Incorrect one-hot encode value') FROM (SELECT UNNEST(dependent_var) AS y FROM minibatch_preprocessor_dl_batch) a, (SELECT UNNEST(y4) as y4 FROM minibatch_preprocessor_dl_input) b; + data_preprocessor_input_batch WHERE buffer_id = 0; +SELECT assert(relative_error(SUM(y), SUM(y4)) < 0.000001, 'Incorrect one-hot encode value') FROM (SELECT UNNEST(dependent_var) AS y FROM data_preprocessor_input_batch) a, (SELECT UNNEST(y4) as y4 FROM data_preprocessor_input) b; SELECT assert (dependent_vartype = 'double precision[]' AND class_values IS NULL, 'Summary Validation failed. 
Actual:' || __to_char(summary) - ) from (select * from minibatch_preprocessor_dl_batch_summary) summary; + ) from (select * from data_preprocessor_input_batch_summary) summary; -- test integer array type -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input', + 'data_preprocessor_input_batch', 'y5', 'x', 4, 5); -SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM minibatch_preprocessor_dl_batch WHERE buffer_id = 0; +SELECT assert(pg_typeof(dependent_var) = 'integer[]'::regtype, 'One-hot encode doesn''t convert into integer array format') FROM data_preprocessor_input_batch WHERE buffer_id = 0; SELECT assert(array_upper(dependent_var, 2) = 2, 'Incorrect one-hot encode dimension') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; -SELECT assert(relative_error(SUM(y), SUM(y5)) < 0.000001, 'Incorrect one-hot encode value') FROM (SELECT UNNEST(dependent_var) AS y FROM minibatch_preprocessor_dl_batch) a, (SELECT UNNEST(y5) as y5 FROM minibatch_preprocessor_dl_input) b; + data_preprocessor_input_batch WHERE buffer_id = 0; +SELECT assert(relative_error(SUM(y), SUM(y5)) < 0.000001, 'Incorrect one-hot encode value') FROM (SELECT UNNEST(dependent_var) AS y FROM data_preprocessor_input_batch) a, (SELECT UNNEST(y5) as y5 FROM data_preprocessor_input) b; SELECT assert (dependent_vartype = 'integer[]' AND class_values IS NULL, 'Summary Validation failed. 
Actual:' || __to_char(summary) - ) from (select * from minibatch_preprocessor_dl_batch_summary) summary; + ) from (select * from data_preprocessor_input_batch_summary) summary; -- Test cases with NULL in class values -DROP TABLE IF EXISTS minibatch_preprocessor_dl_input_null; -CREATE TABLE minibatch_preprocessor_dl_input_null(id serial, x double precision[], label TEXT); -INSERT INTO minibatch_preprocessor_dl_input_null(x, label) VALUES +DROP TABLE IF EXISTS data_preprocessor_input_null; +CREATE TABLE data_preprocessor_input_null(id serial, x double precision[], label TEXT); +INSERT INTO data_preprocessor_input_null(x, label) VALUES (ARRAY[1,2,3,4,5,6], 'a'), (ARRAY[11,2,3,4,5,6], 'a'), (ARRAY[11,22,33,4,5,6], NULL), @@ -233,10 +233,10 @@ INSERT INTO minibatch_preprocessor_dl_input_null(x, label) VALUES (ARRAY[11,22,33,44,65,56], 'b'), (ARRAY[11,22,33,44,65,56], NULL); -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input_null', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input_null', + 'data_preprocessor_input_batch', 'label', 'x', 4, @@ -250,22 +250,22 @@ SELECT assert ( class_values = '{NULL,a,b,NULL,NULL}', 'Summary Validation failed with NULL data. Actual:' || __to_char(summary) - ) from (select * from minibatch_preprocessor_dl_batch_summary) summary; + ) from (select * from data_preprocessor_input_batch_summary) summary; SELECT assert(array_upper(dependent_var, 2) = 5, 'Incorrect one-hot encode dimension with NULL data') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; + data_preprocessor_input_batch WHERE buffer_id = 0; -- Test the content of 1-hot encoded dep var when NULL is the -- class label. 
-DROP TABLE IF EXISTS minibatch_preprocessor_dl_input_null; -CREATE TABLE minibatch_preprocessor_dl_input_null(id serial, x double precision[], label TEXT); -INSERT INTO minibatch_preprocessor_dl_input_null(x, label) VALUES +DROP TABLE IF EXISTS data_preprocessor_input_null; +CREATE TABLE data_preprocessor_input_null(id serial, x double precision[], label TEXT); +INSERT INTO data_preprocessor_input_null(x, label) VALUES (ARRAY[11,22,33,4,5,6], NULL); -DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch, minibatch_preprocessor_dl_batch_summary; -SELECT minibatch_preprocessor_dl( - 'minibatch_preprocessor_dl_input_null', - 'minibatch_preprocessor_dl_batch', +DROP TABLE IF EXISTS data_preprocessor_input_batch, data_preprocessor_input_batch_summary; +SELECT training_preprocessor_dl( + 'data_preprocessor_input_null', + 'data_preprocessor_input_batch', 'label', 'x', 4, @@ -280,11 +280,11 @@ SELECT assert ( class_values = '{NULL,NULL,NULL}', 'Summary Validation failed with NULL data. Actual:' || __to_char(summary) - ) from (select * from minibatch_preprocessor_dl_batch_summary) summary; + ) from (select * from data_preprocessor_input_batch_summary) summary; SELECT assert(array_upper(dependent_var, 2) = 3, 'Incorrect one-hot encode dimension with NULL data') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; + data_preprocessor_input_batch WHERE buffer_id = 0; -- NULL is treated as a class label, so it should show '1' for the -- first index SELECT assert(dependent_var = '{{1,0,0}}', 'Incorrect one-hot encode dimension with NULL data') FROM - minibatch_preprocessor_dl_batch WHERE buffer_id = 0; + data_preprocessor_input_batch WHERE buffer_id = 0; diff --git a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in index 2421f5f..15d4725 100644 --- a/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in +++ b/src/ports/postgres/modules/deep_learning/test/madlib_keras.sql_in @@ 
-38,7 +38,7 @@ copy cifar_10_sample_val from stdin delimiter '|'; -- update cifar_10_sample_val SET independent_var = array_scalar_mult(independent_var::real[], (1/255.0)::real); -- Prepare the minibatched data manually instead of calling --- minibatch_preprocessor_dl since it internally calls array_scalar_mult. +-- training_preprocessor_dl since it internally calls array_scalar_mult. -- Please refer to MADLIB-1326 for more details on the issue. DROP TABLE IF EXISTS cifar_10_sample_batched; @@ -402,7 +402,7 @@ WHERE y='cat'; DROP TABLE IF EXISTS cifar_10_sample_text_batched; DROP TABLE IF EXISTS cifar_10_sample_text_batched_summary; -SELECT minibatch_preprocessor_dl('cifar_10_sample_text','cifar_10_sample_text_batched','y','x', 2, 255, 5); +SELECT training_preprocessor_dl('cifar_10_sample_text','cifar_10_sample_text_batched','y','x', 2, 255, 5); -- Change model_arch to reflect 5 num_classes DROP TABLE IF EXISTS model_arch; @@ -557,7 +557,7 @@ WHERE y = 1; DROP TABLE IF EXISTS cifar_10_sample_int_batched; DROP TABLE IF EXISTS cifar_10_sample_int_batched_summary; -SELECT minibatch_preprocessor_dl('cifar_10_sample','cifar_10_sample_int_batched','y','x', 2, 255, 5); +SELECT training_preprocessor_dl('cifar_10_sample','cifar_10_sample_int_batched','y','x', 2, 255, 5); DROP TABLE IF EXISTS keras_saved_out, keras_saved_out_summary; SELECT madlib_keras_fit( @@ -632,7 +632,7 @@ copy cifar_10_sample_test_shape from stdin delimiter '|'; DROP TABLE IF EXISTS cifar_10_sample_test_shape_batched; DROP TABLE IF EXISTS cifar_10_sample_test_shape_batched_summary; -SELECT minibatch_preprocessor_dl('cifar_10_sample_test_shape','cifar_10_sample_test_shape_batched','y','x', NULL, 255, 3); +SELECT training_preprocessor_dl('cifar_10_sample_test_shape','cifar_10_sample_test_shape_batched','y','x', NULL, 255, 3); -- Change model_arch to reflect channels_first DROP TABLE IF EXISTS model_arch; diff --git 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in new file mode 100644 index 0000000..3cb735a --- /dev/null +++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_input_data_preprocessor.py_in @@ -0,0 +1,294 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import sys +import numpy as np +from os import path + +# Add modules to the pythonpath. 
+sys.path.append(path.dirname(path.dirname(path.dirname(path.dirname(path.abspath(__file__)))))) +sys.path.append(path.dirname(path.dirname(path.dirname(path.abspath(__file__))))) + +import unittest +from mock import * +import plpy_mock as plpy + +m4_changequote(`<!', `!>') + +class AnyStringWith(str): + def __eq__(self, other): + return self in other + +class InputPreProcessorDLTestCase(unittest.TestCase): + def setUp(self): + self.plpy_mock = Mock(spec='error') + patches = { + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock(), + } + # we need to use MagicMock() instead of Mock() for the plpy.execute mock + # to be able to iterate on the return value + self.plpy_mock_execute = MagicMock() + plpy.execute = self.plpy_mock_execute + + self.module_patcher = patch.dict('sys.modules', patches) + self.module_patcher.start() + + self.default_schema_madlib = "madlib" + self.default_source_table = "source" + self.default_output_table = "output" + self.default_dep_var = "depvar" + self.default_ind_var = "indvar" + self.default_buffer_size = 5 + self.default_normalizing_const = 1.0 + self.default_num_classes = None + self.default_module_name = "dummy" + + import deep_learning.input_data_preprocessor + self.module = deep_learning.input_data_preprocessor + import utilities.minibatch_preprocessing + self.util_module = utilities.minibatch_preprocessing + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + self.module.validate_module_input_params = Mock() + self.module.get_distinct_col_levels = Mock(return_value = [0,22,100]) + + def tearDown(self): + self.module_patcher.stop() + + def test_input_preprocessor_dl_executes_query(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + preprocessor_obj = self.module.InputDataPreprocessorDL( + self.default_schema_madlib, + "input", + "out", + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + 
self.default_num_classes, + self.default_module_name) + preprocessor_obj.input_preprocessor_dl() + + def test_input_preprocessor_null_buffer_size_executes_query(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + preprocessor_obj = self.module.InputDataPreprocessorDL( + self.default_schema_madlib, + "input", + "out", + self.default_dep_var, + self.default_ind_var, + None, + self.default_normalizing_const, + self.default_num_classes, + self.default_module_name) + self.util_module.MiniBatchBufferSizeCalculator.calculate_default_buffer_size = Mock(return_value = 5) + preprocessor_obj.input_preprocessor_dl() + + def test_input_preprocessor_multiple_dep_var_raises_exception(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + with self.assertRaises(plpy.PLPYException): + self.module.InputDataPreprocessorDL( + self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + "y1,y2", + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + self.default_num_classes, + self.default_module_name) + + def test_input_preprocessor_multiple_indep_var_raises_exception(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + with self.assertRaises(plpy.PLPYException): + self.module.InputDataPreprocessorDL( + self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + "x1,x2", + self.default_buffer_size, + self.default_normalizing_const, + self.default_num_classes, + self.default_module_name) + + def test_input_preprocessor_buffer_size_zero_fails(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + with self.assertRaises(plpy.PLPYException): + self.module.InputDataPreprocessorDL( + self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + 0, + self.default_normalizing_const, 
+ self.default_num_classes, + self.default_module_name) + + def test_input_preprocessor_negative_buffer_size_fails(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + with self.assertRaises(plpy.PLPYException): + self.module.InputDataPreprocessorDL(self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + -1, + self.default_normalizing_const, + self.default_num_classes, + self.default_module_name) + + def test_input_preprocessor_invalid_indep_vartype_raises_exception(self): + self.module.get_expr_type = Mock(side_effect = ['integer', 'integer[]']) + with self.assertRaises(plpy.PLPYException): + self.module.InputDataPreprocessorDL(self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + self.default_num_classes, + self.default_module_name) + + def test_input_preprocessor_invalid_dep_vartype_raises_exception(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text[]']) + with self.assertRaises(plpy.PLPYException): + self.module.InputDataPreprocessorDL(self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + self.default_num_classes, + self.default_module_name) + + def test_input_preprocessor_normalizing_const_zero_fails(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + with self.assertRaises(plpy.PLPYException): + self.module.InputDataPreprocessorDL(self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + 0, + self.default_num_classes, + self.default_module_name) + + def 
test_input_preprocessor_negative_normalizing_const_fails(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + with self.assertRaises(plpy.PLPYException): + self.module.InputDataPreprocessorDL(self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + -1, + self.default_num_classes, + self.default_module_name) + + def test_get_one_hot_encoded_dep_var_expr_null_val(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text']) + self.module.get_distinct_col_levels = Mock(return_value = ["NULL", "'a'"]) + obj = self.module.InputDataPreprocessorDL( + self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + self.default_num_classes, + self.default_module_name) + obj.dependent_levels = ["NULL", "'a'"] + dep_var_array_expr = obj.get_one_hot_encoded_dep_var_expr() + self.assertEqual("array[({0}) is not distinct from null, ({0}) is not distinct from 'a']::integer[]". + format(self.default_dep_var), + dep_var_array_expr.lower()) + + def test_get_one_hot_encoded_dep_var_expr_numeric_array_val(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) + obj = self.module.InputDataPreprocessorDL( + self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + self.default_num_classes, + self.default_module_name) + dep_var_array_expr = obj.get_one_hot_encoded_dep_var_expr() + self.assertEqual("{0}::integer[]". 
+ format(self.default_dep_var), + dep_var_array_expr.lower()) + + def test_validate_num_classes_none(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text']) + obj = self.module.InputDataPreprocessorDL( + self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + None, + self.default_module_name) + obj.dependent_levels = ["dummy"] + self.assertEqual(0, obj.padding_size) + + def test_validate_num_classes_greater(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text']) + self.module._get_dependent_levels = Mock(return_value = ["'a'", "'b'", "'c'"]) + obj = self.module.TrainingDataPreprocessorDL( + self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + 5) + obj._set_one_hot_encoding_variables() + self.assertEqual(2, obj.padding_size) + + def test_validate_num_classes_lesser(self): + self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text']) + self.module.dependent_levels = Mock(return_value = ["'a'", "'b'", "'c'"]) + with self.assertRaises(plpy.PLPYException): + obj = self.module.TrainingDataPreprocessorDL( + self.default_schema_madlib, + self.default_source_table, + self.default_output_table, + self.default_dep_var, + self.default_ind_var, + self.default_buffer_size, + self.default_normalizing_const, + 2) + obj._set_one_hot_encoding_variables() + +if __name__ == '__main__': + unittest.main() + +# --------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in index 0bdd4d8..5eedd2e 100644 --- 
a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in @@ -761,7 +761,7 @@ class PredictInputPredTypeValidationTestCase(unittest.TestCase): self.subject.validate_pred_type(range(1598)) self.subject.validate_pred_type(None) -class MadlibKerasPredictTestCase(unittest.TestCase): +class MadlibKerasHelperTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { @@ -773,53 +773,35 @@ class MadlibKerasPredictTestCase(unittest.TestCase): self.module_patcher = patch.dict('sys.modules', patches) self.module_patcher.start() - import madlib_keras_predict - self.subject = madlib_keras_predict + import madlib_keras_helper + self.subject = madlib_keras_helper + self.input_data = [32, 32, 3] def tearDown(self): self.module_patcher.stop() + def test_expand_input_dims(self): + self.assertEqual(np.array(self.input_data).shape, (3,)) + res = self.subject.expand_input_dims(self.input_data) + self.assertEqual(res.shape, (1, 3)) + def test_strip_trailing_nulls_from_class_values(self): self.assertEqual(['cat', 'dog'], - self.subject._strip_trailing_nulls_from_class_values( + self.subject.strip_trailing_nulls_from_class_values( ['cat', 'dog'])) self.assertEqual([None, 'cat', 'dog'], - self.subject._strip_trailing_nulls_from_class_values( + self.subject.strip_trailing_nulls_from_class_values( [None, 'cat', 'dog'])) self.assertEqual([None, 'cat', 'dog'], - self.subject._strip_trailing_nulls_from_class_values( + self.subject.strip_trailing_nulls_from_class_values( [None, 'cat', 'dog', None, None])) self.assertEqual(['cat', 'dog'], - self.subject._strip_trailing_nulls_from_class_values( + self.subject.strip_trailing_nulls_from_class_values( ['cat', 'dog', None, None])) self.assertEqual([None], - self.subject._strip_trailing_nulls_from_class_values( + self.subject.strip_trailing_nulls_from_class_values( [None, None])) -class 
MadlibKerasHelperTestCase(unittest.TestCase): - def setUp(self): - self.plpy_mock = Mock(spec='error') - patches = { - 'plpy': plpy - } - - self.plpy_mock_execute = MagicMock() - plpy.execute = self.plpy_mock_execute - - self.module_patcher = patch.dict('sys.modules', patches) - self.module_patcher.start() - import madlib_keras_helper - self.subject = madlib_keras_helper - self.input_data = [32, 32, 3] - - def tearDown(self): - self.module_patcher.stop() - - def test_expand_input_dims(self): - self.assertEqual(np.array(self.input_data).shape, (3,)) - res = self.subject.expand_input_dims(self.input_data) - self.assertEqual(res.shape, (1, 3)) - if __name__ == '__main__': unittest.main() # --------------------------------------------------------------------- diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index 32eb3e5..03c8214 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -52,13 +52,6 @@ m4_changequote(`<!', `!>') MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" -# These are readonly variables, do not modify -#MADLIB-1300 Adding these variables for DL only at this time. -# For release 2.0 These will be removed and above variables can -# be used for regular and DL minibatch. 
-MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL = "dependent_var" -MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL = "independent_var" - class MiniBatchPreProcessor: """ This class is responsible for executing the main logic of mini batch @@ -343,210 +336,6 @@ class MiniBatchPreProcessor: """.format(**locals()) plpy.execute(query) - -class MiniBatchPreProcessorDL(): - def __init__(self, schema_madlib, source_table, output_table, - dependent_varname, independent_varname, buffer_size, - normalizing_const, num_classes, **kwargs): - self.schema_madlib = schema_madlib - self.source_table = source_table - self.output_table = output_table - self.dependent_varname = dependent_varname - self.independent_varname = independent_varname - self.buffer_size = buffer_size - self.normalizing_const = normalizing_const if normalizing_const is not None else 1.0 - self.num_classes = num_classes - self.module_name = "minibatch_preprocessor_DL" - self.output_summary_table = add_postfix(self.output_table, "_summary") - self.independent_vartype = get_expr_type(self.independent_varname, - self.source_table) - self.dependent_vartype = get_expr_type(self.dependent_varname, - self.source_table) - - self._validate_args() - self.num_of_buffers = self._get_num_buffers() - if is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY): - self.dependent_levels = None - else: - self.dependent_levels = get_distinct_col_levels( - self.source_table, self.dependent_varname, - self.dependent_vartype, include_nulls=True) - # if any class level was NULL in sql, that would show up as - # None in self.dependent_levels. Replace all None with NULL - # in the list. - self.dependent_levels = ['NULL' if level is None else level - for level in self.dependent_levels] - self._validate_num_classes() - # Find the number of padded zeros to include in 1-hot vector - self.padding_size = 0 - # Try computing padding_size after running all necessary validations. 
- if self.num_classes and self.dependent_levels: - self.padding_size = self.num_classes - len(self.dependent_levels) - - def _validate_num_classes(self): - if self.num_classes is not None and \ - self.num_classes < len(self.dependent_levels): - plpy.error("{0}: Invalid num_classes value specified. It must "\ - "be equal to or greater than distinct class values found "\ - "in table ({1}).".format( - self.module_name, len(self.dependent_levels))) - - def get_one_hot_encoded_dep_var_expr(self): - """ - :param dependent_varname: Name of the dependent variable - :param num_classes: Number of class values to consider in 1-hot - :return: - This function returns a tuple of - 1. A string with transformed dependent varname depending on it's type - 2. All the distinct dependent class levels encoded as a string - - If dep_type == numeric[] , do not encode - 1. dependent_varname = rings - transformed_value = ARRAY[rings] - 2. dependent_varname = ARRAY[a, b, c] - transformed_value = ARRAY[a, b, c] - else if dep_type in ("text", "boolean"), encode: - 3. dependent_varname = rings (encoding) - transformed_value = ARRAY[rings=1, rings=2, rings=3] - """ - # Assuming the input NUMERIC[] is already one_hot_encoded, - # so casting to INTEGER[] - if is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY): - return "{0}::INTEGER[]".format(self.dependent_varname) - - # For DL use case, we want to allow NULL as a valid class value, - # so the query must have 'IS NOT DISTINCT FROM' instead of '=' - # like in the generic get_one_hot_encoded_expr() defined in - # db_utils.py_in. We also have this optional 'num_classes' param - # that affects the logic of 1-hot encoding. Since this is very - # specific to minibatch_preprocessing_dl for now, let's keep - # it here instead of refactoring it out to a generic helper function. 
- one_hot_encoded_expr = ["({0}) IS NOT DISTINCT FROM {1}".format( - self.dependent_varname, c) for c in self.dependent_levels] - if self.num_classes: - one_hot_encoded_expr.extend(['false' - for i in range(self.padding_size)]) - return 'ARRAY[{0}]::INTEGER[]'.format( - ', '.join(one_hot_encoded_expr)) - - def minibatch_preprocessor_dl(self): - # Create a temp table that has independent var normalized. - norm_tbl = unique_string(desp='normalized') - - # Always one-hot encode the dependent var. For now, we are assuming - # that minibatch_preprocessor_dl will be used only for deep - # learning and mostly for classification. So make a strong - # assumption that it is only for classification, so one-hot - # encode the dep var, unless it's already a numeric array in - # which case we assume it's already one-hot encoded. - one_hot_dep_var_array_expr = \ - self.get_one_hot_encoded_dep_var_expr() - scalar_mult_sql = """ - CREATE TEMP TABLE {norm_tbl} AS - SELECT {self.schema_madlib}.array_scalar_mult( - {self.independent_varname}::REAL[], (1/{self.normalizing_const})::REAL) AS x_norm, - {one_hot_dep_var_array_expr} AS y, - row_number() over() AS row_id - FROM {self.source_table} order by random() - """.format(**locals()) - plpy.execute(scalar_mult_sql) - # Create the mini-batched output table - if is_platform_pg(): - distributed_by_clause = '' - else: - distributed_by_clause= ' DISTRIBUTED BY (buffer_id) ' - sql = """ - CREATE TABLE {self.output_table} AS - SELECT * FROM - ( - SELECT {self.schema_madlib}.agg_array_concat( - ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x}, - {self.schema_madlib}.agg_array_concat( - ARRAY[{norm_tbl}.y]) AS {y}, - ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS buffer_id - FROM {norm_tbl} - GROUP BY buffer_id - ) b - {distributed_by_clause} - """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL, - y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL, - **locals()) - plpy.execute(sql) - plpy.execute("DROP TABLE IF EXISTS {0}".format(norm_tbl)) - # Create 
summary table - self._create_output_summary_table() - - def _create_output_summary_table(self): - class_level_str='NULL::TEXT' - if self.dependent_levels: - # Update dependent_levels to include NULL when - # num_classes > len(self.dependent_levels) - if self.num_classes: - self.dependent_levels.extend(['NULL' - for i in range(self.padding_size)]) - class_level_str=py_list_to_sql_string( - self.dependent_levels, array_type=self.dependent_vartype, - long_format=True) - query = """ - CREATE TABLE {self.output_summary_table} AS - SELECT - $__madlib__${self.source_table}$__madlib__$::TEXT AS source_table, - $__madlib__${self.output_table}$__madlib__$::TEXT AS output_table, - $__madlib__${self.dependent_varname}$__madlib__$::TEXT AS dependent_varname, - $__madlib__${self.independent_varname}$__madlib__$::TEXT AS independent_varname, - $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS dependent_vartype, - {class_level_str} AS class_values, - {self.buffer_size} AS buffer_size, - {self.normalizing_const} AS normalizing_const - """.format(self=self, class_level_str=class_level_str) - plpy.execute(query) - - def _validate_args(self): - validate_module_input_params( - self.source_table, self.output_table, self.independent_varname, - self.dependent_varname, self.module_name, None, - [self.output_summary_table]) - num_of_independent_cols = split_quoted_delimited_str(self.independent_varname) - _assert(len(num_of_independent_cols) == 1, - "Invalid independent_varname: only one column name is allowed " - "as input.") - _assert(is_valid_psql_type(self.independent_vartype, - NUMERIC | ONLY_ARRAY), - "Invalid independent variable type, should be an array of " - "one of {0}".format(','.join(NUMERIC))) - # The denpendent variable needs to be either: - # 1. NUMERIC, TEXT OR BOOLEAN, which we always one-hot encode - # 2. 
NUMERIC ARRAY, which we assume it is already one-hot encoded, and we - # just cast it the INTEGER ARRAY - num_of_dependent_cols = split_quoted_delimited_str(self.dependent_varname) - _assert(len(num_of_dependent_cols) == 1, - "Invalid dependent_varname: only one column name is allowed " - "as input.") - _assert((is_valid_psql_type(self.dependent_vartype, NUMERIC | TEXT | BOOLEAN) or - is_valid_psql_type(self.dependent_vartype, NUMERIC | ONLY_ARRAY)), - """Invalid dependent variable type, should be one of the type in this list: - numeric, text, boolean, or numeric array""") - if self.buffer_size is not None: - _assert(self.buffer_size > 0, - "minibatch_preprocessor_dl: The buffer size has to be a " - "positive integer or NULL.") - _assert(self.normalizing_const > 0, - "minibatch_preprocessor_dl: The normalizing constant has to be a " - "positive integer or NULL.") - - def _get_num_buffers(self): - num_rows_in_tbl = plpy.execute(""" - SELECT count(*) AS cnt FROM {0} - """.format(self.source_table))[0]['cnt'] - buffer_size_calculator = MiniBatchBufferSizeCalculator() - indepdent_var_dim = _tbl_dimension_rownum( - self.schema_madlib, self.source_table, - self.independent_varname, skip_row_count=True) - self.buffer_size = buffer_size_calculator.calculate_default_buffer_size( - self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0]) - return ceil((1.0 * num_rows_in_tbl) / self.buffer_size) - - class MiniBatchStandardizer: """ This class is responsible for @@ -791,96 +580,3 @@ class MiniBatchDocumentation: for help. 
""".format(**locals()) # --------------------------------------------------------------------- - @staticmethod - def minibatch_preprocessor_dl_help(schema_madlib, message): - method = "minibatch_preprocessor_dl" - summary = """ - ---------------------------------------------------------------- - SUMMARY - ---------------------------------------------------------------- - For Deep Learning based techniques such as Convolutional Neural Nets, - the input data is mostly images. These images can be represented as an - array of numbers where each element represents a pixel/color intensity. - It is standard practice to normalize the image data before use. - minibatch_preprocessor() is for general use-cases, but for deep learning - based use-cases we provide minibatch_preprocessor_dl() that is - light-weight and is specific to image datasets. - - The normalizing constant is parameterized, and can be specified based - on the kind of image data used. - - An optional param named num_classes can be used to specify the length - of the one-hot encoded array for the dependent variable. This value if - specified must be greater than equal to the total number of distinct - class values found in the input table. - - For more details on function usage: - SELECT {schema_madlib}.{method}('usage') - """.format(**locals()) - - usage = """ - --------------------------------------------------------------------------- - USAGE - --------------------------------------------------------------------------- - SELECT {schema_madlib}.{method}( - source_table, -- TEXT. Name of the table containing input - data. Can also be a view - output_table, -- TEXT. Name of the output table for - mini-batching - dependent_varname, -- TEXT. Name of the dependent variable column - independent_varname, -- TEXT. Name of the independent variable - column - buffer_size -- INTEGER. Default computed automatically. - Number of source input rows to pack into a buffer - normalizing_const -- DOUBLE PRECISON. Default 1.0. 
The - normalizing constant to use for - standardizing arrays in independent_varname. - num_classes -- INTEGER. Default NULL. Number of class labels - to be considered for 1-hot encoding. If NULL, - the 1-hot encoded array length will be equal to - the number of distinct class values found in the - input table. - ); - - - --------------------------------------------------------------------------- - OUTPUT - --------------------------------------------------------------------------- - The output table produced by MiniBatch Preprocessor contains the - following columns: - - buffer_id -- INTEGER. Unique id for packed table. - dependent_varname -- ANYARRAY[]. Packed array of dependent variables. - independent_varname -- REAL[]. Packed array of independent - variables. - - --------------------------------------------------------------------------- - The algorithm also creates a summary table named <output_table>_summary - that has the following columns: - - source_table -- Source table name. - output_table -- Output table name from preprocessor. - dependent_varname -- Dependent variable values from the original table - (encoded by one_hot_encode, if specified). - independent_varname -- Independent variable values from the original - table. - dependent_vartype -- Type of the dependent variable from the - original table. - class_values -- Class values of the dependent variable - (‘NULL’(as TEXT type) for non - categorical vars). - buffer_size -- Buffer size used in preprocessing step. - normalizing_const -- Normalizing constant used for standardizing - arrays in independent_varname. - - --------------------------------------------------------------------------- - """.format(**locals()) - - if not message: - return summary - elif message.lower() in ('usage', 'help', '?'): - return usage - return """ - No such option. Use "SELECT {schema_madlib}.minibatch_preprocessor_dl()" - for help. 
- """.format(**locals()) diff --git a/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in index e213562..c68f6b1 100644 --- a/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/test/unit_tests/test_minibatch_preprocessing.py_in @@ -31,6 +31,9 @@ import plpy_mock as plpy m4_changequote(`<!', `!>') +class AnyStringWith(str): + def __eq__(self, other): + return self in other class MiniBatchPreProcessingTestCase(unittest.TestCase): def setUp(self): @@ -319,251 +322,6 @@ class MiniBatchBufferSizeCalculatorTestCase(unittest.TestCase): #TODO add more tests after finalizing the buffer size calculation -class AnyStringWith(str): - def __eq__(self, other): - return self in other -class MiniBatchPreProcessingDLTestCase(unittest.TestCase): - def setUp(self): - self.plpy_mock = Mock(spec='error') - patches = { - 'plpy': plpy, - 'utilities.mean_std_dev_calculator': Mock(), - } - # we need to use MagicMock() instead of Mock() for the plpy.execute mock - # to be able to iterate on the return value - self.plpy_mock_execute = MagicMock() - plpy.execute = self.plpy_mock_execute - - self.module_patcher = patch.dict('sys.modules', patches) - self.module_patcher.start() - - self.default_schema_madlib = "madlib" - self.default_source_table = "source" - self.default_output_table = "output" - self.default_dep_var = "depvar" - self.default_ind_var = "indvar" - self.default_buffer_size = 5 - self.default_normalizing_const = 1.0 - self.default_num_classes = None - - import utilities.minibatch_preprocessing - self.module = utilities.minibatch_preprocessing - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - self.module.validate_module_input_params = Mock() - self.module.get_distinct_col_levels = Mock(return_value = [0,22,100]) - self.subject = 
self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - self.default_num_classes) - - def tearDown(self): - self.module_patcher.stop() - - def test_minibatch_preprocessor_dl_executes_query(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - preprocessor_obj = self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - "input", - "out", - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - self.default_num_classes) - preprocessor_obj.minibatch_preprocessor_dl() - - def test_minibatch_preprocessor_null_buffer_size_executes_query(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - preprocessor_obj = self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - "input", - "out", - self.default_dep_var, - self.default_ind_var, - None, - self.default_normalizing_const, - self.default_num_classes) - self.module.MiniBatchBufferSizeCalculator.calculate_default_buffer_size = Mock(return_value = 5) - preprocessor_obj.minibatch_preprocessor_dl() - - def test_minibatch_preprocessor_multiple_dep_var_raises_exception(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - with self.assertRaises(plpy.PLPYException): - self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - "y1,y2", - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - self.default_num_classes) - - def test_minibatch_preprocessor_multiple_indep_var_raises_exception(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - with self.assertRaises(plpy.PLPYException): - self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - 
self.default_source_table, - self.default_output_table, - self.default_dep_var, - "x1,x2", - self.default_buffer_size, - self.default_normalizing_const, - self.default_num_classes) - - def test_minibatch_preprocessor_buffer_size_zero_fails(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - with self.assertRaises(plpy.PLPYException): - self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - 0, - self.default_normalizing_const, - self.default_num_classes) - - def test_minibatch_preprocessor_negative_buffer_size_fails(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - with self.assertRaises(plpy.PLPYException): - self.module.MiniBatchPreProcessorDL(self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - -1, - self.default_normalizing_const, - self.default_num_classes) - - def test_minibatch_preprocessor_invalid_indep_vartype_raises_exception(self): - self.module.get_expr_type = Mock(side_effect = ['integer', 'integer[]']) - with self.assertRaises(plpy.PLPYException): - self.module.MiniBatchPreProcessorDL(self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - self.default_num_classes) - - def test_minibatch_preprocessor_invalid_dep_vartype_raises_exception(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text[]']) - with self.assertRaises(plpy.PLPYException): - self.module.MiniBatchPreProcessorDL(self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - self.default_num_classes) - - def 
test_minibatch_preprocessor_normalizing_const_zero_fails(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - with self.assertRaises(plpy.PLPYException): - self.module.MiniBatchPreProcessorDL(self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - 0, - self.default_num_classes) - - def test_minibatch_preprocessor_negative_normalizing_const_fails(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - with self.assertRaises(plpy.PLPYException): - self.module.MiniBatchPreProcessorDL(self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - -1, - self.default_num_classes) - - def test_get_one_hot_encoded_dep_var_expr_null_val(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text']) - self.module.get_distinct_col_levels = Mock(return_value = ["NULL", "'a'"]) - obj = self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - self.default_num_classes) - dep_var_array_expr = obj.get_one_hot_encoded_dep_var_expr() - self.assertEqual("array[({0}) is not distinct from null, ({0}) is not distinct from 'a']::integer[]". 
- format(self.default_dep_var), - dep_var_array_expr.lower()) - - def test_get_one_hot_encoded_dep_var_expr_numeric_array_val(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'integer[]']) - obj = self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - self.default_num_classes) - dep_var_array_expr = obj.get_one_hot_encoded_dep_var_expr() - self.assertEqual("{0}::integer[]". - format(self.default_dep_var), - dep_var_array_expr.lower()) - - def test_validate_num_classes_none(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text']) - obj = self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - None) - self.assertEqual(0, obj.padding_size) - - def test_validate_num_classes_greater(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text']) - self.module.dependent_levels = Mock(return_value = ["'a'", "'b'", "'c'"]) - obj = self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - self.default_normalizing_const, - 5) - self.assertEqual(2, obj.padding_size) - - def test_validate_num_classes_lesser(self): - self.module.get_expr_type = Mock(side_effect = ['integer[]', 'text']) - self.module.dependent_levels = Mock(return_value = ["'a'", "'b'", "'c'"]) - with self.assertRaises(plpy.PLPYException): - obj = self.module.MiniBatchPreProcessorDL( - self.default_schema_madlib, - self.default_source_table, - self.default_output_table, - self.default_dep_var, - self.default_ind_var, - self.default_buffer_size, - 
self.default_normalizing_const, - 2) - if __name__ == '__main__': unittest.main()
