This is an automated email from the ASF dual-hosted git repository.
jingyimei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/madlib.git
The following commit(s) were added to refs/heads/master by this push:
new 8de32ed Minibatch Preprocessor for Deep learning
8de32ed is described below
commit 8de32ede33c48d2f4a440f0f639c94a277a359c1
Author: Nandish Jayaram <[email protected]>
AuthorDate: Mon Dec 17 17:54:42 2018 -0800
Minibatch Preprocessor for Deep learning
The minibatch preprocessor we currently have in MADlib is bloated for DL
tasks. This feature adds a simplified version of creating buffers. It
can divide each element of the independent array by a normalizing constant
for standardization (which is 255.0 by default), and optionally shift the
dependent variables by an offset if they are a numeric type.
Closes #342
Co-authored-by: Arvind Sridhar <[email protected]>
Co-authored-by: Domino Valdano <[email protected]>
---
doc/mainpage.dox.in | 6 +
.../utilities/minibatch_preprocessing.py_in | 192 +++++++++-
.../utilities/minibatch_preprocessing_dl.sql_in | 396 +++++++++++++++++++++
.../test/minibatch_preprocessing_dl.sql_in | 125 +++++++
.../postgres/modules/utilities/utilities.py_in | 4 +-
5 files changed, 719 insertions(+), 4 deletions(-)
diff --git a/doc/mainpage.dox.in b/doc/mainpage.dox.in
index aab37af..5568da6 100644
--- a/doc/mainpage.dox.in
+++ b/doc/mainpage.dox.in
@@ -285,6 +285,12 @@ complete matrix stored as a distributed table.
Interface and implementation are subject to change.
@{
@defgroup grp_cg Conjugate Gradient
+ @defgroup grp_dl Deep Learning
+ @brief A collection of deep learning interfaces.
+ @details A collection of deep learning interfaces.
+ @{
+ @defgroup grp_minibatch_preprocessing_dl Mini-Batch Preprocessor for
Deep Learning
+ @}
@defgroup grp_knn k-Nearest Neighbors
@defgroup grp_bayes Naive Bayes Classification
@defgroup grp_sample Random Sampling
diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
index 88433c9..be568a9 100644
--- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in
@@ -39,7 +39,7 @@ from utilities import py_list_to_sql_string
from utilities import split_quoted_delimited_str
from utilities import unique_string
from utilities import validate_module_input_params
-from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY
+from utilities import NUMERIC, INTEGER, TEXT, BOOLEAN, INCLUDE_ARRAY,
ONLY_ARRAY
from mean_std_dev_calculator import MeanStdDevCalculator
from validate_args import get_expr_type
@@ -51,6 +51,111 @@ m4_changequote(`<!', `!>')
MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname"
MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname"
+class MiniBatchPreProcessorDL:
+ def __init__(self, schema_madlib, source_table, output_table,
+ dependent_varname, independent_varname, buffer_size,
+ normalizing_const, dependent_offset, **kwargs):
+ self.schema_madlib = schema_madlib
+ self.source_table = source_table
+ self.output_table = output_table
+ self.dependent_varname = dependent_varname
+ self.independent_varname = independent_varname
+ self.buffer_size = buffer_size
+ self.normalizing_const = normalizing_const
+ self.dependent_offset = dependent_offset
+ self.module_name = "minibatch_preprocessor_DL"
+ self.output_summary_table = add_postfix(self.output_table, "_summary")
+ self._validate_args()
+ self.num_of_buffers = self._get_num_buffers()
+
+ def minibatch_preprocessor_dl(self):
+ norm_tbl = unique_string(desp='normalized')
+ # Create a temp table that has independent var normalized.
+
+ dependent_varname_with_offset = self.dependent_varname
+ if self.dependent_offset:
+ dependent_varname_with_offset = '{0} +
{1}'.format(self.dependent_varname, self.dependent_offset)
+
+ scalar_mult_sql = """
+ CREATE TEMP TABLE {norm_tbl} AS
+ SELECT {self.schema_madlib}.array_scalar_mult(
+ {self.independent_varname}::REAL[],
(1/{self.normalizing_const})::REAL) AS x_norm,
+ {dependent_varname_with_offset} AS y,
+ row_number() over() AS row_id
+ FROM {self.source_table}
+ """.format(**locals())
+ plpy.execute(scalar_mult_sql)
+ # Create the mini-batched output table
+ if is_platform_pg():
+ distributed_by_clause = ''
+ else:
+ distributed_by_clause= ' DISTRIBUTED BY (buffer_id) '
+ sql = """
+ CREATE TABLE {self.output_table} AS
+ SELECT * FROM
+ (
+ SELECT {self.schema_madlib}.agg_array_concat(
+ ARRAY[{norm_tbl}.x_norm::REAL[]]) AS {x},
+ array_agg({norm_tbl}.y) AS {y},
+ ({norm_tbl}.row_id%{self.num_of_buffers})::smallint AS
buffer_id
+ FROM {norm_tbl}
+ GROUP BY buffer_id
+ ) b
+ {distributed_by_clause}
+ """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME,
+ y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME,
+ **locals())
+ plpy.execute(sql)
+ plpy.execute("DROP TABLE {0}".format(norm_tbl))
+ # Create summary table
+ self._create_output_summary_table()
+
+ def _create_output_summary_table(self):
+ query = """
+ CREATE TABLE {self.output_summary_table} AS
+ SELECT
+ $__madlib__${self.source_table}$__madlib__$::TEXT AS
source_table,
+ $__madlib__${self.output_table}$__madlib__$::TEXT AS
output_table,
+ $__madlib__${self.dependent_varname}$__madlib__$::TEXT AS
dependent_varname,
+ $__madlib__${self.independent_varname}$__madlib__$::TEXT AS
independent_varname,
+ $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS
dependent_vartype,
+ {self.buffer_size} AS buffer_size
+ """.format(self=self)
+ plpy.execute(query)
+
+ def _validate_args(self):
+ validate_module_input_params(
+ self.source_table, self.output_table, self.independent_varname,
+ self.dependent_varname, self.module_name, None,
+ [self.output_summary_table])
+ self.independent_vartype = get_expr_type(
+ self.independent_varname, self.source_table)
+ _assert(is_valid_psql_type(self.independent_vartype,
+ NUMERIC | ONLY_ARRAY),
+ "Invalid independent variable type, should be an array of " \
+ "one of {0}".format(','.join(NUMERIC)))
+ self.dependent_vartype = get_expr_type(
+ self.dependent_varname, self.source_table)
+ dep_valid_types = NUMERIC | TEXT | BOOLEAN
+ _assert(is_valid_psql_type(self.dependent_vartype, dep_valid_types),
+ "Invalid dependent variable type, should be one of {0}".
+ format(','.join(dep_valid_types)))
+ if self.buffer_size is not None:
+ _assert(self.buffer_size > 0,
+ "minibatch_preprocessor_dl: The buffer size has to be a " \
+ "positive integer or NULL.")
+
+ def _get_num_buffers(self):
+ num_rows_in_tbl = plpy.execute("""
+ SELECT count(*) AS cnt FROM {0}
+ """.format(self.source_table))[0]['cnt']
+ buffer_size_calculator = MiniBatchBufferSizeCalculator()
+ indepdent_var_dim = _tbl_dimension_rownum(
+ self.schema_madlib, self.source_table, self.independent_varname,
+ skip_row_count=True)
+ self.buffer_size =
buffer_size_calculator.calculate_default_buffer_size(
+ self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0])
+ return ceil((1.0*num_rows_in_tbl)/self.buffer_size)
class MiniBatchPreProcessor:
"""
@@ -509,7 +614,7 @@ class MiniBatchDocumentation:
SELECT {schema_madlib}.{method}(
source_table, -- TEXT. Name of the table containing input
data. Can also be a view
- output_table , -- TEXT. Name of the output table for
+ output_table, -- TEXT. Name of the output table for
mini-batching
dependent_varname, -- TEXT. Name of the dependent variable
column
independent_varname, -- TEXT. Name of the independent variable
@@ -580,3 +685,86 @@ class MiniBatchDocumentation:
for help.
""".format(**locals())
# ---------------------------------------------------------------------
+ @staticmethod
+ def minibatch_preprocessor_dl_help(schema_madlib, message):
+ method = "minibatch_preprocessor_dl"
+ summary = """
+ ----------------------------------------------------------------
+ SUMMARY
+ ----------------------------------------------------------------
+ For Deep Learning based techniques such as Convolutional Neural Nets,
+ the input data is mostly images. These images can be represented as an
+ array of numbers where all elements are between 0 and 255 in value.
+ It is standard practice to divide each of these numbers by 255.0 to
+ normalize the image data. minibatch_preprocessor() is for general
+ use-cases, but for deep learning based use-cases we provide
+ minibatch_preprocessor_dl() that is light-weight and is
+ specific to image datasets.
+
+ The normalizing constant is parameterized, and can be specified based
+ on the kind of image data used.
+
+ For more details on function usage:
+ SELECT {schema_madlib}.{method}('usage')
+ """.format(**locals())
+
+ usage = """
+
---------------------------------------------------------------------------
+ USAGE
+
---------------------------------------------------------------------------
+ SELECT {schema_madlib}.{method}(
+ source_table, -- TEXT. Name of the table containing input
+ data. Can also be a view
+ output_table, -- TEXT. Name of the output table for
+ mini-batching
+ dependent_varname, -- TEXT. Name of the dependent variable
column
+ independent_varname, -- TEXT. Name of the independent variable
+ column
+ buffer_size -- INTEGER. Default computed automatically.
+ Number of source input rows to pack into
a buffer
+ normalizing_const -- DOUBLE PRECISION. Default 255.0. The
+ normalizing constant to use for
+ standardizing arrays in
independent_varname.
+ dependent_offset -- INTEGER. If specified, shifts all
dependent
+ variable values by this number (should
+ only be used for numeric types).
+ );
+
+
+
---------------------------------------------------------------------------
+ OUTPUT
+
---------------------------------------------------------------------------
+ The output table produced by MiniBatch Preprocessor contains the
+ following columns:
+
+ buffer_id -- INTEGER. Unique id for packed table.
+ dependent_varname -- ANYARRAY[]. Packed array of dependent
variables.
+ independent_varname -- REAL[]. Packed array of independent
+ variables.
+
+
---------------------------------------------------------------------------
+ The algorithm also creates a summary table named <output_table>_summary
+ that has the following columns:
+
+ source_table -- Source table name.
+ output_table -- Output table name from preprocessor.
+ dependent_varname -- Dependent variable values from the
original table
+ (shifted by dependent_offset, if
specified).
+ independent_varname -- Independent variable values from the
original
+ table.
+ dependent_vartype -- Type of the dependent variable from the
+ original table.
+ buffer_size -- Buffer size used in preprocessing step.
+
+
---------------------------------------------------------------------------
+ """.format(**locals())
+
+
+ if not message:
+ return summary
+ elif message.lower() in ('usage', 'help', '?'):
+ return usage
+ return """
+ No such option. Use "SELECT
{schema_madlib}.minibatch_preprocessor_dl()"
+ for help.
+ """.format(**locals())
diff --git
a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in
b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in
new file mode 100644
index 0000000..994e458
--- /dev/null
+++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in
@@ -0,0 +1,396 @@
+/* ----------------------------------------------------------------------- */
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * @file minibatch_preprocessing_dl.sql_in
+ * @brief Mini-batch preprocessor for deep learning
+ * @date December 2018
+ *
+ */
+/* ----------------------------------------------------------------------- */
+
+m4_include(`SQLCommon.m4')
+
+/**
+@addtogroup grp_minibatch_preprocessing_dl
+
+<div class="toc"><b>Contents</b><ul>
+<li class="level1"><a href="#minibatch_preprocessor_dl">Mini-Batch
Preprocessor for Deep Learning</a></li>
+<li class="level1"><a href="#example">Examples</a></li>
+</ul></div>
+
+For Deep Learning based techniques such as Convolutional Neural Nets, the input
+data is mostly images. These images can be represented as an array of numbers
+where all elements are between 0 and 255 in value. It is standard practice
+to divide each of these numbers by 255.0 to normalize the image data.
+minibatch_preprocessor() is for general use-cases, but for deep learning based
+use-cases we provide minibatch_preprocessor_dl() that is light-weight and is
+specific to image datasets. The normalizing constant is parameterized, and can
+be specified based on the kind of image data used.
+<pre class="syntax">
+minibatch_preprocessor_dl(source_table,
+ output_table,
+ dependent_varname,
+ independent_varname,
+ buffer_size,
+ normalizing_const,
+ dependent_offset
+ )
+</pre>
+
+\b Arguments
+<dl class="arglist">
+ <dt>source_table</dt>
+ <dd>TEXT. Name of the table containing input data. Can also be a view.
+ </dd>
+
+ <dt>output_table</dt>
+ <dd>TEXT. Name of the output table from the preprocessor which
+ will be used as input to algorithms that support mini-batching.
+ Note that the arrays packed into the output table are randomized
+ and normalized (by dividing each element in the independent variable array
+ by the normalizing_const), so they will not match up in an obvious way with
+ the rows in the source table.
+ </dd>
+
+ <dt>dependent_varname</dt>
+ <dd>TEXT. Name of the dependent variable column.
+ </dd>
+
+ <dt>independent_varname</dt>
+ <dd>TEXT. Name of the independent variable column. The column must be of
+ a numeric array type.
+ </dd>
+
+ <dt>buffer_size (optional)</dt>
+ <dd>INTEGER, default: computed. Buffer size is the
+ number of rows from the
+ source table that are packed into one row of the preprocessor
+ output table. The default value is computed considering size of
+ the source table, number of independent variables, number of groups,
+ and number of segments in the database cluster. For larger data sets,
+ the computed buffer size will typically be a value in the millions.
+ </dd>
+
+ <dt>normalizing_const (optional)</dt>
+ <dd>DOUBLE PRECISION, default: 255.0. The normalizing constant to divide
+ each value in the independent_varname array by.
+ </dd>
+
+ <dt>dependent_offset (optional)</dt>
+ <dd>INTEGER, default: NULL. If specified, shifts all dependent
+ variable values by this number (should only be used for numeric types).
+ </dd>
+
+</dl>
+
+<b>Output tables</b>
+<br>
+ The output table produced by the mini-batch preprocessor contains the
following columns:
+ <table class="output">
+ <tr>
+ <th>buffer_id</th>
+ <td>INTEGER. Unique id for packed table.
+ </td>
+ </tr>
+ <tr>
+ <th>dependent_varname</th>
+ <td>ANYARRAY[]. Packed array of dependent variables. The type
+ of the array is the same as the type of the dependent variable from
+ the source table.
+ </td>
+ </tr>
+ <tr>
+ <th>independent_varname</th>
+ <td>REAL[]. Packed array of independent variables.
+ </td>
+ </tr>
+ </table>
+
+A summary table named \<output_table\>_summary is also created, which has the
following columns:
+ <table class="output">
+ <tr>
+ <th>source_table</th>
+ <td>Name of the source table.</td>
+ </tr>
+ <tr>
+ <th>output_table</th>
+ <td>Name of output table generated by preprocessor.</td>
+ </tr>
+ <tr>
+ <th>dependent_varname</th>
+ <td>Dependent variable from the source table.</td>
+ </tr>
+ <tr>
+ <th>independent_varname</th>
+ <td>Independent variable from the source table.</td>
+ </tr>
+ <tr>
+ <th>dependent_vartype</th>
+ <td>Type of the dependent variable from the source table.</td>
+ </tr>
+ <tr>
+ <th>buffer_size</th>
+ <td>Buffer size used in preprocessing step.</td>
+ </tr>
+ </table>
+
+@anchor example
+@par Examples
+-# Create an input data set based on the well known iris data set:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data;
+CREATE TABLE iris_data(
+ id serial,
+ attributes numeric[],
+ class_text varchar,
+ class integer,
+ state varchar
+);
+INSERT INTO iris_data(id, attributes, class_text, class, state) VALUES
+(1,ARRAY[5.0,3.2,1.2,0.2],'Iris_setosa',1,'Alaska'),
+(2,ARRAY[5.5,3.5,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(3,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Alaska'),
+(4,ARRAY[4.4,3.0,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(5,ARRAY[5.1,3.4,1.5,0.2],'Iris_setosa',1,'Alaska'),
+(6,ARRAY[5.0,3.5,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(7,ARRAY[4.5,2.3,1.3,0.3],'Iris_setosa',1,'Alaska'),
+(8,ARRAY[4.4,3.2,1.3,0.2],'Iris_setosa',1,'Alaska'),
+(9,ARRAY[5.0,3.5,1.6,0.6],'Iris_setosa',1,'Alaska'),
+(10,ARRAY[5.1,3.8,1.9,0.4],'Iris_setosa',1,'Alaska'),
+(11,ARRAY[4.8,3.0,1.4,0.3],'Iris_setosa',1,'Alaska'),
+(12,ARRAY[5.1,3.8,1.6,0.2],'Iris_setosa',1,'Alaska'),
+(13,ARRAY[5.7,2.8,4.5,1.3],'Iris_versicolor',2,'Alaska'),
+(14,ARRAY[6.3,3.3,4.7,1.6],'Iris_versicolor',2,'Alaska'),
+(15,ARRAY[4.9,2.4,3.3,1.0],'Iris_versicolor',2,'Alaska'),
+(16,ARRAY[6.6,2.9,4.6,1.3],'Iris_versicolor',2,'Alaska'),
+(17,ARRAY[5.2,2.7,3.9,1.4],'Iris_versicolor',2,'Alaska'),
+(18,ARRAY[5.0,2.0,3.5,1.0],'Iris_versicolor',2,'Alaska'),
+(19,ARRAY[5.9,3.0,4.2,1.5],'Iris_versicolor',2,'Alaska'),
+(20,ARRAY[6.0,2.2,4.0,1.0],'Iris_versicolor',2,'Alaska'),
+(21,ARRAY[6.1,2.9,4.7,1.4],'Iris_versicolor',2,'Alaska'),
+(22,ARRAY[5.6,2.9,3.6,1.3],'Iris_versicolor',2,'Alaska'),
+(23,ARRAY[6.7,3.1,4.4,1.4],'Iris_versicolor',2,'Alaska'),
+(24,ARRAY[5.6,3.0,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(25,ARRAY[5.8,2.7,4.1,1.0],'Iris_versicolor',2,'Alaska'),
+(26,ARRAY[6.2,2.2,4.5,1.5],'Iris_versicolor',2,'Alaska'),
+(27,ARRAY[5.6,2.5,3.9,1.1],'Iris_versicolor',2,'Alaska'),
+(28,ARRAY[5.0,3.4,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(29,ARRAY[4.4,2.9,1.4,0.2],'Iris_setosa',1,'Tennessee'),
+(30,ARRAY[4.9,3.1,1.5,0.1],'Iris_setosa',1,'Tennessee'),
+(31,ARRAY[5.4,3.7,1.5,0.2],'Iris_setosa',1,'Tennessee'),
+(32,ARRAY[4.8,3.4,1.6,0.2],'Iris_setosa',1,'Tennessee'),
+(33,ARRAY[4.8,3.0,1.4,0.1],'Iris_setosa',1,'Tennessee'),
+(34,ARRAY[4.3,3.0,1.1,0.1],'Iris_setosa',1,'Tennessee'),
+(35,ARRAY[5.8,4.0,1.2,0.2],'Iris_setosa',1,'Tennessee'),
+(36,ARRAY[5.7,4.4,1.5,0.4],'Iris_setosa',1,'Tennessee'),
+(37,ARRAY[5.4,3.9,1.3,0.4],'Iris_setosa',1,'Tennessee'),
+(38,ARRAY[6.0,2.9,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(39,ARRAY[5.7,2.6,3.5,1.0],'Iris_versicolor',2,'Tennessee'),
+(40,ARRAY[5.5,2.4,3.8,1.1],'Iris_versicolor',2,'Tennessee'),
+(41,ARRAY[5.5,2.4,3.7,1.0],'Iris_versicolor',2,'Tennessee'),
+(42,ARRAY[5.8,2.7,3.9,1.2],'Iris_versicolor',2,'Tennessee'),
+(43,ARRAY[6.0,2.7,5.1,1.6],'Iris_versicolor',2,'Tennessee'),
+(44,ARRAY[5.4,3.0,4.5,1.5],'Iris_versicolor',2,'Tennessee'),
+(45,ARRAY[6.0,3.4,4.5,1.6],'Iris_versicolor',2,'Tennessee'),
+(46,ARRAY[6.7,3.1,4.7,1.5],'Iris_versicolor',2,'Tennessee'),
+(47,ARRAY[6.3,2.3,4.4,1.3],'Iris_versicolor',2,'Tennessee'),
+(48,ARRAY[5.6,3.0,4.1,1.3],'Iris_versicolor',2,'Tennessee'),
+(49,ARRAY[5.5,2.5,4.0,1.3],'Iris_versicolor',2,'Tennessee'),
+(50,ARRAY[5.5,2.6,4.4,1.2],'Iris_versicolor',2,'Tennessee'),
+(51,ARRAY[6.1,3.0,4.6,1.4],'Iris_versicolor',2,'Tennessee'),
+(52,ARRAY[5.8,2.6,4.0,1.2],'Iris_versicolor',2,'Tennessee');
+</pre>
+
+-# Run the preprocessor:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary;
+SELECT madlib.minibatch_preprocessor_dl('iris_data', -- Source table
+ 'iris_data_packed', -- Output table
+ 'class_text', -- Dependent variable
+ 'attributes', -- Independent
variables
+ NULL, -- buffer size
+ 2 -- normalizing
constant
+ );
+</pre>
+For small datasets like in this example, buffer size is mainly
+determined by the number of segments in the database.
+This example is run on a Greenplum database with 3 segments,
+so there are 3 rows with a buffer size of 18.
+For PostgreSQL, there would be only one row with a buffer
+size of 52 since it is a single node database.
+For larger data sets, other factors go into
+computing buffers size besides number of segments.
+Also, note that the dependent variable has
+been one-hot encoded since it is categorical.
+Here is a sample of the packed output table:
+<pre class="example">
+\\x on
+SELECT * FROM iris_data_packed;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------+-------------------------------------
+independent_varname |
{{2.55,1.7,0.75,0.1},{2.55,1.9,0.95,0.2},{2.9,1.35,1.95,0.6},{3.35,1.55,2.2,0.7},{2.85,1.3,1.75,0.5},{3.1,1.1,2.25,0.75},...}}
+dependent_varname |
{Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...}
+buffer_id | 0
+-[ RECORD 2 ]-------+-------------------------------------
+independent_varname |
{{2.4,1.5,0.7,0.15},{2.2,1.6,0.65,0.1},{2.8,1.45,1.8,0.65},{2.9,1.3,2,0.6},{2.2,1.45,0.7,0.1},{2.85,1.4,2.25,0.65},{2.8,1.25,1.95,0.55},...}}
+dependent_varname |
{Iris_setosa,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_versicolor,...}
+buffer_id | 1
+-[ RECORD 3 ]-------+-------------------------------------
+independent_varname |
{{2.4,1.7,0.8,0.1},{2.9,1.35,2.05,0.5},{3.15,1.65,2.35,0.8},{2.7,1.95,0.65,0.2},{2.75,1.3,2.2,0.6},{3,1.1,2,0.5},{2.55,1.9,0.8,0.1},...}}
+dependent_varname |
{Iris_versicolor,Iris_setosa,Iris_versicolor,Iris_versicolor,Iris_setosa,Iris_setosa,...}
+buffer_id | 2
+</pre>
+Review the output summary table:
+<pre class="example">
+\\x on
+SELECT * FROM iris_data_packed_summary;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------+------------------
+source_table | iris_data
+output_table | iris_data_packed
+dependent_varname | class_text
+independent_varname | attributes
+dependent_vartype | character varying
+buffer_size | 18
+</pre>
+
+-# Generally the default buffer size will work well,
+but if you have occasion to change it:
+<pre class="example">
+DROP TABLE IF EXISTS iris_data_packed, iris_data_packed_summary;
+SELECT madlib.minibatch_preprocessor_dl('iris_data', -- Source table
+ 'iris_data_packed', -- Output table
+ 'class_text', -- Dependent variable
+ 'attributes', -- Independent
variables
+ 10 -- Buffer size
+ );
+</pre>
+Review the number of buffers in the output table:
+<pre class="example">
+SELECT COUNT(*) FROM iris_data_packed;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]
+count | 6
+</pre>
+Review the output summary table:
+<pre class="example">
+\\x on
+SELECT * FROM iris_data_packed_summary;
+</pre>
+<pre class="result">
+-[ RECORD 1 ]-------+------------------
+source_table | iris_data
+output_table | iris_data_packed
+dependent_varname | class_text
+independent_varname | attributes
+dependent_vartype | character varying
+buffer_size | 10
+</pre>
+
+*/
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ dependent_varname VARCHAR,
+ independent_varname VARCHAR,
+ buffer_size INTEGER,
+ normalizing_const DOUBLE PRECISION,
+ dependent_offset INTEGER
+) RETURNS VOID AS $$
+ PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
+ from utilities.control import MinWarning
+ with AOControl(False):
+ with MinWarning('error'):
+ minibatch_preprocessor_obj =
minibatch_preprocessing.MiniBatchPreProcessorDL(**globals())
+ minibatch_preprocessor_obj.minibatch_preprocessor_dl()
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ dependent_varname VARCHAR,
+ independent_varname VARCHAR,
+ buffer_size INTEGER,
+ normalizing_const DOUBLE PRECISION
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, $6, NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ dependent_varname VARCHAR,
+ independent_varname VARCHAR,
+ buffer_size INTEGER
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0,
NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+ source_table VARCHAR,
+ output_table VARCHAR,
+ dependent_varname VARCHAR,
+ independent_varname VARCHAR
+) RETURNS VOID AS $$
+ SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0,
NULL);
+$$ LANGUAGE sql VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl(
+ message VARCHAR
+) RETURNS VARCHAR AS $$
+ PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
+ return
minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib,
message)
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl()
+RETURNS VARCHAR AS $$
+ PythonFunctionBodyOnly(utilities, minibatch_preprocessing)
+ return
minibatch_preprocessing.MiniBatchDocumentation.minibatch_preprocessor_dl_help(schema_madlib,
'')
+$$ LANGUAGE plpythonu VOLATILE
+m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
+
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_transition(anyarray,
anyarray)
+ RETURNS anyarray
+ AS 'select $1 || $2'
+ LANGUAGE SQL
+ IMMUTABLE;
+
+CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.agg_array_concat_merge(anyarray,
anyarray)
+ RETURNS anyarray
+ AS 'select $1 || $2'
+ LANGUAGE SQL
+ IMMUTABLE
+ RETURNS NULL ON NULL INPUT;
+
+DROP AGGREGATE IF EXISTS MADLIB_SCHEMA.agg_array_concat(anyarray);
+CREATE AGGREGATE MADLIB_SCHEMA.agg_array_concat(anyarray) (
+ SFUNC = MADLIB_SCHEMA.agg_array_concat_transition,
+ STYPE = anyarray,
+ PREFUNC = MADLIB_SCHEMA.agg_array_concat_merge
+ );
diff --git
a/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in
b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in
new file mode 100644
index 0000000..dd9b0d5
--- /dev/null
+++
b/src/ports/postgres/modules/utilities/test/minibatch_preprocessing_dl.sql_in
@@ -0,0 +1,125 @@
+/* -----------------------------------------------------------------------
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *//* -----------------------------------------------------------------------
*/
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_input;
+CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[],
label TEXT);
+INSERT INTO minibatch_preprocessor_dl_input(x, label) VALUES
+(ARRAY[1,2,3,4,5,6], 'a'),
+(ARRAY[11,2,3,4,5,6], 'a'),
+(ARRAY[11,22,33,4,5,6], 'a'),
+(ARRAY[11,22,33,44,5,6], 'a'),
+(ARRAY[11,22,33,44,65,6], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'a'),
+(ARRAY[11,22,33,144,65,56], 'a'),
+(ARRAY[11,22,233,44,65,56], 'a'),
+(ARRAY[11,22,33,44,65,56], 'b'),
+(ARRAY[11,22,33,44,65,56], 'b'),
+(ARRAY[11,22,33,44,65,56], 'b');
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch,
minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+ 'minibatch_preprocessor_dl_input',
+ 'minibatch_preprocessor_dl_batch',
+ 'id',
+ 'x',
+ 5);
+
+SELECT assert(count(*)=4, 'Incorrect number of buffers in
minibatch_preprocessor_dl_batch.')
+FROM minibatch_preprocessor_dl_batch;
+
+SELECT assert(array_upper(independent_varname, 1)=5, 'Incorrect buffer size.')
+FROM minibatch_preprocessor_dl_batch WHERE buffer_id=1;
+
+SELECT assert(array_upper(independent_varname, 1)=2, 'Incorrect buffer size.')
+FROM minibatch_preprocessor_dl_batch WHERE buffer_id=4;
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch,
minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+ 'minibatch_preprocessor_dl_input',
+ 'minibatch_preprocessor_dl_batch',
+ 'label',
+ 'x');
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_input;
+CREATE TABLE minibatch_preprocessor_dl_input(id serial, x double precision[],
y INTEGER);
+INSERT INTO minibatch_preprocessor_dl_input(x, y) VALUES
+(ARRAY[1,2,3,4,5,6], 4),
+(ARRAY[11,2,3,4,5,6], 3),
+(ARRAY[11,22,33,4,5,6], 8),
+(ARRAY[11,22,33,44,5,6], 2),
+(ARRAY[11,22,33,44,65,6], 5),
+(ARRAY[11,22,33,44,65,56], 6),
+(ARRAY[11,22,33,44,65,56], 2),
+(ARRAY[11,22,33,44,65,56], 10),
+(ARRAY[11,22,33,44,65,56], 3),
+(ARRAY[11,22,33,44,65,56], 7),
+(ARRAY[11,22,33,44,65,56], 6),
+(ARRAY[11,22,33,44,65,56], -6),
+(ARRAY[11,22,33,144,65,56], 9),
+(ARRAY[11,22,233,44,65,56], 0),
+(ARRAY[11,22,33,44,65,56], 12),
+(ARRAY[11,22,33,44,65,56], -3),
+(ARRAY[11,22,33,44,65,56], -1);
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch,
minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+ 'minibatch_preprocessor_dl_input',
+ 'minibatch_preprocessor_dl_batch',
+ 'y',
+ 'x',
+ 4,
+ 5);
+
+-- Test that independent vars get divided by 5, by verifying min value goes
from 1 to 0.2, and max value from 233 to 46.6
+SELECT assert(relative_error(MIN(x),0.2) < 0.00001, 'Independent var not
normalized properly!') FROM (SELECT UNNEST(independent_varname) as x FROM
minibatch_preprocessor_dl_batch) a;
+SELECT assert(relative_error(MAX(x),46.6) < 0.00001, 'Independent var not
normalized properly!') FROM (SELECT UNNEST(independent_varname) as x FROM
minibatch_preprocessor_dl_batch) a;
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch,
minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+ 'minibatch_preprocessor_dl_input',
+ 'minibatch_preprocessor_dl_batch',
+ 'y',
+ 'x',
+ 4,
+ 5,
+ 6);
+
+-- Test that dependent vars gets shifted by +6, by verifying minimum value
goes from -6 to 0
+SELECT assert(abs(MIN(y))<0.00001, 'Dependent var not shifted properly!') FROM
(SELECT UNNEST(dependent_varname) as y FROM minibatch_preprocessor_dl_batch) a;
+
+DROP TABLE IF EXISTS minibatch_preprocessor_dl_batch,
minibatch_preprocessor_dl_batch_summary;
+SELECT minibatch_preprocessor_dl(
+ 'minibatch_preprocessor_dl_input',
+ 'minibatch_preprocessor_dl_batch',
+ 'y',
+ 'x',
+ 4,
+ 5,
+ -6);
+
+-- Test that dependent vars gets shifted by -6, by verifying minimum value
goes from -6 to -12
+SELECT assert(relative_error(MIN(y), -12)<0.00001, 'Dependent var not shifted
properly!') FROM (SELECT UNNEST(dependent_varname) as y FROM
minibatch_preprocessor_dl_batch) a;
\ No newline at end of file
diff --git a/src/ports/postgres/modules/utilities/utilities.py_in
b/src/ports/postgres/modules/utilities/utilities.py_in
index 50c426b..89907b3 100644
--- a/src/ports/postgres/modules/utilities/utilities.py_in
+++ b/src/ports/postgres/modules/utilities/utilities.py_in
@@ -875,8 +875,8 @@ def collate_plpy_result(plpy_result_rows):
def validate_module_input_params(source_table, output_table,
independent_varname,
- dependent_varname, module_name, grouping_cols,
- other_output_tables=None):
+ dependent_varname, module_name,
+ grouping_cols=None, other_output_tables=None):
"""
This function is supposed to be used for validating params for
supervised learning like algos, e.g. linear regression, mlp, etc. since all