This is an automated email from the ASF dual-hosted git repository. jingyimei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit cffac4a0a2de41cb3ce414ba5e2d5d3b250137c2 Author: Rahul Iyer <[email protected]> AuthorDate: Wed Feb 20 15:26:15 2019 -0800 Minibatch DL: Set default normalizing constant to 1.0 JIRA: MADLIB-1290 --- .../utilities/minibatch_preprocessing.py_in | 41 +++++++++++----------- .../utilities/minibatch_preprocessing_dl.sql_in | 20 +++++------ 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in index a4d1cba..c3fd95d 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in @@ -61,14 +61,14 @@ MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL = "independent_var" class MiniBatchPreProcessorDL: def __init__(self, schema_madlib, source_table, output_table, dependent_varname, independent_varname, buffer_size, - normalizing_const, dependent_offset, **kwargs): + normalizing_const=1.0, dependent_offset=None, **kwargs): self.schema_madlib = schema_madlib self.source_table = source_table self.output_table = output_table self.dependent_varname = dependent_varname self.independent_varname = independent_varname self.buffer_size = buffer_size - self.normalizing_const = normalizing_const + self.normalizing_const = normalizing_const if normalizing_const else 1.0 self.dependent_offset = dependent_offset self.module_name = "minibatch_preprocessor_DL" self.output_summary_table = add_postfix(self.output_table, "_summary") @@ -76,8 +76,8 @@ class MiniBatchPreProcessorDL: self.num_of_buffers = self._get_num_buffers() def minibatch_preprocessor_dl(self): - norm_tbl = unique_string(desp='normalized') # Create a temp table that has independent var normalized. + norm_tbl = unique_string(desp='normalized') dependent_varname_with_offset = self.dependent_varname if self.dependent_offset: @@ -90,7 +90,7 @@ class MiniBatchPreProcessorDL: {dependent_varname_with_offset} AS y, row_number() over() AS row_id FROM {self.source_table} - """.format(**locals()) + """.format(**locals()) plpy.execute(scalar_mult_sql) # Create the mini-batched output table if is_platform_pg(): @@ -109,11 +109,11 @@ class MiniBatchPreProcessorDL: GROUP BY buffer_id ) b {distributed_by_clause} - """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL, - y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL, - **locals()) + """.format(x=MINIBATCH_OUTPUT_INDEPENDENT_COLNAME_DL, + y=MINIBATCH_OUTPUT_DEPENDENT_COLNAME_DL, + **locals()) plpy.execute(sql) - plpy.execute("DROP TABLE {0}".format(norm_tbl)) + plpy.execute("DROP TABLE IF EXISTS {0}".format(norm_tbl)) # Create summary table self._create_output_summary_table() @@ -127,7 +127,7 @@ class MiniBatchPreProcessorDL: $__madlib__${self.independent_varname}$__madlib__$::TEXT AS independent_varname, $__madlib__${self.dependent_vartype}$__madlib__$::TEXT AS dependent_vartype, {self.buffer_size} AS buffer_size - """.format(self=self) + """.format(self=self) plpy.execute(query) def _validate_args(self): @@ -139,7 +139,7 @@ class MiniBatchPreProcessorDL: self.independent_varname, self.source_table) _assert(is_valid_psql_type(self.independent_vartype, NUMERIC | ONLY_ARRAY), - "Invalid independent variable type, should be an array of " \ + "Invalid independent variable type, should be an array of " "one of {0}".format(','.join(NUMERIC))) self.dependent_vartype = get_expr_type( self.dependent_varname, self.source_table) @@ -149,7 +149,7 @@ class MiniBatchPreProcessorDL: format(','.join(dep_valid_types))) if self.buffer_size is not None: _assert(self.buffer_size > 0, - "minibatch_preprocessor_dl: The buffer size has to be a " \ + "minibatch_preprocessor_dl: The buffer size has to be a " "positive integer or NULL.") def _get_num_buffers(self): @@ -158,11 +158,11 @@ class MiniBatchPreProcessorDL: """.format(self.source_table))[0]['cnt'] buffer_size_calculator = MiniBatchBufferSizeCalculator() indepdent_var_dim = _tbl_dimension_rownum( - self.schema_madlib, self.source_table, self.independent_varname, - skip_row_count=True) + self.schema_madlib, self.source_table, + self.independent_varname, skip_row_count=True) self.buffer_size = buffer_size_calculator.calculate_default_buffer_size( self.buffer_size, num_rows_in_tbl, indepdent_var_dim[0]) - return ceil((1.0*num_rows_in_tbl)/self.buffer_size) + return ceil((1.0 * num_rows_in_tbl) / self.buffer_size) class MiniBatchPreProcessor: """ @@ -701,12 +701,11 @@ class MiniBatchDocumentation: ---------------------------------------------------------------- For Deep Learning based techniques such as Convolutional Neural Nets, the input data is mostly images. These images can be represented as an - array of numbers where all elements are between 0 and 255 in value. - It is standard practice to divide each of these numbers by 255.0 to - normalize the image data. minibatch_preprocessor() is for general - use-cases, but for deep learning based use-cases we provide - minibatch_preprocessor_dl() that is light-weight and is - specific to image datasets. + array of numbers where each element represents a pixel/color intensity. + It is standard practice to normalize the image data before use. + minibatch_preprocessor() is for general use-cases, but for deep learning + based use-cases we provide minibatch_preprocessor_dl() that is + light-weight and is specific to image datasets. The normalizing constant is parameterized, and can be specified based on the kind of image data used. @@ -729,7 +728,7 @@ class MiniBatchDocumentation: column buffer_size -- INTEGER. Default computed automatically. Number of source input rows to pack into a buffer - normalizing_const -- DOUBLE PRECISON. Default 255.0. The + normalizing_const -- DOUBLE PRECISON. Default 1.0. The normalizing constant to use for standardizing arrays in independent_varname. dependent_offset -- INTEGER. If specified, shifts all dependent diff --git a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in index 6cbe249..537888e 100644 --- a/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in +++ b/src/ports/postgres/modules/utilities/minibatch_preprocessing_dl.sql_in @@ -35,12 +35,12 @@ m4_include(`SQLCommon.m4') <li class="level1"><a href="#related">Related Topics</a></li> </ul></div> -For deep learning techniques such as convolutional neural networks, the input -data is often images. These images can represented as an array of numbers -with elements between 0 and 255, representing grayscale or RGB channel values -for each pixel in the image. It is standard practice to divide by 255 to -normalize the image data. The normalizing constant is parameterized, and can -be set depending on the format of image data used. +For deep learning based techniques such as convolutional neural nets, the input +data is often images. These images can be represented as an array of numbers +where each element defines represents grayscale or RGB channel values for each +pixel in the image. It is standard practice to normalize the image data before +training. The normalizing constant is parameterized, and can be set depending on +the format of image data used. This mini-batch preprocessor is a lightweight version designed specifically for image data. A separate more general minibatch_preprocessor() is also @@ -54,7 +54,7 @@ minibatch_preprocessor_dl( source_table, buffer_size, normalizing_const, dependent_offset - ) + ) </pre> \b Arguments @@ -91,7 +91,7 @@ minibatch_preprocessor_dl( source_table, </dd> <dt>normalizing_const (optional)</dt> - <dd>DOUBLE PRECISION, default: 255. The normalizing constant to divide + <dd>DOUBLE PRECISION, default: 1.0. The normalizing constant to divide each value in the independent_varname array by. </dd> @@ -426,7 +426,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( independent_varname VARCHAR, buffer_size INTEGER ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 255.0, NULL); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, $5, 1.0, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `'); @@ -436,7 +436,7 @@ CREATE OR REPLACE FUNCTION MADLIB_SCHEMA.minibatch_preprocessor_dl( dependent_varname VARCHAR, independent_varname VARCHAR ) RETURNS VOID AS $$ - SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 255.0, NULL); + SELECT MADLIB_SCHEMA.minibatch_preprocessor_dl($1, $2, $3, $4, NULL, 1.0, NULL); $$ LANGUAGE sql VOLATILE m4_ifdef(`__HAS_FUNCTION_PROPERTIES__', `MODIFIES SQL DATA', `');
