[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...

2018-03-21 Thread asfgit
Github user asfgit closed the pull request at:

https://github.com/apache/madlib/pull/241


---


[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...

2018-03-20 Thread jingyimei
Github user jingyimei commented on a diff in the pull request:

https://github.com/apache/madlib/pull/241#discussion_r175957289
  
--- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in ---
@@ -0,0 +1,559 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+@file minibatch_preprocessing.py_in
+
+"""
+from math import ceil
+import plpy
+
+from utilities import add_postfix
+from utilities import _assert
+from utilities import get_seg_number
+from utilities import is_platform_pg
+from utilities import is_psql_numeric_type
+from utilities import is_string_formatted_as_array_expression
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from utilities import _string_to_array
+from utilities import validate_module_input_params
+from mean_std_dev_calculator import MeanStdDevCalculator
+from validate_args import get_expr_type
+from validate_args import output_tbl_valid
+from validate_args import _tbl_dimension_rownum
+
+m4_changequote(`')
+
+# These are readonly variables, do not modify
+MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname"
+MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname"
+
+class MiniBatchPreProcessor:
+"""
+This class is responsible for executing the main logic of mini batch
+preprocessing, which packs multiple rows of selected columns from the
+source table into one row based on the buffer size
+"""
+def __init__(self, schema_madlib, source_table, output_table,
+  dependent_varname, independent_varname, buffer_size, 
**kwargs):
+self.schema_madlib = schema_madlib
+self.source_table = source_table
+self.output_table = output_table
+self.dependent_varname = dependent_varname
+self.independent_varname = independent_varname
+self.buffer_size = buffer_size
+
+self.module_name = "minibatch_preprocessor"
+self.output_standardization_table = add_postfix(self.output_table,
+   "_standardization")
+self.output_summary_table = add_postfix(self.output_table, 
"_summary")
+self._validate_minibatch_preprocessor_params()
+
+def minibatch_preprocessor(self):
+# Get array expressions for both dep and indep variables from the
+# MiniBatchQueryFormatter class
+dependent_var_dbtype = get_expr_type(self.dependent_varname,
+ self.source_table)
+qry_formatter = MiniBatchQueryFormatter(self.source_table)
+dep_var_array_str, dep_var_classes_str = qry_formatter.\
+get_dep_var_array_and_classes(self.dependent_varname,
+  dependent_var_dbtype)
+indep_var_array_str = qry_formatter.get_indep_var_array_str(
+  self.independent_varname)
+
+standardizer = MiniBatchStandardizer(self.schema_madlib,
+ self.source_table,
+ dep_var_array_str,
+ indep_var_array_str,
+ 
self.output_standardization_table)
+standardize_query = standardizer.get_query_for_standardizing()
+
+num_rows_processed, num_missing_rows_skipped = self.\
+
_get_skipped_rows_processed_count(
+dep_var_array_str,
+indep_var_array_str)
+calculated_buffer_size = MiniBatchBufferSizeCalculator.\
+ calculate_default_buffer_size(
+ self.buffer_size,
+

[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...

2018-03-19 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/241#discussion_r175548350
  
--- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in ---
@@ -0,0 +1,559 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+@file minibatch_preprocessing.py_in
+
+"""
+from math import ceil
+import plpy
+
+from utilities import add_postfix
+from utilities import _assert
+from utilities import get_seg_number
+from utilities import is_platform_pg
+from utilities import is_psql_numeric_type
+from utilities import is_string_formatted_as_array_expression
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from utilities import _string_to_array
+from utilities import validate_module_input_params
+from mean_std_dev_calculator import MeanStdDevCalculator
+from validate_args import get_expr_type
+from validate_args import output_tbl_valid
+from validate_args import _tbl_dimension_rownum
+
+m4_changequote(`')
+
+# These are readonly variables, do not modify
+MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname"
+MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname"
+
+class MiniBatchPreProcessor:
+"""
+This class is responsible for executing the main logic of mini batch
+preprocessing, which packs multiple rows of selected columns from the
+source table into one row based on the buffer size
+"""
+def __init__(self, schema_madlib, source_table, output_table,
+  dependent_varname, independent_varname, buffer_size, 
**kwargs):
+self.schema_madlib = schema_madlib
+self.source_table = source_table
+self.output_table = output_table
+self.dependent_varname = dependent_varname
+self.independent_varname = independent_varname
+self.buffer_size = buffer_size
+
+self.module_name = "minibatch_preprocessor"
+self.output_standardization_table = add_postfix(self.output_table,
+   "_standardization")
+self.output_summary_table = add_postfix(self.output_table, 
"_summary")
+self._validate_minibatch_preprocessor_params()
+
+def minibatch_preprocessor(self):
+# Get array expressions for both dep and indep variables from the
+# MiniBatchQueryFormatter class
+dependent_var_dbtype = get_expr_type(self.dependent_varname,
+ self.source_table)
+qry_formatter = MiniBatchQueryFormatter(self.source_table)
+dep_var_array_str, dep_var_classes_str = qry_formatter.\
+get_dep_var_array_and_classes(self.dependent_varname,
+  dependent_var_dbtype)
+indep_var_array_str = qry_formatter.get_indep_var_array_str(
+  self.independent_varname)
+
+standardizer = MiniBatchStandardizer(self.schema_madlib,
+ self.source_table,
+ dep_var_array_str,
+ indep_var_array_str,
+ 
self.output_standardization_table)
+standardize_query = standardizer.get_query_for_standardizing()
+
+num_rows_processed, num_missing_rows_skipped = self.\
+
_get_skipped_rows_processed_count(
+dep_var_array_str,
+indep_var_array_str)
+calculated_buffer_size = MiniBatchBufferSizeCalculator.\
+ calculate_default_buffer_size(
+ self.buffer_size,
+

[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...

2018-03-19 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/241#discussion_r175588969
  
--- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in ---
@@ -0,0 +1,559 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+@file minibatch_preprocessing.py_in
+
+"""
+from math import ceil
+import plpy
+
+from utilities import add_postfix
+from utilities import _assert
+from utilities import get_seg_number
+from utilities import is_platform_pg
+from utilities import is_psql_numeric_type
+from utilities import is_string_formatted_as_array_expression
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from utilities import _string_to_array
+from utilities import validate_module_input_params
+from mean_std_dev_calculator import MeanStdDevCalculator
+from validate_args import get_expr_type
+from validate_args import output_tbl_valid
+from validate_args import _tbl_dimension_rownum
+
+m4_changequote(`')
+
+# These are readonly variables, do not modify
+MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname"
+MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname"
+
+class MiniBatchPreProcessor:
+"""
+This class is responsible for executing the main logic of mini batch
+preprocessing, which packs multiple rows of selected columns from the
+source table into one row based on the buffer size
+"""
+def __init__(self, schema_madlib, source_table, output_table,
+  dependent_varname, independent_varname, buffer_size, 
**kwargs):
+self.schema_madlib = schema_madlib
+self.source_table = source_table
+self.output_table = output_table
+self.dependent_varname = dependent_varname
+self.independent_varname = independent_varname
+self.buffer_size = buffer_size
+
+self.module_name = "minibatch_preprocessor"
+self.output_standardization_table = add_postfix(self.output_table,
+   "_standardization")
+self.output_summary_table = add_postfix(self.output_table, 
"_summary")
+self._validate_minibatch_preprocessor_params()
+
+def minibatch_preprocessor(self):
+# Get array expressions for both dep and indep variables from the
+# MiniBatchQueryFormatter class
+dependent_var_dbtype = get_expr_type(self.dependent_varname,
+ self.source_table)
+qry_formatter = MiniBatchQueryFormatter(self.source_table)
+dep_var_array_str, dep_var_classes_str = qry_formatter.\
+get_dep_var_array_and_classes(self.dependent_varname,
+  dependent_var_dbtype)
+indep_var_array_str = qry_formatter.get_indep_var_array_str(
+  self.independent_varname)
+
+standardizer = MiniBatchStandardizer(self.schema_madlib,
+ self.source_table,
+ dep_var_array_str,
+ indep_var_array_str,
+ 
self.output_standardization_table)
+standardize_query = standardizer.get_query_for_standardizing()
+
+num_rows_processed, num_missing_rows_skipped = self.\
+
_get_skipped_rows_processed_count(
+dep_var_array_str,
+indep_var_array_str)
+calculated_buffer_size = MiniBatchBufferSizeCalculator.\
+ calculate_default_buffer_size(
+ self.buffer_size,
+

[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...

2018-03-19 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/241#discussion_r175593796
  
--- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in ---
@@ -0,0 +1,559 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+@file minibatch_preprocessing.py_in
+
+"""
+from math import ceil
+import plpy
+
+from utilities import add_postfix
+from utilities import _assert
+from utilities import get_seg_number
+from utilities import is_platform_pg
+from utilities import is_psql_numeric_type
+from utilities import is_string_formatted_as_array_expression
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from utilities import _string_to_array
+from utilities import validate_module_input_params
+from mean_std_dev_calculator import MeanStdDevCalculator
+from validate_args import get_expr_type
+from validate_args import output_tbl_valid
+from validate_args import _tbl_dimension_rownum
+
+m4_changequote(`')
+
+# These are readonly variables, do not modify
+MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname"
+MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname"
+
+class MiniBatchPreProcessor:
+"""
+This class is responsible for executing the main logic of mini batch
+preprocessing, which packs multiple rows of selected columns from the
+source table into one row based on the buffer size
+"""
+def __init__(self, schema_madlib, source_table, output_table,
+  dependent_varname, independent_varname, buffer_size, 
**kwargs):
+self.schema_madlib = schema_madlib
+self.source_table = source_table
+self.output_table = output_table
+self.dependent_varname = dependent_varname
+self.independent_varname = independent_varname
+self.buffer_size = buffer_size
+
+self.module_name = "minibatch_preprocessor"
+self.output_standardization_table = add_postfix(self.output_table,
+   "_standardization")
+self.output_summary_table = add_postfix(self.output_table, 
"_summary")
+self._validate_minibatch_preprocessor_params()
+
+def minibatch_preprocessor(self):
+# Get array expressions for both dep and indep variables from the
+# MiniBatchQueryFormatter class
+dependent_var_dbtype = get_expr_type(self.dependent_varname,
+ self.source_table)
+qry_formatter = MiniBatchQueryFormatter(self.source_table)
+dep_var_array_str, dep_var_classes_str = qry_formatter.\
+get_dep_var_array_and_classes(self.dependent_varname,
+  dependent_var_dbtype)
+indep_var_array_str = qry_formatter.get_indep_var_array_str(
+  self.independent_varname)
+
+standardizer = MiniBatchStandardizer(self.schema_madlib,
+ self.source_table,
+ dep_var_array_str,
+ indep_var_array_str,
+ 
self.output_standardization_table)
+standardize_query = standardizer.get_query_for_standardizing()
+
+num_rows_processed, num_missing_rows_skipped = self.\
+
_get_skipped_rows_processed_count(
+dep_var_array_str,
+indep_var_array_str)
+calculated_buffer_size = MiniBatchBufferSizeCalculator.\
+ calculate_default_buffer_size(
+ self.buffer_size,
+

[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...

2018-03-19 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/241#discussion_r175531202
  
--- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in ---
@@ -0,0 +1,559 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+@file minibatch_preprocessing.py_in
+
+"""
+from math import ceil
+import plpy
+
+from utilities import add_postfix
+from utilities import _assert
+from utilities import get_seg_number
+from utilities import is_platform_pg
+from utilities import is_psql_numeric_type
+from utilities import is_string_formatted_as_array_expression
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from utilities import _string_to_array
+from utilities import validate_module_input_params
+from mean_std_dev_calculator import MeanStdDevCalculator
+from validate_args import get_expr_type
+from validate_args import output_tbl_valid
+from validate_args import _tbl_dimension_rownum
+
+m4_changequote(`')
+
+# These are readonly variables, do not modify
+MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname"
+MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname"
+
+class MiniBatchPreProcessor:
+"""
+This class is responsible for executing the main logic of mini batch
+preprocessing, which packs multiple rows of selected columns from the
+source table into one row based on the buffer size
+"""
+def __init__(self, schema_madlib, source_table, output_table,
+  dependent_varname, independent_varname, buffer_size, 
**kwargs):
+self.schema_madlib = schema_madlib
+self.source_table = source_table
+self.output_table = output_table
+self.dependent_varname = dependent_varname
+self.independent_varname = independent_varname
+self.buffer_size = buffer_size
+
+self.module_name = "minibatch_preprocessor"
+self.output_standardization_table = add_postfix(self.output_table,
+   "_standardization")
+self.output_summary_table = add_postfix(self.output_table, 
"_summary")
+self._validate_minibatch_preprocessor_params()
+
+def minibatch_preprocessor(self):
+# Get array expressions for both dep and indep variables from the
+# MiniBatchQueryFormatter class
+dependent_var_dbtype = get_expr_type(self.dependent_varname,
+ self.source_table)
+qry_formatter = MiniBatchQueryFormatter(self.source_table)
+dep_var_array_str, dep_var_classes_str = qry_formatter.\
+get_dep_var_array_and_classes(self.dependent_varname,
+  dependent_var_dbtype)
+indep_var_array_str = qry_formatter.get_indep_var_array_str(
+  self.independent_varname)
+
+standardizer = MiniBatchStandardizer(self.schema_madlib,
+ self.source_table,
+ dep_var_array_str,
+ indep_var_array_str,
+ 
self.output_standardization_table)
+standardize_query = standardizer.get_query_for_standardizing()
+
+num_rows_processed, num_missing_rows_skipped = self.\
+
_get_skipped_rows_processed_count(
+dep_var_array_str,
+indep_var_array_str)
+calculated_buffer_size = MiniBatchBufferSizeCalculator.\
+ calculate_default_buffer_size(
+ self.buffer_size,
+

[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...

2018-03-19 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/241#discussion_r175522378
  
--- Diff: src/ports/postgres/modules/utilities/utilities.py_in ---
@@ -794,6 +794,41 @@ def collate_plpy_result(plpy_result_rows):
 # 
--
 
 
+def validate_module_input_params(source_table, output_table, 
independent_varname,
+  dependent_varname, module_name, **kwargs):
--- End diff --

How about having an optional parameter to deal with checking for residual 
output tables (summary and standardization tables)? We could take a list of 
suffixes to check for.


---


[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...

2018-03-19 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/241#discussion_r175585050
  
--- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in ---
@@ -0,0 +1,559 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+
+"""
+@file minibatch_preprocessing.py_in
+
+"""
+from math import ceil
+import plpy
+
+from utilities import add_postfix
+from utilities import _assert
+from utilities import get_seg_number
+from utilities import is_platform_pg
+from utilities import is_psql_numeric_type
+from utilities import is_string_formatted_as_array_expression
+from utilities import py_list_to_sql_string
+from utilities import split_quoted_delimited_str
+from utilities import _string_to_array
+from utilities import validate_module_input_params
+from mean_std_dev_calculator import MeanStdDevCalculator
+from validate_args import get_expr_type
+from validate_args import output_tbl_valid
+from validate_args import _tbl_dimension_rownum
+
+m4_changequote(`')
+
+# These are readonly variables, do not modify
+MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname"
+MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname"
+
+class MiniBatchPreProcessor:
+"""
+This class is responsible for executing the main logic of mini batch
+preprocessing, which packs multiple rows of selected columns from the
+source table into one row based on the buffer size
+"""
+def __init__(self, schema_madlib, source_table, output_table,
+  dependent_varname, independent_varname, buffer_size, 
**kwargs):
+self.schema_madlib = schema_madlib
+self.source_table = source_table
+self.output_table = output_table
+self.dependent_varname = dependent_varname
+self.independent_varname = independent_varname
+self.buffer_size = buffer_size
+
+self.module_name = "minibatch_preprocessor"
+self.output_standardization_table = add_postfix(self.output_table,
+   "_standardization")
+self.output_summary_table = add_postfix(self.output_table, 
"_summary")
+self._validate_minibatch_preprocessor_params()
+
+def minibatch_preprocessor(self):
+# Get array expressions for both dep and indep variables from the
+# MiniBatchQueryFormatter class
+dependent_var_dbtype = get_expr_type(self.dependent_varname,
+ self.source_table)
+qry_formatter = MiniBatchQueryFormatter(self.source_table)
+dep_var_array_str, dep_var_classes_str = qry_formatter.\
+get_dep_var_array_and_classes(self.dependent_varname,
+  dependent_var_dbtype)
+indep_var_array_str = qry_formatter.get_indep_var_array_str(
+  self.independent_varname)
+
+standardizer = MiniBatchStandardizer(self.schema_madlib,
+ self.source_table,
+ dep_var_array_str,
+ indep_var_array_str,
+ 
self.output_standardization_table)
+standardize_query = standardizer.get_query_for_standardizing()
+
+num_rows_processed, num_missing_rows_skipped = self.\
+
_get_skipped_rows_processed_count(
+dep_var_array_str,
+indep_var_array_str)
+calculated_buffer_size = MiniBatchBufferSizeCalculator.\
+ calculate_default_buffer_size(
+ self.buffer_size,
+