[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...
Github user asfgit closed the pull request at: https://github.com/apache/madlib/pull/241 ---
[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...
Github user jingyimei commented on a diff in the pull request: https://github.com/apache/madlib/pull/241#discussion_r175957289 --- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in --- @@ -0,0 +1,559 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +""" +@file minibatch_preprocessing.py_in + +""" +from math import ceil +import plpy + +from utilities import add_postfix +from utilities import _assert +from utilities import get_seg_number +from utilities import is_platform_pg +from utilities import is_psql_numeric_type +from utilities import is_string_formatted_as_array_expression +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from utilities import _string_to_array +from utilities import validate_module_input_params +from mean_std_dev_calculator import MeanStdDevCalculator +from validate_args import get_expr_type +from validate_args import output_tbl_valid +from validate_args import _tbl_dimension_rownum + +m4_changequote(`') + +# These are readonly variables, do not modify +MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" +MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" + +class MiniBatchPreProcessor: +""" +This class is responsible for executing the main logic of mini batch +preprocessing, which packs multiple rows of selected columns from the +source table into one row based on the buffer size +""" +def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, **kwargs): +self.schema_madlib = schema_madlib +self.source_table = source_table +self.output_table = output_table +self.dependent_varname = dependent_varname +self.independent_varname = independent_varname +self.buffer_size = buffer_size + +self.module_name = "minibatch_preprocessor" +self.output_standardization_table = add_postfix(self.output_table, + "_standardization") +self.output_summary_table = add_postfix(self.output_table, "_summary") +self._validate_minibatch_preprocessor_params() + +def minibatch_preprocessor(self): +# Get array expressions for both dep and indep variables from the +# MiniBatchQueryFormatter class +dependent_var_dbtype = get_expr_type(self.dependent_varname, + self.source_table) +qry_formatter = 
MiniBatchQueryFormatter(self.source_table) +dep_var_array_str, dep_var_classes_str = qry_formatter.\ +get_dep_var_array_and_classes(self.dependent_varname, + dependent_var_dbtype) +indep_var_array_str = qry_formatter.get_indep_var_array_str( + self.independent_varname) + +standardizer = MiniBatchStandardizer(self.schema_madlib, + self.source_table, + dep_var_array_str, + indep_var_array_str, + self.output_standardization_table) +standardize_query = standardizer.get_query_for_standardizing() + +num_rows_processed, num_missing_rows_skipped = self.\ + _get_skipped_rows_processed_count( +dep_var_array_str, +indep_var_array_str) +calculated_buffer_size = MiniBatchBufferSizeCalculator.\ + calculate_default_buffer_size( + self.buffer_size, +
[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/241#discussion_r175548350 --- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in --- @@ -0,0 +1,559 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +""" +@file minibatch_preprocessing.py_in + +""" +from math import ceil +import plpy + +from utilities import add_postfix +from utilities import _assert +from utilities import get_seg_number +from utilities import is_platform_pg +from utilities import is_psql_numeric_type +from utilities import is_string_formatted_as_array_expression +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from utilities import _string_to_array +from utilities import validate_module_input_params +from mean_std_dev_calculator import MeanStdDevCalculator +from validate_args import get_expr_type +from validate_args import output_tbl_valid +from validate_args import _tbl_dimension_rownum + +m4_changequote(`') + +# These are readonly variables, do not modify +MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" +MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" + +class MiniBatchPreProcessor: +""" +This class is responsible for executing the main logic of mini batch +preprocessing, which packs multiple rows of selected columns from the +source table into one row based on the buffer size +""" +def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, **kwargs): +self.schema_madlib = schema_madlib +self.source_table = source_table +self.output_table = output_table +self.dependent_varname = dependent_varname +self.independent_varname = independent_varname +self.buffer_size = buffer_size + +self.module_name = "minibatch_preprocessor" +self.output_standardization_table = add_postfix(self.output_table, + "_standardization") +self.output_summary_table = add_postfix(self.output_table, "_summary") +self._validate_minibatch_preprocessor_params() + +def minibatch_preprocessor(self): +# Get array expressions for both dep and indep variables from the +# MiniBatchQueryFormatter class +dependent_var_dbtype = get_expr_type(self.dependent_varname, + self.source_table) +qry_formatter = 
MiniBatchQueryFormatter(self.source_table) +dep_var_array_str, dep_var_classes_str = qry_formatter.\ +get_dep_var_array_and_classes(self.dependent_varname, + dependent_var_dbtype) +indep_var_array_str = qry_formatter.get_indep_var_array_str( + self.independent_varname) + +standardizer = MiniBatchStandardizer(self.schema_madlib, + self.source_table, + dep_var_array_str, + indep_var_array_str, + self.output_standardization_table) +standardize_query = standardizer.get_query_for_standardizing() + +num_rows_processed, num_missing_rows_skipped = self.\ + _get_skipped_rows_processed_count( +dep_var_array_str, +indep_var_array_str) +calculated_buffer_size = MiniBatchBufferSizeCalculator.\ + calculate_default_buffer_size( + self.buffer_size, +
[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/241#discussion_r175588969 --- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in --- @@ -0,0 +1,559 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +""" +@file minibatch_preprocessing.py_in + +""" +from math import ceil +import plpy + +from utilities import add_postfix +from utilities import _assert +from utilities import get_seg_number +from utilities import is_platform_pg +from utilities import is_psql_numeric_type +from utilities import is_string_formatted_as_array_expression +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from utilities import _string_to_array +from utilities import validate_module_input_params +from mean_std_dev_calculator import MeanStdDevCalculator +from validate_args import get_expr_type +from validate_args import output_tbl_valid +from validate_args import _tbl_dimension_rownum + +m4_changequote(`') + +# These are readonly variables, do not modify +MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" +MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" + +class MiniBatchPreProcessor: +""" +This class is responsible for executing the main logic of mini batch +preprocessing, which packs multiple rows of selected columns from the +source table into one row based on the buffer size +""" +def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, **kwargs): +self.schema_madlib = schema_madlib +self.source_table = source_table +self.output_table = output_table +self.dependent_varname = dependent_varname +self.independent_varname = independent_varname +self.buffer_size = buffer_size + +self.module_name = "minibatch_preprocessor" +self.output_standardization_table = add_postfix(self.output_table, + "_standardization") +self.output_summary_table = add_postfix(self.output_table, "_summary") +self._validate_minibatch_preprocessor_params() + +def minibatch_preprocessor(self): +# Get array expressions for both dep and indep variables from the +# MiniBatchQueryFormatter class +dependent_var_dbtype = get_expr_type(self.dependent_varname, + self.source_table) +qry_formatter = 
MiniBatchQueryFormatter(self.source_table) +dep_var_array_str, dep_var_classes_str = qry_formatter.\ +get_dep_var_array_and_classes(self.dependent_varname, + dependent_var_dbtype) +indep_var_array_str = qry_formatter.get_indep_var_array_str( + self.independent_varname) + +standardizer = MiniBatchStandardizer(self.schema_madlib, + self.source_table, + dep_var_array_str, + indep_var_array_str, + self.output_standardization_table) +standardize_query = standardizer.get_query_for_standardizing() + +num_rows_processed, num_missing_rows_skipped = self.\ + _get_skipped_rows_processed_count( +dep_var_array_str, +indep_var_array_str) +calculated_buffer_size = MiniBatchBufferSizeCalculator.\ + calculate_default_buffer_size( + self.buffer_size, +
[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/241#discussion_r175593796 --- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in --- @@ -0,0 +1,559 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +""" +@file minibatch_preprocessing.py_in + +""" +from math import ceil +import plpy + +from utilities import add_postfix +from utilities import _assert +from utilities import get_seg_number +from utilities import is_platform_pg +from utilities import is_psql_numeric_type +from utilities import is_string_formatted_as_array_expression +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from utilities import _string_to_array +from utilities import validate_module_input_params +from mean_std_dev_calculator import MeanStdDevCalculator +from validate_args import get_expr_type +from validate_args import output_tbl_valid +from validate_args import _tbl_dimension_rownum + +m4_changequote(`') + +# These are readonly variables, do not modify +MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" +MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" + +class MiniBatchPreProcessor: +""" +This class is responsible for executing the main logic of mini batch +preprocessing, which packs multiple rows of selected columns from the +source table into one row based on the buffer size +""" +def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, **kwargs): +self.schema_madlib = schema_madlib +self.source_table = source_table +self.output_table = output_table +self.dependent_varname = dependent_varname +self.independent_varname = independent_varname +self.buffer_size = buffer_size + +self.module_name = "minibatch_preprocessor" +self.output_standardization_table = add_postfix(self.output_table, + "_standardization") +self.output_summary_table = add_postfix(self.output_table, "_summary") +self._validate_minibatch_preprocessor_params() + +def minibatch_preprocessor(self): +# Get array expressions for both dep and indep variables from the +# MiniBatchQueryFormatter class +dependent_var_dbtype = get_expr_type(self.dependent_varname, + self.source_table) +qry_formatter = 
MiniBatchQueryFormatter(self.source_table) +dep_var_array_str, dep_var_classes_str = qry_formatter.\ +get_dep_var_array_and_classes(self.dependent_varname, + dependent_var_dbtype) +indep_var_array_str = qry_formatter.get_indep_var_array_str( + self.independent_varname) + +standardizer = MiniBatchStandardizer(self.schema_madlib, + self.source_table, + dep_var_array_str, + indep_var_array_str, + self.output_standardization_table) +standardize_query = standardizer.get_query_for_standardizing() + +num_rows_processed, num_missing_rows_skipped = self.\ + _get_skipped_rows_processed_count( +dep_var_array_str, +indep_var_array_str) +calculated_buffer_size = MiniBatchBufferSizeCalculator.\ + calculate_default_buffer_size( + self.buffer_size, +
[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/241#discussion_r175531202 --- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in --- @@ -0,0 +1,559 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +""" +@file minibatch_preprocessing.py_in + +""" +from math import ceil +import plpy + +from utilities import add_postfix +from utilities import _assert +from utilities import get_seg_number +from utilities import is_platform_pg +from utilities import is_psql_numeric_type +from utilities import is_string_formatted_as_array_expression +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from utilities import _string_to_array +from utilities import validate_module_input_params +from mean_std_dev_calculator import MeanStdDevCalculator +from validate_args import get_expr_type +from validate_args import output_tbl_valid +from validate_args import _tbl_dimension_rownum + +m4_changequote(`') + +# These are readonly variables, do not modify +MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" +MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" + +class MiniBatchPreProcessor: +""" +This class is responsible for executing the main logic of mini batch +preprocessing, which packs multiple rows of selected columns from the +source table into one row based on the buffer size +""" +def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, **kwargs): +self.schema_madlib = schema_madlib +self.source_table = source_table +self.output_table = output_table +self.dependent_varname = dependent_varname +self.independent_varname = independent_varname +self.buffer_size = buffer_size + +self.module_name = "minibatch_preprocessor" +self.output_standardization_table = add_postfix(self.output_table, + "_standardization") +self.output_summary_table = add_postfix(self.output_table, "_summary") +self._validate_minibatch_preprocessor_params() + +def minibatch_preprocessor(self): +# Get array expressions for both dep and indep variables from the +# MiniBatchQueryFormatter class +dependent_var_dbtype = get_expr_type(self.dependent_varname, + self.source_table) +qry_formatter = 
MiniBatchQueryFormatter(self.source_table) +dep_var_array_str, dep_var_classes_str = qry_formatter.\ +get_dep_var_array_and_classes(self.dependent_varname, + dependent_var_dbtype) +indep_var_array_str = qry_formatter.get_indep_var_array_str( + self.independent_varname) + +standardizer = MiniBatchStandardizer(self.schema_madlib, + self.source_table, + dep_var_array_str, + indep_var_array_str, + self.output_standardization_table) +standardize_query = standardizer.get_query_for_standardizing() + +num_rows_processed, num_missing_rows_skipped = self.\ + _get_skipped_rows_processed_count( +dep_var_array_str, +indep_var_array_str) +calculated_buffer_size = MiniBatchBufferSizeCalculator.\ + calculate_default_buffer_size( + self.buffer_size, +
[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/241#discussion_r175522378 --- Diff: src/ports/postgres/modules/utilities/utilities.py_in --- @@ -794,6 +794,41 @@ def collate_plpy_result(plpy_result_rows): # -- +def validate_module_input_params(source_table, output_table, independent_varname, + dependent_varname, module_name, **kwargs): --- End diff -- How about adding an optional param to deal with checking for residual output tables (the summary and standardization tables)? It could take a list of suffixes to check for. ---
[GitHub] madlib pull request #241: MiniBatch Pre-Processor: Add new module minibatch_...
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/241#discussion_r175585050 --- Diff: src/ports/postgres/modules/utilities/minibatch_preprocessing.py_in --- @@ -0,0 +1,559 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ + +""" +@file minibatch_preprocessing.py_in + +""" +from math import ceil +import plpy + +from utilities import add_postfix +from utilities import _assert +from utilities import get_seg_number +from utilities import is_platform_pg +from utilities import is_psql_numeric_type +from utilities import is_string_formatted_as_array_expression +from utilities import py_list_to_sql_string +from utilities import split_quoted_delimited_str +from utilities import _string_to_array +from utilities import validate_module_input_params +from mean_std_dev_calculator import MeanStdDevCalculator +from validate_args import get_expr_type +from validate_args import output_tbl_valid +from validate_args import _tbl_dimension_rownum + +m4_changequote(`') + +# These are readonly variables, do not modify +MINIBATCH_OUTPUT_DEPENDENT_COLNAME = "dependent_varname" +MINIBATCH_OUTPUT_INDEPENDENT_COLNAME = "independent_varname" + +class MiniBatchPreProcessor: +""" +This class is responsible for executing the main logic of mini batch +preprocessing, which packs multiple rows of selected columns from the +source table into one row based on the buffer size +""" +def __init__(self, schema_madlib, source_table, output_table, + dependent_varname, independent_varname, buffer_size, **kwargs): +self.schema_madlib = schema_madlib +self.source_table = source_table +self.output_table = output_table +self.dependent_varname = dependent_varname +self.independent_varname = independent_varname +self.buffer_size = buffer_size + +self.module_name = "minibatch_preprocessor" +self.output_standardization_table = add_postfix(self.output_table, + "_standardization") +self.output_summary_table = add_postfix(self.output_table, "_summary") +self._validate_minibatch_preprocessor_params() + +def minibatch_preprocessor(self): +# Get array expressions for both dep and indep variables from the +# MiniBatchQueryFormatter class +dependent_var_dbtype = get_expr_type(self.dependent_varname, + self.source_table) +qry_formatter = 
MiniBatchQueryFormatter(self.source_table) +dep_var_array_str, dep_var_classes_str = qry_formatter.\ +get_dep_var_array_and_classes(self.dependent_varname, + dependent_var_dbtype) +indep_var_array_str = qry_formatter.get_indep_var_array_str( + self.independent_varname) + +standardizer = MiniBatchStandardizer(self.schema_madlib, + self.source_table, + dep_var_array_str, + indep_var_array_str, + self.output_standardization_table) +standardize_query = standardizer.get_query_for_standardizing() + +num_rows_processed, num_missing_rows_skipped = self.\ + _get_skipped_rows_processed_count( +dep_var_array_str, +indep_var_array_str) +calculated_buffer_size = MiniBatchBufferSizeCalculator.\ + calculate_default_buffer_size( + self.buffer_size, +