[GitHub] madlib pull request #230: Balanced sets final
Github user asfgit closed the pull request at: https://github.com/apache/madlib/pull/230 ---
[GitHub] madlib pull request #230: Balanced sets final
Github user njayaram2 commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r166056096 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of --- End diff -- There is precedent for supporting prefixes for parameter values, in modules such as SVM. Yes, the error messages could be similar. ---
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165529067 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165523942 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165737045 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165736819 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165526039 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165527448 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165479832 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165505215 --- Diff: src/ports/postgres/modules/sample/balance_sample.sql_in --- @@ -0,0 +1,355 @@ +/* --- *//** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * + * @file balance_sample.sql_in + * + * @brief SQL functions for balanced data sets sampling. + * @date 12/14/2017 + * + * @sa Given a table, balanced sampling returns a sampled data set + * with specified balancing for each class (defaults to uniform sampling). + * + *//* --- */ + +m4_include(`SQLCommon.m4') + + +/** +@addtogroup grp_balance_sampling + +Contents + +Balanced Data Sets Sampling +Examples + + + +@brief A method for independently sampling classes to receive a +balanced data set. +It is commonly used where classes have a highly imbalanced ratio. +It ensures the subclasses are adequately represented in the sample. + +@anchor strs +@par Balanced Sampling + + +balance_sample( source_table, +output_table, +class_col, +class_sizes, +output_table_size, +grouping_cols, +with_replacement, +keep_null + ) + + +\b Arguments + +source_table +TEXT. Name of the table containing the input data. + +output_table +TEXT. 
Name of output table that contains the sampled data. +The output table contains all columns present in the source +table. + +class_col +TEXT, Name of the column containing the class to be balanced. + + +class_sizes (optional) +VARCHAR, default 'uniform'. + +@note +Current implementation only supports 'undersample'. + +Parameter to define the size of the different class values. +(Class values are sometimes also called levels). + +Can take the following forms: + + +'uniform': +All class values will be resampled to have the same number of rows. + +'undersample': +Under-sample such that all class values end up with the same number of +observations as the minority class. Done without replacement by default +unless the parameter 'with_replacement' is set to TRUE. + +'oversample': +Over-sample with replacement such that all class values end up with the +same number of observations as the majority class. Not affected by the +parameter 'with_replacement' since over-sampling is always done with +replacement. + + + +You can also explicitly set class size in a string containing a +comma-delimited list. Order does not matter and all class values do not +need to be specified. Use the format 'class_value_1=x, class_value_2=y, ...' +where the class value in the list must exist in the column 'class_col'. + +E.g., 'male=3000, female=7000' means you want to resample the dataset +to result in 3000 male and 7000 female rows in the 'output_table'. + +@note +The allowed names for class values follows object naming rules in +PostgreSQL [6]. Quoted identifiers are allowed and should be enclosed +in double quotes " in the usual way. If for some reason the class values +in the examples above were "MaLe" and "FeMaLe" then the comma delimited +list for 'class_size' would be: '"MaLe"=3000, "FeMaLe"=7000'. + + +output_table_size (optional) +INTEGER, default NULL. Desired size of the output data set. 
+ +This parameter is ignored if 'class_size' parameter is set to either +'oversample' or 'undersample' since output table size is already determined. +If NULL, the resulting output table size will depend on the settings for the
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165530126 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165733978 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165516241 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: --- End diff -- Do we need to have `supported_strategies` as an argument, why not always use `[UNIFORM, UNDERSAMPLE, OVERSAMPLE]` ? Do we foresee any other sampling strategies ? ---
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165537394 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found
[GitHub] madlib pull request #230: Balanced sets final
Github user kaknikhil commented on a diff in the pull request: https://github.com/apache/madlib/pull/230#discussion_r165734791 --- Diff: src/ports/postgres/modules/sample/balance_sample.py_in --- @@ -0,0 +1,748 @@ +# coding=utf-8 +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file EXCEPT in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +m4_changequote(`') + +import math + +if __name__ != "__main__": +import plpy +from utilities.control import MinWarning +from utilities.utilities import _assert +from utilities.utilities import extract_keyvalue_params +from utilities.utilities import unique_string +from utilities.validate_args import columns_exist_in_table +from utilities.validate_args import get_cols +from utilities.validate_args import table_exists +from utilities.validate_args import table_is_empty +else: +# Used only for Unit Testing +# FIXME: repeating a function from utilities that is needed by the unit test. +# This should be removed once a unittest framework in used for testing. +import random +import time + +def unique_string(desp='', **kwargs): +""" +Generate random remporary names for temp table and other names. +It has a SQL interface so both SQL and Python functions can call it. 
+""" +r1 = random.randint(1, 1) +r2 = int(time.time()) +r3 = int(time.time()) % random.randint(1, 1) +u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" + str(r3) + "__" +return u_string +# -- + +UNIFORM = 'uniform' +UNDERSAMPLE = 'undersample' +OVERSAMPLE = 'oversample' +NOSAMPLE = 'nosample' + +NEW_ID_COLUMN = '__madlib_id__' +NULL_IDENTIFIER = '__madlib_null_id__' + +def _get_frequency_distribution(source_table, class_col): +""" Returns a dict containing the number of rows associated with each class +level. Each class level value is converted to a string using ::text. +""" +query_result = plpy.execute(""" +SELECT {class_col}::text AS classes, + count(*) AS class_count +FROM {source_table} +GROUP BY {class_col} + """.format(**locals())) +actual_level_counts = {} +for each_row in query_result: +level = each_row['classes'] +if level: +level = level.strip() +actual_level_counts[level] = each_row['class_count'] +return actual_level_counts + + +def _validate_and_get_sampling_strategy(sampling_strategy_str, output_table_size, +supported_strategies=None, default=UNIFORM): +""" Returns the sampling strategy based on the class_sizes input param. +@param sampling_strategy_str The sampling strategy specified by the + user (class_sizes param) +@returns: +Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is UNIFORM. +""" +if not sampling_strategy_str: +sampling_strategy_str = default +else: +if len(sampling_strategy_str) < 3: +# Require at least 3 characters since UNIFORM and UNDERSAMPLE have +# common prefix substring +plpy.error("Sample: Invalid class_sizes parameter") + +if not supported_strategies: +supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE] +try: +# allow user to specify a prefix substring of +# supported strategies. +sampling_strategy_str = next(x for x in supported_strategies + if x.startswith(sampling_strategy_str.lower())) +except StopIteration: +# next() returns a StopIteration if no element found