[GitHub] madlib pull request #230: Balanced sets final

2018-02-08 Thread asfgit
Github user asfgit closed the pull request at:

https://github.com/apache/madlib/pull/230


---


[GitHub] madlib pull request #230: Balanced sets final

2018-02-05 Thread njayaram2
Github user njayaram2 commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r166056096
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework in used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random remporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
--- End diff --

There is precedent for supporting prefixes of parameter values, in modules 
such as SVM.

Yes, the error messages could be similar.


---


[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165529067
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework in used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random remporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() returns a StopIteration if no element found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165523942
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework in used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random remporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() returns a StopIteration if no element found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165737045
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework in used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random remporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() returns a StopIteration if no element found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165736819
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework in used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random remporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() returns a StopIteration if no element found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165526039
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework in used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random remporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() returns a StopIteration if no element found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165527448
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework in used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random remporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() returns a StopIteration if no element found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165479832
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework in used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random remporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() returns a StopIteration if no element found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165505215
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.sql_in ---
@@ -0,0 +1,355 @@
+/* --- 
*//**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ *
+ * @file balance_sample.sql_in
+ *
+ * @brief SQL functions for balanced data sets sampling.
+ * @date 12/14/2017
+ *
+ * @sa Given a table, balanced sampling returns a sampled data set
+ * with specified balancing for each class (defaults to uniform sampling).
+ *
+ *//* 
--- */
+
+m4_include(`SQLCommon.m4')
+
+
+/**
+@addtogroup grp_balance_sampling
+
+Contents
+
+Balanced Data Sets Sampling
+Examples
+
+
+
+@brief A method for independently sampling classes to receive a
+balanced data set.
+It is commonly used where classes have a highly imbalanced ratio.
+It ensures the subclasses are adequately represented in the sample.
+
+@anchor strs
+@par Balanced Sampling
+
+
+balance_sample( source_table,
+output_table,
+class_col,
+class_sizes,
+output_table_size,
+grouping_cols,
+with_replacement,
+keep_null
+  )
+
+
+\b Arguments
+
+source_table
+TEXT. Name of the table containing the input data.
+
+output_table
+TEXT. Name of output table that contains the sampled data.
+The output table contains all columns present in the source
+table.
+
+class_col
+TEXT. Name of the column containing the class to be balanced.
+
+
+class_sizes (optional)
+VARCHAR, default ‘uniform’.
+
+@note
+Current implementation only supports 'undersample'.
+
+Parameter to define the size of the different class values.
+(Class values are sometimes also called levels).
+
+Can take the following forms:
+
+
+‘uniform’:
+All class values will be resampled to have the same number of rows.
+
+'undersample':
+Under-sample such that all class values end up with the same number of
+observations as the minority class.  Done without replacement by default
+unless the parameter ‘with_replacement’ is set to TRUE.
+
+'oversample':
+Over-sample with replacement such that all class values end up with the
+same number of observations as the majority class.  Not affected by the
+parameter ‘with_replacement’ since over-sampling is always done with
+replacement.
+
+
+
+You can also explicitly set class size in a string containing a
+comma-delimited list. Order does not matter and all class values do not
+need to be specified.  Use the format “class_value_1=x, class_value_2=y, 
...”
+where the class value in the list must exist in the column ‘class_col’.
+
+E.g.,  ‘male=3000, female=7000’ means you want to resample the dataset
+to result in 3000 male and 7000 female rows in the ‘output_table’.
+
+@note
+The allowed names for class values follow object naming rules in
+PostgreSQL [6].  Quoted identifiers are allowed and should be enclosed
+in double quotes “ in the usual way.  If for some reason the class values
+in the examples above were “MaLe” and “FeMaLe” then the comma 
delimited
+list for ‘class_sizes’ would be:  ‘“MaLe”=3000, 
“FeMaLe”=7000’.
+
+
+output_table_size (optional)
+INTEGER, default NULL.  Desired size of the output data set.
+
+This parameter is ignored if the ‘class_sizes’ parameter is set to either
+‘oversample’ or ‘undersample’ since output table size is already 
determined.
+If NULL, the resulting output table size will depend on the settings for 
the

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165530126
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework is used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random temporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() raises StopIteration if no element is found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165733978
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework is used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random temporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() raises StopIteration if no element is found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165516241
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework is used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random temporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
--- End diff --

Do we need to have `supported_strategies` as an argument? Why not always 
use `[UNIFORM, UNDERSAMPLE, OVERSAMPLE]`? Do we foresee any other sampling 
strategies?


---


[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165537394
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework is used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random temporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() raises StopIteration if no element is found
 

[GitHub] madlib pull request #230: Balanced sets final

2018-02-02 Thread kaknikhil
Github user kaknikhil commented on a diff in the pull request:

https://github.com/apache/madlib/pull/230#discussion_r165734791
  
--- Diff: src/ports/postgres/modules/sample/balance_sample.py_in ---
@@ -0,0 +1,748 @@
+# coding=utf-8
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file EXCEPT in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+m4_changequote(`')
+
+import math
+
+if __name__ != "__main__":
+import plpy
+from utilities.control import MinWarning
+from utilities.utilities import _assert
+from utilities.utilities import extract_keyvalue_params
+from utilities.utilities import unique_string
+from utilities.validate_args import columns_exist_in_table
+from utilities.validate_args import get_cols
+from utilities.validate_args import table_exists
+from utilities.validate_args import table_is_empty
+else:
+# Used only for Unit Testing
+# FIXME: repeating a function from utilities that is needed by the 
unit test.
+# This should be removed once a unittest framework is used for testing.
+import random
+import time
+
+def unique_string(desp='', **kwargs):
+"""
+Generate random temporary names for temp table and other names.
+It has a SQL interface so both SQL and Python functions can call 
it.
+"""
+r1 = random.randint(1, 1)
+r2 = int(time.time())
+r3 = int(time.time()) % random.randint(1, 1)
+u_string = "__madlib_temp_" + desp + str(r1) + "_" + str(r2) + "_" 
+ str(r3) + "__"
+return u_string
+# 
--
+
+UNIFORM = 'uniform'
+UNDERSAMPLE = 'undersample'
+OVERSAMPLE = 'oversample'
+NOSAMPLE = 'nosample'
+
+NEW_ID_COLUMN = '__madlib_id__'
+NULL_IDENTIFIER = '__madlib_null_id__'
+
+def _get_frequency_distribution(source_table, class_col):
+""" Returns a dict containing the number of rows associated with each 
class
+level. Each class level value is converted to a string using 
::text.
+"""
+query_result = plpy.execute("""
+SELECT {class_col}::text AS classes,
+   count(*) AS class_count
+FROM {source_table}
+GROUP BY {class_col}
+ """.format(**locals()))
+actual_level_counts = {}
+for each_row in query_result:
+level = each_row['classes']
+if level:
+level = level.strip()
+actual_level_counts[level] = each_row['class_count']
+return actual_level_counts
+
+
+def _validate_and_get_sampling_strategy(sampling_strategy_str, 
output_table_size,
+supported_strategies=None, default=UNIFORM):
+""" Returns the sampling strategy based on the class_sizes input param.
+@param sampling_strategy_str The sampling strategy specified by the
+ user (class_sizes param)
+@returns:
+Str. One of [UNIFORM, UNDERSAMPLE, OVERSAMPLE]. Default is 
UNIFORM.
+"""
+if not sampling_strategy_str:
+sampling_strategy_str = default
+else:
+if len(sampling_strategy_str) < 3:
+# Require at least 3 characters since UNIFORM and UNDERSAMPLE 
have
+# common prefix substring
+plpy.error("Sample: Invalid class_sizes parameter")
+
+if not supported_strategies:
+supported_strategies = [UNIFORM, UNDERSAMPLE, OVERSAMPLE]
+try:
+# allow user to specify a prefix substring of
+# supported strategies.
+sampling_strategy_str = next(x for x in supported_strategies
+ if 
x.startswith(sampling_strategy_str.lower()))
+except StopIteration:
+# next() raises StopIteration if no element is found