This is an automated email from the ASF dual-hosted git repository.

khannaekta pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/madlib.git
commit 2d6e599bf9ab393c0e8c6b6a81b781a1a4e1088c Author: Ekta Khanna <[email protected]> AuthorDate: Fri Oct 9 17:32:26 2020 -0700 DL: [AutoML] Add new class for Distribution rules JIRA: MADLIB-1453 Co-authored-by: Nikhil Kak <[email protected]> --- .../deep_learning/input_data_preprocessor.py_in | 17 ++++++++------- .../deep_learning/madlib_keras_automl.py_in | 6 +++--- .../deep_learning/madlib_keras_helper.py_in | 3 +-- .../deep_learning/madlib_keras_validator.py_in | 1 + .../test/unit_tests/test_madlib_keras.py_in | 24 ++++++++++++++-------- .../test/unit_tests/test_madlib_keras_automl.py_in | 8 ++++---- 6 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in index 1d395a6..4b27642 100644 --- a/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in +++ b/src/ports/postgres/modules/deep_learning/input_data_preprocessor.py_in @@ -51,6 +51,9 @@ from madlib_keras_helper import * import time NUM_CLASSES_COLNAME = "num_classes" +class DistributionRulesOptions: + ALL_SEGMENTS = 'all_segments' + GPU_SEGMENTS = 'gpu_segments' class InputDataPreprocessorDL(object): def __init__(self, schema_madlib, source_table, output_table, @@ -64,12 +67,12 @@ class InputDataPreprocessorDL(object): self.buffer_size = buffer_size self.normalizing_const = normalizing_const self.num_classes = num_classes - self.distribution_rules = distribution_rules if distribution_rules else 'all_segments' + self.distribution_rules = distribution_rules.lower() if distribution_rules else DistributionRulesOptions.ALL_SEGMENTS self.module_name = module_name self.output_summary_table = None self.dependent_vartype = None self.independent_vartype = None - self.gpu_config = '$__madlib__$all_segments$__madlib__$' + self.gpu_config = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS) if self.output_table: 
self.output_summary_table = add_postfix(self.output_table, "_summary") @@ -269,7 +272,7 @@ class InputDataPreprocessorDL(object): if is_platform_pg(): # used later for writing summary table - self.distribution_rules = '$__madlib__$all_segments$__madlib__$' + self.distribution_rules = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS) # # For postgres, we just need 3 simple queries: @@ -320,14 +323,14 @@ class InputDataPreprocessorDL(object): # it's to be spread evenly across all segments, we still # need to do some extra work to ensure that happens. - if self.distribution_rules == 'all_segments': + if self.distribution_rules == DistributionRulesOptions.ALL_SEGMENTS: all_segments = True - self.distribution_rules = '$__madlib__$all_segments$__madlib__$' + self.distribution_rules = '$__madlib__${0}$__madlib__$'.format(DistributionRulesOptions.ALL_SEGMENTS) num_segments = get_seg_number() else: all_segments = False - if self.distribution_rules == 'gpu_segments': + if self.distribution_rules == DistributionRulesOptions.GPU_SEGMENTS: #TODO can we reuse the function `get_accessible_gpus_for_seg` from # madlib_keras_helper gpu_info_table = unique_string(desp='gpu_info') @@ -620,7 +623,7 @@ class InputDataPreprocessorDL(object): normalizing_const_colname=NORMALIZING_CONST_COLNAME, num_classes_colname=NUM_CLASSES_COLNAME, internal_gpu_config=INTERNAL_GPU_CONFIG, - distribution_rules=DISTRIBUTION_RULES, + distribution_rules=DISTRIBUTION_RULES_COLNAME, FLOAT32_SQL_TYPE=FLOAT32_SQL_TYPE) plpy.execute(query) diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in index 0df6772..dc8c837 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in @@ -33,7 +33,7 @@ from utilities.utilities import get_current_timestamp, get_seg_number, get_segme from utilities.control import 
SetGUC from madlib_keras_fit_multiple_model import FitMultipleModel from madlib_keras_helper import generate_row_string -from madlib_keras_helper import DISTRIBUTION_RULES +from madlib_keras_helper import DISTRIBUTION_RULES_COLNAME from madlib_keras_model_selection import MstSearch, ModelSelectionSchema from keras_model_arch_table import ModelArchSchema from utilities.validate_args import table_exists, drop_tables, input_tbl_valid @@ -706,7 +706,7 @@ class AutoMLHyperopt(KerasAutoML): :return: """ source_summary_table = add_postfix(self.source_table, '_summary') - dist_rules = plpy.execute("SELECT {0} from {1}".format(DISTRIBUTION_RULES, source_summary_table))[0][DISTRIBUTION_RULES] + dist_rules = plpy.execute("SELECT {0} from {1}".format(DISTRIBUTION_RULES_COLNAME, source_summary_table))[0][DISTRIBUTION_RULES_COLNAME] #TODO create constant for all_segments if dist_rules == "all_segments": return get_seg_number() @@ -734,9 +734,9 @@ class AutoMLHyperopt(KerasAutoML): self.algorithm = rand elif automl_params_dict[AutoMLConstants.ALGORITHM].lower() == 'tpe': self.algorithm = tpe + # TODO: Add support for atpe uncomment the below lines after atpe works # elif automl_params_dict[AutoMLSchema.ALGORITHM].lower() == 'atpe': # self.algorithm = atpe - # uncomment the above lines after atpe works # TODO else: plpy.error("{0}: valid algorithm 'automl_params' for hyperopt: 'rand', 'tpe'".format(self.module_name)) # , or 'atpe' else: diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in index 96c2817..be9a1f9 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_helper.py_in @@ -26,7 +26,6 @@ from utilities.validate_args import table_exists from madlib_keras_gpu_info import GPUInfoFunctions import plpy from math import isnan -# from madlib_keras_model_selection import ModelSelectionSchema 
############### Constants used in other deep learning files ######### # Name of columns in model summary table. @@ -54,7 +53,7 @@ SMALLINT_SQL_TYPE = 'SMALLINT' DEFAULT_NORMALIZING_CONST = 1.0 GP_SEGMENT_ID_COLNAME = "gp_segment_id" INTERNAL_GPU_CONFIG = '__internal_gpu_config__' -DISTRIBUTION_RULES = "distribution_rules" +DISTRIBUTION_RULES_COLNAME = "distribution_rules" ##################################################################### # Prepend a dimension to np arrays using expand_dims. diff --git a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in index 8b2157d..41e4c72 100644 --- a/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in +++ b/src/ports/postgres/modules/deep_learning/madlib_keras_validator.py_in @@ -18,6 +18,7 @@ # under the License. import plpy +from input_data_preprocessor import DistributionRulesOptions from keras_model_arch_table import ModelArchSchema from model_arch_info import get_num_classes from madlib_keras_custom_function import CustomFunctionSchema diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in index e69bab4..13bbfd1 100644 --- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in +++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras.py_in @@ -51,7 +51,8 @@ class MadlibKerasFitTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() @@ -691,7 +692,8 @@ class InternalKerasPredictTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() @@ 
-795,7 +797,8 @@ class MadlibKerasPredictBYOMTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() @@ -877,7 +880,8 @@ class MadlibKerasWrapperTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() @@ -1210,7 +1214,8 @@ class MadlibKerasFitCommonValidatorTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() @@ -1262,7 +1267,8 @@ class InputValidatorTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() @@ -1382,7 +1388,8 @@ class MadlibSerializerTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() @@ -1585,7 +1592,8 @@ class MadlibKerasEvaluationTestCase(unittest.TestCase): def setUp(self): self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() diff --git a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in index edb12c4..946dde3 100644 --- a/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in +++ b/src/ports/postgres/modules/deep_learning/test/unit_tests/test_madlib_keras_automl.py_in @@ -37,7 +37,8 @@ class 
HyperbandScheduleTestCase(unittest.TestCase): # tested here. They are tested in dev-check. self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock() @@ -206,15 +207,14 @@ class HyperbandScheduleTestCase(unittest.TestCase): def tearDown(self): self.module_patcher.stop() - - class AutoMLHyperoptTestCase(unittest.TestCase): def setUp(self): # The side effects of this class(writing to the output table) are not # tested here. They are tested in dev-check. self.plpy_mock = Mock(spec='error') patches = { - 'plpy': plpy + 'plpy': plpy, + 'utilities.mean_std_dev_calculator': Mock() } self.plpy_mock_execute = MagicMock()
