orhankislal commented on a change in pull request #518:
URL: https://github.com/apache/madlib/pull/518#discussion_r493674295
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -17,27 +17,41 @@
# specific language governing permissions and limitations
# under the License.
+from ast import literal_eval
from datetime import datetime
-import plpy
+from hyperopt import hp, rand, tpe, atpe, Trials, STATUS_OK, STATUS_RUNNING
+from hyperopt.base import Domain
import math
+import numpy as np
+import plpy
from time import time
from madlib_keras_validator import MstLoaderInputValidator
-from utilities.utilities import unique_string, add_postfix, extract_keyvalue_params, \
-    _assert, _assert_equal, rename_table
+from utilities.utilities import get_seg_number, get_segments_per_host, unique_string, add_postfix, \
+    extract_keyvalue_params, _assert, _assert_equal, rename_table
from utilities.control import MinWarning, SetGUC
from madlib_keras_fit_multiple_model import FitMultipleModel
from madlib_keras_model_selection import MstSearch, ModelSelectionSchema
from keras_model_arch_table import ModelArchSchema
from utilities.validate_args import table_exists, drop_tables
+import inspect
+
+def decallmethods(decorator, prefix='test_'):
Review comment:
We should use `_` in the function name. I wasn't sure if this was meant to be `de-call methods` or `dec all methods`. Also, a short explanation of what this function and the inner class are doing would be great.
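For reference, a rough sketch of what a renamed, documented version could look like. The name `decorate_all_methods` and the use of `inspect.isroutine` are only suggestions, not the PR's code:

```python
import inspect

def decorate_all_methods(decorator, prefix='test_'):
    """Class decorator factory: returns a decorator that applies `decorator`
    to every method of the class whose name starts with `prefix`."""
    def apply_decorator(cls):
        for name, method in inspect.getmembers(cls, inspect.isroutine):
            if name.startswith(prefix):
                setattr(cls, name, decorator(method))
        return cls
    return apply_decorator
```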
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -158,11 +174,10 @@ class HyperbandSchedule():
**locals())
plpy.execute(insert_query)
-@MinWarning("warning")
-class KerasAutoML():
- """The core AutoML function for running AutoML algorithms such as
Hyperband.
- This function executes the hyperband rounds 'diagonally' to evaluate
multiple configurations together
- and leverage the compute power of MPP databases such as Greenplum.
+# @MinWarning("warning")
Review comment:
Should we restore these commented-out lines?
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -291,9 +338,100 @@ class KerasAutoML():
(self.metrics_compute_frequency >= 1 and \
self.metrics_compute_frequency <= num_iterations)
+    def print_best_so_far(self):
+        """
+        Prints mst keys with best train/val losses at a given point.
+        """
+        best_so_far = '\n'
+        best_so_far += self.print_best_helper('training')
+        if self.validation_table:
+            best_so_far += self.print_best_helper('validation')
+        plpy.info(best_so_far)
+
+    def print_best_helper(self, keyword):
Review comment:
Same as before.
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -291,9 +338,100 @@ class KerasAutoML():
(self.metrics_compute_frequency >= 1 and \
self.metrics_compute_frequency <= num_iterations)
+    def print_best_so_far(self):
+        """
+        Prints mst keys with best train/val losses at a given point.
+        """
+        best_so_far = '\n'
+        best_so_far += self.print_best_helper('training')
+        if self.validation_table:
+            best_so_far += self.print_best_helper('validation')
+        plpy.info(best_so_far)
+
+    def print_best_helper(self, keyword):
+        """
+        Helper function to print mst keys with best train/val losses at a given point.
+        :param keyword: column prefix ('training' or 'validation')
+        :return:
+        """
+        metrics_word, loss_word = keyword + '_metrics_final', keyword + '_loss_final'
+
+        res_str = 'Best {keyword} loss so far:\n'.format(keyword=keyword)
+        best_value = plpy.execute("SELECT {ModelSelectionSchema.MST_KEY}, {metrics_word}, " \
+                                  "{loss_word} FROM {self.model_info_table} ORDER BY " \
+                                  "{loss_word} LIMIT 1".format(self=self, ModelSelectionSchema=ModelSelectionSchema,
+                                                               metrics_word=metrics_word, loss_word=loss_word))[0]
+        mst_key_value, metric_value, loss_value = best_value[ModelSelectionSchema.MST_KEY], \
+                                                  best_value[metrics_word], best_value[loss_word]
+        res_str += ModelSelectionSchema.MST_KEY + '=' + str(mst_key_value) + ': metric=' + str(metric_value) + \
+                   ', loss=' + str(loss_value) + '\n'
+        return res_str
+
+    def get_current_timestamp(self):
Review comment:
Should be in utilities
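If it moves to utilities, a minimal version could be as simple as the sketch below (placement and signature are only a suggestion; `datetime.now()` is equivalent to the `datetime.fromtimestamp(time())` used here):

```python
from datetime import datetime

def get_current_timestamp(fmt='%Y-%m-%d %H:%M:%S'):
    """Current time, formatted for start/end times in the output summary table."""
    return datetime.now().strftime(fmt)
```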
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -291,9 +338,100 @@ class KerasAutoML():
(self.metrics_compute_frequency >= 1 and \
self.metrics_compute_frequency <= num_iterations)
+    def print_best_so_far(self):
Review comment:
Generic name, `print_best_mst_so_far` would be better.
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -500,63 +648,432 @@ class KerasAutoML():
"b (key integer, s_val integer, i_val integer) WHERE
t.mst_key=b.key".format(self=self, l=l)
plpy.execute(query)
- def update_model_selection_table(self):
+# @MinWarning("warning")
+class AutoMLHyperopt(KerasAutoML):
+ """
+ This class implements Hyperopt, another automl method that explores
awkward search spaces using
+ Random Search, Tree-structured Parzen Estimator (TPE), or Adaptive TPE.
+
+ This function executes hyperopt on top of our multiple model training
infrastructure powered with
+ Model hOpper Parallelism (MOP), a hybrid of data and task parallelism.
+
+ This automl method inherits qualities from the automl class.
+ """
+ def __init__(self, schema_madlib, source_table, model_output_table,
model_arch_table, model_selection_table,
+ model_id_list, compile_params_grid, fit_params_grid,
automl_method='hyperopt',
+ automl_params='num_models=20, num_iters=5, algorithm=tpe',
random_state=None, object_table=None,
+ use_gpus=False, validation_table=None,
metrics_compute_frequency=None,
+ name=None, description=None, **kwargs):
+ KerasAutoML.__init__(self, schema_madlib, source_table,
model_output_table, model_arch_table,
+ model_selection_table, model_id_list,
compile_params_grid, fit_params_grid,
+ automl_method, automl_params, random_state,
object_table, use_gpus,
+ validation_table, metrics_compute_frequency,
name, description, **kwargs)
+ self.compile_params_grid = self.compile_params_grid.replace('\n',
'').replace(' ', '')
+ self.fit_params_grid = self.fit_params_grid.replace('\n',
'').replace(' ', '')
+ try:
+ self.compile_params_grid = literal_eval(self.compile_params_grid)
+
+ except:
+ plpy.error("Invalid syntax in 'compile_params_dict'")
+ try:
+ self.fit_params_grid = literal_eval(self.fit_params_grid)
+ except:
+ plpy.error("Invalid syntax in 'fit_params_dict'")
+ self.validate_and_define_inputs()
+
+ self.num_workers = get_seg_number() * get_segments_per_host()
+
+ self.create_model_output_table()
+ self.create_model_output_info_table()
+ self.find_hyperopt_config()
+
+ def validate_and_define_inputs(self):
+ automl_params_dict = extract_keyvalue_params(self.automl_params,
+
default_values={'num_models': 20,
+
'num_iters': 5,
+
'algorithm': 'tpe'},
+ lower_case_names=True)
+ # casting relevant values to int
+ for i in automl_params_dict:
+ try:
+ automl_params_dict[i] = int(automl_params_dict[i])
+ except ValueError:
+ pass
+ _assert(len(automl_params_dict) >= 1 and len(automl_params_dict) <= 3,
+ "DL: Only num_models, num_iters, and algorithm may be
specified")
+ for i in automl_params_dict:
+ if i == AutoMLSchema.NUM_MODELS:
+ self.num_models = automl_params_dict[AutoMLSchema.NUM_MODELS]
+ elif i == AutoMLSchema.NUM_ITERS:
+ self.num_iters = automl_params_dict[AutoMLSchema.NUM_ITERS]
+ elif i == AutoMLSchema.ALGORITHM:
+ if automl_params_dict[AutoMLSchema.ALGORITHM].lower() ==
'rand':
+ self.algorithm = rand
+ elif automl_params_dict[AutoMLSchema.ALGORITHM].lower() ==
'tpe':
+ self.algorithm = tpe
+ # elif automl_params_dict[AutoMLSchema.ALGORITHM].lower() ==
'atpe':
+ # self.algorithm = atpe
+ # uncomment the above lines after atpe works
Review comment:
Add a TODO here to draw attention.
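For context, this is roughly how the chosen suggest algorithm ends up being used by hyperopt. A self-contained sketch; the `objective` function and search space are placeholders, not MADlib code:

```python
from hyperopt import hp, fmin, rand, tpe, Trials

def objective(params):
    # placeholder for one round of model training; hyperopt minimizes the return value
    return (params['lr'] - 0.01) ** 2

space = {'lr': hp.loguniform('lr', -7, 0)}
algorithms = {'rand': rand.suggest, 'tpe': tpe.suggest}  # 'atpe' would map to atpe.suggest once it works

trials = Trials()
best = fmin(objective, space, algo=algorithms['tpe'], max_evals=20, trials=trials)
```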
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -223,10 +231,13 @@ class KerasAutoML():
{ModelArchSchema.MODEL_ARCH} JSON)
""".format(self=self,
ModelSelectionSchema=ModelSelectionSchema,
ModelArchSchema=ModelArchSchema)
- with MinWarning('warning'):
- plpy.execute(output_table_create_query)
+ # with MinWarning('warning'):
Review comment:
Same as before.
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -245,40 +256,76 @@ class KerasAutoML():
validation_metrics_final DOUBLE PRECISION,
validation_loss_final DOUBLE PRECISION,
validation_metrics DOUBLE PRECISION[],
-                    validation_loss DOUBLE PRECISION[],
-                    {AutoMLSchema.METRICS_ITERS} INTEGER[])
-               """.format(self=self, ModelSelectionSchema=ModelSelectionSchema,
-                          ModelArchSchema=ModelArchSchema, AutoMLSchema=AutoMLSchema)
-        with MinWarning('warning'):
-            plpy.execute(info_table_create_query)
+                    validation_loss DOUBLE PRECISION[]
+               """.format(self=self,
+                          ModelSelectionSchema=ModelSelectionSchema,
+                          ModelArchSchema=ModelArchSchema,
+                          AutoMLSchema=AutoMLSchema) + additional_cols + ")"
+        # with MinWarning('warning'):
+        plpy.execute(info_table_create_query)

-    def validate_and_define_inputs(self):
+    def update_model_selection_table(self):
+        """
+        Drops and re-creates the mst table to include only the best performing model configuration.
+        """
+        drop_tables([self.model_selection_table])

-        if AutoMLSchema.HYPERBAND.startswith(self.automl_method.lower()):
-            automl_params_dict = extract_keyvalue_params(self.automl_params,
-                                                         default_values={'R': 6, 'eta': 3, 'skip_last': 0},
-                                                         lower_case_names=False)
-            # casting dict values to int
-            for i in automl_params_dict:
-                automl_params_dict[i] = int(automl_params_dict[i])
-            _assert(len(automl_params_dict) >= 1 or len(automl_params_dict) <= 3,
-                    "DL: Only R, eta, and skip_last may be specified")
-            for i in automl_params_dict:
-                if i == AutoMLSchema.R:
-                    self.R = automl_params_dict[AutoMLSchema.R]
-                elif i == AutoMLSchema.ETA:
-                    self.eta = automl_params_dict[AutoMLSchema.ETA]
-                elif i == AutoMLSchema.SKIP_LAST:
-                    self.skip_last = automl_params_dict[AutoMLSchema.SKIP_LAST]
-                else:
-                    plpy.error("DL: {0} is an invalid param".format(i))
-            _assert(self.eta > 1, "DL: eta must be greater than 1")
-            _assert(self.R >= self.eta, "DL: R should not be less than eta")
-            self.s_max = int(math.floor(math.log(self.R, self.eta)))
-            _assert(self.skip_last >= 0 and self.skip_last < self.s_max+1, "DL: skip_last must be " +
-                    "non-negative and less than {0}".format(self.s_max))
-        else:
-            plpy.error("DL: Only hyperband is currently supported as the automl method")
+        # only retaining best performing config
+        plpy.execute("CREATE TABLE {self.model_selection_table} AS SELECT {ModelSelectionSchema.MST_KEY}, " \
+                     "{ModelSelectionSchema.MODEL_ID}, {ModelSelectionSchema.COMPILE_PARAMS}, " \
+                     "{ModelSelectionSchema.FIT_PARAMS} FROM {self.model_info_table} " \
+                     "ORDER BY {AutoMLSchema.LOSS_METRIC} LIMIT 1".format(self=self,
+                                                                          AutoMLSchema=AutoMLSchema,
+                                                                          ModelSelectionSchema=ModelSelectionSchema))
+
+    def generate_model_output_summary_table(self, model_training):
+        """
+        Creates and populates static values related to the AutoML workload.
+        :param model_training: Fit Multiple function call object.
+        """
+        create_query = plpy.prepare("""
+                CREATE TABLE {self.model_summary_table} AS
+                SELECT
+                    $MAD${self.source_table}$MAD$::TEXT AS source_table,
+                    $MAD${self.validation_table}$MAD$::TEXT AS validation_table,
+                    $MAD${self.model_output_table}$MAD$::TEXT AS model,
+                    $MAD${self.model_info_table}$MAD$::TEXT AS model_info,
+                    (SELECT dependent_varname FROM {model_training.model_summary_table}) AS dependent_varname,
+                    (SELECT independent_varname FROM {model_training.model_summary_table}) AS independent_varname,
+                    $MAD${self.model_arch_table}$MAD$::TEXT AS model_arch_table,
+                    $MAD${self.model_selection_table}$MAD$::TEXT AS model_selection_table,
+                    $MAD${self.automl_method}$MAD$::TEXT AS automl_method,
+                    $MAD${self.automl_params}$MAD$::TEXT AS automl_params,
+                    $MAD${self.random_state}$MAD$::TEXT AS random_state,
+                    $MAD${self.object_table}$MAD$::TEXT AS object_table,
+                    {self.use_gpus} AS use_gpus,
+                    (SELECT metrics_compute_frequency FROM {model_training.model_summary_table})::INTEGER
+                        AS metrics_compute_frequency,
+                    $MAD${self.name}$MAD$::TEXT AS name,
+                    $MAD${self.description}$MAD$::TEXT AS description,
+                    '{self.start_training_time}'::TIMESTAMP AS start_training_time,
+                    '{self.end_training_time}'::TIMESTAMP AS end_training_time,
+                    (SELECT madlib_version FROM {model_training.model_summary_table}) AS madlib_version,
+                    (SELECT num_classes FROM {model_training.model_summary_table})::INTEGER AS num_classes,
+                    (SELECT class_values FROM {model_training.model_summary_table}) AS class_values,
+                    (SELECT dependent_vartype FROM {model_training.model_summary_table}) AS dependent_vartype,
+                    (SELECT normalizing_const FROM {model_training.model_summary_table}) AS normalizing_const
+            """.format(self=self, model_training=model_training))
+
+        # with MinWarning('warning'):
+        plpy.execute(create_query)
+
+    def is_method(self, method_name):
Review comment:
`is_method` is a very generic function name, `is_automl_method` would be
better.
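Something along these lines; the body here is illustrative only, mirroring the prefix matching used elsewhere in this file:

```python
def is_automl_method(self, method_name):
    """True if the user-supplied automl_method (possibly abbreviated)
    matches `method_name`, e.g. 'hyperband' or 'hyperopt'."""
    return method_name.lower().startswith(self.automl_method.lower())
```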
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -291,9 +338,100 @@ class KerasAutoML():
(self.metrics_compute_frequency >= 1 and \
self.metrics_compute_frequency <= num_iterations)
+    def print_best_so_far(self):
+        """
+        Prints mst keys with best train/val losses at a given point.
+        """
+        best_so_far = '\n'
+        best_so_far += self.print_best_helper('training')
+        if self.validation_table:
+            best_so_far += self.print_best_helper('validation')
+        plpy.info(best_so_far)
+
+    def print_best_helper(self, keyword):
+        """
+        Helper function to print mst keys with best train/val losses at a given point.
+        :param keyword: column prefix ('training' or 'validation')
+        :return:
+        """
+        metrics_word, loss_word = keyword + '_metrics_final', keyword + '_loss_final'
+
+        res_str = 'Best {keyword} loss so far:\n'.format(keyword=keyword)
+        best_value = plpy.execute("SELECT {ModelSelectionSchema.MST_KEY}, {metrics_word}, " \
+                                  "{loss_word} FROM {self.model_info_table} ORDER BY " \
+                                  "{loss_word} LIMIT 1".format(self=self, ModelSelectionSchema=ModelSelectionSchema,
+                                                               metrics_word=metrics_word, loss_word=loss_word))[0]
+        mst_key_value, metric_value, loss_value = best_value[ModelSelectionSchema.MST_KEY], \
+                                                  best_value[metrics_word], best_value[loss_word]
+        res_str += ModelSelectionSchema.MST_KEY + '=' + str(mst_key_value) + ': metric=' + str(metric_value) + \
+                   ', loss=' + str(loss_value) + '\n'
+        return res_str
+
+    def get_current_timestamp(self):
+        """For start and end times for the chosen AutoML algorithm. Showcased in the output summary table."""
+        return datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
+
+    def remove_temp_tables(self, model_training):
Review comment:
Generic name, a similar function already exists in utilities (cleanup_madlib_temp_tables, utilities.sql_in line 101).
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -404,24 +544,30 @@ class KerasAutoML():
model_id=ModelSelectionSchema.MODEL_ID,
compile_params=ModelSelectionSchema.COMPILE_PARAMS,
fit_params=ModelSelectionSchema.FIT_PARAMS)
- with MinWarning('warning'):
- plpy.execute(create_query)
+ # with MinWarning('warning'):
Review comment:
Same as before
##########
File path: src/ports/postgres/modules/deep_learning/madlib_keras_automl.py_in
##########
@@ -291,9 +338,100 @@ class KerasAutoML():
(self.metrics_compute_frequency >= 1 and \
self.metrics_compute_frequency <= num_iterations)
+    def print_best_so_far(self):
+        """
+        Prints mst keys with best train/val losses at a given point.
+        """
+        best_so_far = '\n'
+        best_so_far += self.print_best_helper('training')
+        if self.validation_table:
+            best_so_far += self.print_best_helper('validation')
+        plpy.info(best_so_far)
+
+    def print_best_helper(self, keyword):
+        """
+        Helper function to print mst keys with best train/val losses at a given point.
+        :param keyword: column prefix ('training' or 'validation')
+        :return:
+        """
+        metrics_word, loss_word = keyword + '_metrics_final', keyword + '_loss_final'
+
+        res_str = 'Best {keyword} loss so far:\n'.format(keyword=keyword)
+        best_value = plpy.execute("SELECT {ModelSelectionSchema.MST_KEY}, {metrics_word}, " \
+                                  "{loss_word} FROM {self.model_info_table} ORDER BY " \
+                                  "{loss_word} LIMIT 1".format(self=self, ModelSelectionSchema=ModelSelectionSchema,
+                                                               metrics_word=metrics_word, loss_word=loss_word))[0]
+        mst_key_value, metric_value, loss_value = best_value[ModelSelectionSchema.MST_KEY], \
+                                                  best_value[metrics_word], best_value[loss_word]
+        res_str += ModelSelectionSchema.MST_KEY + '=' + str(mst_key_value) + ': metric=' + str(metric_value) + \
+                   ', loss=' + str(loss_value) + '\n'
+        return res_str
+
+    def get_current_timestamp(self):
+        """For start and end times for the chosen AutoML algorithm. Showcased in the output summary table."""
+        return datetime.fromtimestamp(time()).strftime('%Y-%m-%d %H:%M:%S')
+
+    def remove_temp_tables(self, model_training):
+        """
+        Remove all intermediate tables created for AutoML runs/updates.
+        :param model_training: Fit Multiple function call object.
+        """
+        drop_tables([model_training.original_model_output_table, model_training.model_info_table,
+                     model_training.model_summary_table, AutoMLSchema.TEMP_MST_TABLE,
+                     AutoMLSchema.TEMP_MST_SUMMARY_TABLE])
+
+# @MinWarning("warning")
+class AutoMLHyperband(KerasAutoML):
+    """
+    This class implements Hyperband, an infinite-arm bandit based algorithm that speeds up random search
+    through adaptive resource allocation, successive halving (SHA) and early stopping.
+
+    This class showcases and novel hyperband implementation by executing the hyperband rounds 'diagonally'
Review comment:
Possible typo, `and` -> `a`?
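Also, since the docstring references SHA brackets: for readers unfamiliar with Hyperband, here is a self-contained sketch of the standard bracket arithmetic implied by the R/eta/skip_last validation earlier in this file (not the PR's implementation):

```python
import math

def hyperband_brackets(R, eta=3, skip_last=0):
    """Yield the (num_configs, num_iterations) rounds of each SHA bracket."""
    s_max = int(math.floor(math.log(R, eta)))
    for s in range(s_max, skip_last - 1, -1):
        n = int(math.ceil((s_max + 1) * eta ** s / (s + 1.0)))  # initial configs in bracket s
        r = R * eta ** (-s)                                     # initial iterations per config
        yield [(int(n * eta ** -i), int(r * eta ** i)) for i in range(s + 1)]

# R=6, eta=3 (the defaults above) gives brackets [(3, 2), (1, 6)] and [(2, 6)]
for bracket in hyperband_brackets(R=6, eta=3):
    print(bracket)
```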
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]