EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/406067 )
Change subject: Pull make_cv_objective outside tuner ...................................................................... Pull make_cv_objective outside tuner This really had no business in tuner; its function is independent and it didn't require any of the state. Adds a test that verifies the function works roughly as expected. Also drop the 'condition' argument from tuner stages. A standard if condition should be used when building the stage list. Change-Id: Ic3dff6a1a055cba3fc57debd4a1e3417476ddd4a --- M mjolnir/test/training/test_tuning.py M mjolnir/training/tuning.py M mjolnir/training/xgboost.py M mjolnir/utils.py 4 files changed, 72 insertions(+), 63 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/67/406067/1 diff --git a/mjolnir/test/training/test_tuning.py b/mjolnir/test/training/test_tuning.py index 22402f1..15389d7 100644 --- a/mjolnir/test/training/test_tuning.py +++ b/mjolnir/test/training/test_tuning.py @@ -46,7 +46,7 @@ } tuner = mjolnir.training.tuning.ModelSelection(initial_space, tune_stages) - train_func = tuner.make_cv_objective(f, folds, num_cv_jobs, **kwargs) + train_func = mjolnir.training.tuning.make_cv_objective(f, folds, num_cv_jobs, **kwargs) trials_pool = tuner.build_pool(folds, num_cv_jobs) result = tuner(train_func, trials_pool) return result, stats['called'] @@ -80,39 +80,14 @@ assert result['params']['baz'] == 0 -def test_ModelSelection_stage_condition(): - num_iterations = 3 - result, called = run_model_selection([ - ('a', { - 'condition': lambda: False, - 'iterations': num_iterations, - 'space': { - 'foo': hyperopt.hp.uniform('foo', 1, 9), - } - }), - ('b', { - 'iterations': num_iterations, - 'space': { - 'bar': hyperopt.hp.uniform('bar', 1, 9), - } - }), - ]) - # iterations * folds - assert called == num_iterations * 2 - assert result['params']['foo'] == 10 - assert 1 <= result['params']['bar'] <= 9 - assert result['params']['baz'] == 0 - - def test_ModelSelection_kwargs_pass_thru(): - tuner = 
mjolnir.training.tuning.ModelSelection(None, None) expected_kwargs = {'hi': 5, 'there': 'test'} def f(fold, params, **kwargs): assert kwargs == expected_kwargs return {'test': [fold[0]], 'train': [fold[0]]} - obj = tuner.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs) + obj = mjolnir.training.tuning.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs) res = obj(None) assert res == [ @@ -144,3 +119,23 @@ folds = [[1] * num_workers for i in range(num_folds)] pool = tuner.build_pool(folds, num_cv_jobs) assert (pool is not None) == expect_pool + + +def test_ModelSelection_transformer(): + stats = {'called': 0} + + def transformer(result, params): + assert 'foo' in result + assert result['foo'] == 'bar' + assert params == 'some params' + stats['called'] += 1 + return 'baz' + + def f(fold, params): + assert params == 'some params' + return {'foo': 'bar'} + + folds = [[1, 2, 3], [4, 5, 6]] + obj = mjolnir.training.tuning.make_cv_objective(f, folds, 1, transformer) + assert obj('some params') == ['baz', 'baz'] + assert stats['called'] == 2 diff --git a/mjolnir/training/tuning.py b/mjolnir/training/tuning.py index 7d2df68..81bfafe 100644 --- a/mjolnir/training/tuning.py +++ b/mjolnir/training/tuning.py @@ -133,11 +133,48 @@ return with_retry +def make_cv_objective(train_func, folds, num_cv_jobs, transformer=None, **kwargs): + """Create a cross-validation objective function + + Parameters + ---------- + train_func : callable + Function accepting a fold and hyperparameters to perform training + num_cv_jobs : int + The total number of folds to train in parallel + transformer : callable or None, optional + Function accepting output of train_func and hyperparameters to + return stats about the individual fold train/test performance + + Returns + ------- + callable + Accepts a set of hyperparameters as only argument and returns + list of per-fold train/test performance. 
+ """ + train_func = _py4j_retry(train_func, None) + if num_cv_jobs > 1: + cv_pool = Pool(num_cv_jobs) + cv_mapper = cv_pool.map + else: + cv_mapper = map + + def f(params): + def inner(fold): + return train_func(fold, params, **kwargs) + + return cv_mapper(inner, folds) + + if transformer is None: + return f + else: + return lambda params: [transformer(scores, params) for scores in f(params)] + + class ModelSelection(object): - def __init__(self, initial_space, tune_stages, transformer=None): + def __init__(self, initial_space, tune_stages): self.initial_space = initial_space self.tune_stages = tune_stages - self.transformer = transformer def build_pool(self, folds, num_cv_jobs): num_folds = len(folds) @@ -148,31 +185,7 @@ else: return None - def make_cv_objective(self, train_func, folds, num_cv_jobs, **kwargs): - train_func = _py4j_retry(train_func, None) - if num_cv_jobs > 1: - cv_pool = Pool(num_cv_jobs) - cv_mapper = cv_pool.map - else: - cv_mapper = map - - def f(params): - def inner(fold): - return train_func(fold, params, **kwargs) - - return cv_mapper(inner, folds) - - if not self.transformer: - return f - - def g(params): - return [self.transformer(scores, params) for scores in f(params)] - - return g - def eval_stage(self, train_func, stage, space, pool): - if 'condition' in stage and not stage['condition'](): - return space, None # Override current space with new space merged = dict(space, **stage['space']) best, trials = mjolnir.training.hyperopt.maximize( @@ -190,8 +203,7 @@ stages = [] for stage_name, stage in self.tune_stages: space, trials = self.eval_stage(train_func, stage, space, pool) - if trials is not None: - stages.append((stage_name, trials)) + stages.append((stage_name, trials)) trials_final = stages[-1][1] best_trial = np.argmin(trials_final.losses()) diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py index 1a23a66..abeaabf 100644 --- a/mjolnir/training/xgboost.py +++ b/mjolnir/training/xgboost.py @@ -2,7 +2,7 @@ 
import hyperopt import mjolnir.spark import mjolnir.training.hyperopt -from mjolnir.training.tuning import ModelSelection +from mjolnir.training.tuning import make_cv_objective, ModelSelection import numpy as np import pyspark import pyspark.sql @@ -410,16 +410,17 @@ 'colsample_bytree': hyperopt.hp.quniform('colsample_bytree', 0.8, 1, .01), } }[dataset_size] - }), - ('trees', { + }) + ] + + if final_num_trees is not None and final_num_trees != initial_num_trees: + tune_spaces.append(('trees', { 'iterations': 30, - 'condition': lambda: final_num_trees is not None and final_num_trees != initial_num_trees, 'space': { 'num_rounds': final_num_trees, 'eta': hyperopt.hp.uniform('eta', 0.1, 0.4), } - }) - ] + })) # Baseline parameters to start with. Roughly tuned by what has worked in # the past. These vary though depending on number of training samples. These @@ -441,7 +442,7 @@ 'colsample_bytree': 0.8, } - tuner = ModelSelection(space, tune_spaces, cv_transformer) - train_func = tuner.make_cv_objective(train, folds, num_cv_jobs, train_matrix=train_matrix) + tuner = ModelSelection(space, tune_spaces) + train_func = make_cv_objective(train, folds, num_cv_jobs, cv_transformer, train_matrix=train_matrix) trials_pool = tuner.build_pool(folds, num_cv_jobs) return tuner(train_func, trials_pool) diff --git a/mjolnir/utils.py b/mjolnir/utils.py index 7ac6422..f64f2bd 100644 --- a/mjolnir/utils.py +++ b/mjolnir/utils.py @@ -63,6 +63,7 @@ else: # TODO: Untested with tempfile.NamedTemporaryFile() as local: + os.unlink(local.name) subprocess.check_call(['hdfs', 'dfs', '-copyToLocal', path, local.name]) if with_query: try: -- To view, visit https://gerrit.wikimedia.org/r/406067 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ic3dff6a1a055cba3fc57debd4a1e3417476ddd4a Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> 
_______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits