EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/406067 )

Change subject: Pull make_cv_objective outside tuner
......................................................................

Pull make_cv_objective outside tuner

This really had no business in tuner; its function is
independent and it didn't require any of the state. Adds
a test that verifies the function works roughly as expected.

Also drop the 'condition' argument from tuner stages. A standard
if condition should be used when building the stage list.

Change-Id: Ic3dff6a1a055cba3fc57debd4a1e3417476ddd4a
---
M mjolnir/test/training/test_tuning.py
M mjolnir/training/tuning.py
M mjolnir/training/xgboost.py
M mjolnir/utils.py
4 files changed, 72 insertions(+), 63 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/67/406067/1

diff --git a/mjolnir/test/training/test_tuning.py 
b/mjolnir/test/training/test_tuning.py
index 22402f1..15389d7 100644
--- a/mjolnir/test/training/test_tuning.py
+++ b/mjolnir/test/training/test_tuning.py
@@ -46,7 +46,7 @@
             }
 
     tuner = mjolnir.training.tuning.ModelSelection(initial_space, tune_stages)
-    train_func = tuner.make_cv_objective(f, folds, num_cv_jobs, **kwargs)
+    train_func = mjolnir.training.tuning.make_cv_objective(f, folds, 
num_cv_jobs, **kwargs)
     trials_pool = tuner.build_pool(folds, num_cv_jobs)
     result = tuner(train_func, trials_pool)
     return result, stats['called']
@@ -80,39 +80,14 @@
     assert result['params']['baz'] == 0
 
 
-def test_ModelSelection_stage_condition():
-    num_iterations = 3
-    result, called = run_model_selection([
-        ('a', {
-            'condition': lambda: False,
-            'iterations': num_iterations,
-            'space': {
-                'foo': hyperopt.hp.uniform('foo', 1, 9),
-            }
-        }),
-        ('b', {
-            'iterations': num_iterations,
-            'space': {
-                'bar': hyperopt.hp.uniform('bar', 1, 9),
-            }
-        }),
-    ])
-    # iterations * folds
-    assert called == num_iterations * 2
-    assert result['params']['foo'] == 10
-    assert 1 <= result['params']['bar'] <= 9
-    assert result['params']['baz'] == 0
-
-
 def test_ModelSelection_kwargs_pass_thru():
-    tuner = mjolnir.training.tuning.ModelSelection(None, None)
     expected_kwargs = {'hi': 5, 'there': 'test'}
 
     def f(fold, params, **kwargs):
         assert kwargs == expected_kwargs
         return {'test': [fold[0]], 'train': [fold[0]]}
 
-    obj = tuner.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs)
+    obj = mjolnir.training.tuning.make_cv_objective(f, [[1], [2]], 1, 
**expected_kwargs)
 
     res = obj(None)
     assert res == [
@@ -144,3 +119,23 @@
     folds = [[1] * num_workers for i in range(num_folds)]
     pool = tuner.build_pool(folds, num_cv_jobs)
     assert (pool is not None) == expect_pool
+
+
+def test_ModelSelection_transformer():
+    stats = {'called': 0}
+
+    def transformer(result, params):
+        assert 'foo' in result
+        assert result['foo'] == 'bar'
+        assert params == 'some params'
+        stats['called'] += 1
+        return 'baz'
+
+    def f(fold, params):
+        assert params == 'some params'
+        return {'foo': 'bar'}
+
+    folds = [[1, 2, 3], [4, 5, 6]]
+    obj = mjolnir.training.tuning.make_cv_objective(f, folds, 1, transformer)
+    assert obj('some params') == ['baz', 'baz']
+    assert stats['called'] == 2
diff --git a/mjolnir/training/tuning.py b/mjolnir/training/tuning.py
index 7d2df68..81bfafe 100644
--- a/mjolnir/training/tuning.py
+++ b/mjolnir/training/tuning.py
@@ -133,11 +133,48 @@
     return with_retry
 
 
+def make_cv_objective(train_func, folds, num_cv_jobs, transformer=None, 
**kwargs):
+    """Create a cross-validation objective function
+
+    Parameters
+    ----------
+    train_func : callable
+        Function accepting a fold and hyperparameters to perform training
+    num_cv_jobs : int
+        The total number of folds to train in parallel
+    transformer : callable or None, optional
+        Function accepting output of train_func and hyperparameters to
+        return stats about the individual fold train/test performance
+
+    Returns
+    -------
+    callable
+        Accepts a set of hyperparameters as only argument and returns
+        list of per-fold train/test performance.
+    """
+    train_func = _py4j_retry(train_func, None)
+    if num_cv_jobs > 1:
+        cv_pool = Pool(num_cv_jobs)
+        cv_mapper = cv_pool.map
+    else:
+        cv_mapper = map
+
+    def f(params):
+        def inner(fold):
+            return train_func(fold, params, **kwargs)
+
+        return cv_mapper(inner, folds)
+
+    if transformer is None:
+        return f
+    else:
+        return lambda params: [transformer(scores, params) for scores in 
f(params)]
+
+
 class ModelSelection(object):
-    def __init__(self, initial_space, tune_stages, transformer=None):
+    def __init__(self, initial_space, tune_stages):
         self.initial_space = initial_space
         self.tune_stages = tune_stages
-        self.transformer = transformer
 
     def build_pool(self, folds, num_cv_jobs):
         num_folds = len(folds)
@@ -148,31 +185,7 @@
         else:
             return None
 
-    def make_cv_objective(self, train_func, folds, num_cv_jobs, **kwargs):
-        train_func = _py4j_retry(train_func, None)
-        if num_cv_jobs > 1:
-            cv_pool = Pool(num_cv_jobs)
-            cv_mapper = cv_pool.map
-        else:
-            cv_mapper = map
-
-        def f(params):
-            def inner(fold):
-                return train_func(fold, params, **kwargs)
-
-            return cv_mapper(inner, folds)
-
-        if not self.transformer:
-            return f
-
-        def g(params):
-            return [self.transformer(scores, params) for scores in f(params)]
-
-        return g
-
     def eval_stage(self, train_func, stage, space, pool):
-        if 'condition' in stage and not stage['condition']():
-            return space, None
         # Override current space with new space
         merged = dict(space, **stage['space'])
         best, trials = mjolnir.training.hyperopt.maximize(
@@ -190,8 +203,7 @@
         stages = []
         for stage_name, stage in self.tune_stages:
             space, trials = self.eval_stage(train_func, stage, space, pool)
-            if trials is not None:
-                stages.append((stage_name, trials))
+            stages.append((stage_name, trials))
 
         trials_final = stages[-1][1]
         best_trial = np.argmin(trials_final.losses())
diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py
index 1a23a66..abeaabf 100644
--- a/mjolnir/training/xgboost.py
+++ b/mjolnir/training/xgboost.py
@@ -2,7 +2,7 @@
 import hyperopt
 import mjolnir.spark
 import mjolnir.training.hyperopt
-from mjolnir.training.tuning import ModelSelection
+from mjolnir.training.tuning import make_cv_objective, ModelSelection
 import numpy as np
 import pyspark
 import pyspark.sql
@@ -410,16 +410,17 @@
                     'colsample_bytree': 
hyperopt.hp.quniform('colsample_bytree', 0.8, 1, .01),
                 }
             }[dataset_size]
-        }),
-        ('trees', {
+        })
+    ]
+
+    if final_num_trees is not None and final_num_trees != initial_num_trees:
+        tune_spaces.append(('trees', {
             'iterations': 30,
-            'condition': lambda: final_num_trees is not None and 
final_num_trees != initial_num_trees,
             'space': {
                 'num_rounds': final_num_trees,
                 'eta': hyperopt.hp.uniform('eta', 0.1, 0.4),
             }
-        })
-    ]
+        }))
 
     # Baseline parameters to start with. Roughly tuned by what has worked in
     # the past. These vary though depending on number of training samples. 
These
@@ -441,7 +442,7 @@
         'colsample_bytree': 0.8,
     }
 
-    tuner = ModelSelection(space, tune_spaces, cv_transformer)
-    train_func = tuner.make_cv_objective(train, folds, num_cv_jobs, 
train_matrix=train_matrix)
+    tuner = ModelSelection(space, tune_spaces)
+    train_func = make_cv_objective(train, folds, num_cv_jobs, cv_transformer, 
train_matrix=train_matrix)
     trials_pool = tuner.build_pool(folds, num_cv_jobs)
     return tuner(train_func, trials_pool)
diff --git a/mjolnir/utils.py b/mjolnir/utils.py
index 7ac6422..f64f2bd 100644
--- a/mjolnir/utils.py
+++ b/mjolnir/utils.py
@@ -63,6 +63,7 @@
     else:
         # TODO: Untested
         with tempfile.NamedTemporaryFile() as local:
+            os.unlink(local.name)
             subprocess.check_call(['hdfs', 'dfs', '-copyToLocal', path, 
local.name])
             if with_query:
                 try:

-- 
To view, visit https://gerrit.wikimedia.org/r/406067
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic3dff6a1a055cba3fc57debd4a1e3417476ddd4a
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to