EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/377343 )

Change subject: Allow selecting exact number of trees for training
......................................................................

Allow selecting exact number of trees for training

The idea of basing the number of trees on node evaluations, rather
than some explicit number of trees, seems too abstract. Let's just
allow explicitly setting the exact number of trees to use.
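
For example, a run that tunes cheaply but trains a larger final
ensemble could look like this (all other required arguments elided,
values purely illustrative):

    ... --initial-trees 100 --final-trees 500

Passing only --initial-trees keeps that same count for the final model.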

Change-Id: Ie5a50865244af91e082d8acb04596d12dfbe3f0a
---
M mjolnir/cli/training_pipeline.py
M mjolnir/training/xgboost.py
2 files changed, 28 insertions(+), 28 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/43/377343/1

diff --git a/mjolnir/cli/training_pipeline.py b/mjolnir/cli/training_pipeline.py
index 32d25cd..7425e87 100644
--- a/mjolnir/cli/training_pipeline.py
+++ b/mjolnir/cli/training_pipeline.py
@@ -20,7 +20,7 @@
 from pyspark.sql import functions as F
 
 
-def main(sc, sqlContext, input_dir, output_dir, wikis, target_node_evaluations,
+def main(sc, sqlContext, input_dir, output_dir, wikis, initial_num_trees, final_num_trees,
          num_workers, num_cv_jobs, num_folds, test_dir, zero_features):
 
     if os.path.exists(output_dir):
@@ -48,12 +48,11 @@
             df_hits_with_features = mjolnir.feature_engineering.zero_features(
                     df_hits_with_features, zero_features)
 
-        # Explore a hyperparameter space. Skip the most expensive part of tuning,
-        # increasing the # of trees, with target_node_evaluations=None
         tune_results = mjolnir.training.xgboost.tune(
             df_hits_with_features, num_folds=num_folds,
             num_cv_jobs=num_cv_jobs, num_workers=num_workers,
-            target_node_evaluations=target_node_evaluations)
+            initial_num_trees=initial_num_trees,
+            final_num_trees=final_num_trees)
 
         print 'CV  test-ndcg@10: %.4f' % (tune_results['metrics']['cv-test'])
         print 'CV train-ndcg@10: %.4f' % (tune_results['metrics']['cv-train'])
@@ -131,11 +130,12 @@
         '-f', '--folds', dest='num_folds', default=5, type=int,
         help='Number of cross validation folds to use. (Default: 5)')
     parser.add_argument(
-        '-n', '--node-evaluations', dest='target_node_evaluations', type=int, default=None,
-        help='Approximate number of node evaluations per predication that '
-             + 'the final result will require. This controls the number of '
-             + 'trees used in the final result. Default uses 100 trees rather '
-             + 'than dynamically choosing based on max_depth. (Default: None)')
+        '--initial-trees', dest='initial_num_trees', default=100, type=int,
        help='Number of trees to perform hyperparameter tuning with. (Default: 100)')
+    parser.add_argument(
+        '--final-trees', dest='final_num_trees', default=None, type=int,
+        help='Number of trees in the final ensemble. If not provided, the value from '
+             + '--initial-trees will be used. (Default: None)')
     parser.add_argument(
         '-t', '--test-path', dest='test_dir', type=str, required=False, default=None,
         help='A holdout test set to evaluate the final model against')
diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py
index dd20135..c010fb0 100644
--- a/mjolnir/training/xgboost.py
+++ b/mjolnir/training/xgboost.py
@@ -403,7 +403,7 @@
     return eta_pred[idx]
 
 
-def tune(df, num_folds=5, num_cv_jobs=5, num_workers=5, target_node_evaluations=5000):
+def tune(df, num_folds=5, num_cv_jobs=5, num_workers=5, initial_num_trees=100, final_num_trees=500):
     """Find appropriate hyperparameters for training df
 
     This is far from perfect, hyperparameter tuning is a bit of a black art
@@ -432,13 +432,14 @@
         number of executors used will be (num_cv_jobs * num_workers). Generally
         prefer executors with more cpu's over a higher number of workers where
         possible. (Default: 5)
-    target_node_evaluations : int, optional
-        The approximate number of node evaluations per prediction that the
-        final result will require. This controls the number of trees used in
-        the final result. The number of trees will be (target_node_evaluations
-        / optimal_max_depth). This is by far the most expensive part to tune,
-        setting to None skips this and uses a constant 100 trees.
-        (Default: 5000)
+    initial_num_trees: int, optional
+        The number of trees to do most of the hyperparameter tuning with. This
+        should be large enough to be reasonably representative of the final
+        training size. (Default: 100)
+    final_num_trees: int, optional
+        The number of trees to do the final eta optimization with. If set to
+        None, the final eta optimization will be skipped and initial_num_trees
+        will be kept. (Default: 500)
 
     Returns
     -------
@@ -482,7 +483,7 @@
     space = {
         'objective': 'rank:ndcg',
         'eval_metric': 'ndcg@10',
-        'num_rounds': 100,
+        'num_rounds': initial_num_trees,
         'min_child_weight': 200,
         'max_depth': 6,
         'gamma': 0,
@@ -519,7 +520,6 @@
     space['min_child_weight'] = int(best_complexity['min_child_weight'])
     pprint.pprint(space)
 
-
     # subsample helps make the model more robust to noisy data. For each update to
     # a tree only this % of samples are considered.
     space['subsample'] = hyperopt.hp.quniform('subsample', 0.8, 1, .01)
@@ -536,19 +536,19 @@
     space['colsample_bytree'] = best_noise['colsample_bytree']
     pprint.pprint(space)
 
-    # Finally increase the number of trees to our target, which is mostly based
-    # on how computationally expensive it is to generate predictions with the final
-    # model. Find the optimal eta for this new # of trees. This step can take as
-    # much time as all previous steps combined, and then some, so it can be disabled
-    # with target_node_evalations of None.
-    if target_node_evaluations is None:
+    # Finally increase the number of trees to our target, if it was requested.
+    if final_num_trees is None or final_num_trees == initial_num_trees:
         trials_trees = None
         trials_final = trials_noise
     else:
-        space['num_rounds'] = target_node_evaluations / space['max_depth']
+        space['num_rounds'] = final_num_trees
         # TODO: Is 30 steps right amount? too many? too few? This generally
-        # uses a large number of trees which takes 10 to 20 minutes per evaluation.
-        # That means evaluating 15 points is 2.5 to 5 hours.
+        # uses a large number of trees which takes 10 to 20 minutes per evaluation
+        # on large training sets. That means evaluating 15 points is 2.5 to 5 hours.
+        # TODO: The appropriate space here really depends on the amount of data and
+        # the number of trees. A small wiki with 300k observations and 500 trees needs
+        # to search a very different space than a large wiki with 30M observations
+        # and the same 500 trees.
         etas = np.linspace(0.01, 0.3, 30)
         space['eta'] = hyperopt.hp.choice('eta', etas)
         best_trees, trials_trees = eval_space_grid(space)
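
For reference, a hedged sketch of calling the updated tune() directly with
the new keyword arguments. The DataFrame df_hits_with_features and the
surrounding Spark setup are assumed to come from the existing pipeline;
only the keyword arguments themselves are introduced by this change:

    import mjolnir.training.xgboost

    # Tune the hyperparameter space with a cheap 100-tree model, then
    # re-optimize eta for a 500-tree final ensemble. Passing
    # final_num_trees=None keeps the 100-tree ensemble instead.
    tune_results = mjolnir.training.xgboost.tune(
        df_hits_with_features,  # assumed: feature DataFrame built earlier in the pipeline
        num_folds=5,
        num_cv_jobs=5,
        num_workers=5,
        initial_num_trees=100,
        final_num_trees=500)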

-- 
To view, visit https://gerrit.wikimedia.org/r/377343
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie5a50865244af91e082d8acb04596d12dfbe3f0a
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>
