EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/406068 )
Change subject: Fixup xgboost training ...................................................................... Fixup xgboost training * Use tree_method = hist when training on a single worker. This is significantly faster than the approx method used by default. * We've always trained with dense feature matrices, and the ltr plugin only supports dense evaluation, but the DataWriter was writing out sparse matrices. This caused a degradation in ndcg. * The txt file emitted by DataWriter has to be read by lightgbm and xgboost. As such it starts features at idx 1 to make lightgbm happy (which stores the label at idx 0). This broke XGBoostModel.eval because it was not providing the empty feature at index 0 that training sees. * XGBoostModel.loadModelFrom* always failed because the summary method on the jvm side throws an exception (metrics are not serialized). Wrap in try/except and set summary to None when not available. Change-Id: I48d7bf96a300313b6f62e3f60742345e8bd1a83f --- M jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala M jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala M mjolnir/training/xgboost.py M mjolnir/utilities/make_folds.py 4 files changed, 10 insertions(+), 4 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/68/406068/1 diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala index 8d6976b..01fcf6d 100644 --- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala +++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala @@ -28,7 +28,7 @@ ) extends Serializable { // Accepting JavaSparkContext for py4j compatability - def this(sc: JavaSparkContext) = this(sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration))) + def this(sc: JavaSparkContext, sparse: Boolean) = this(sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)), sparse) private def 
asHDFSPath(path: String): HDFSPath = if (path.charAt(0) == '/') { new HDFSPath(s"file://$path") diff --git a/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala b/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala index ca85260..06004a7 100644 --- a/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala +++ b/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala @@ -44,7 +44,7 @@ try { val df = makeData() val pattern = s"$testDir/%s-fold-%s-partition-%d" - val writer = new DataWriter(spark.sparkContext) + val writer = new DataWriter(spark.sparkContext, true) val folds = writer.write(df, numWorkers, pattern, foldCol) assert(folds.length == expectedFolds) diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py index abeaabf..4d6bb9d 100644 --- a/mjolnir/training/xgboost.py +++ b/mjolnir/training/xgboost.py @@ -4,6 +4,7 @@ import mjolnir.training.hyperopt from mjolnir.training.tuning import make_cv_objective, ModelSelection import numpy as np +import py4j import pyspark import pyspark.sql from pyspark.sql import functions as F @@ -114,6 +115,8 @@ # ints, so this gets all the types right for Java. Also makes # a copy of params so we don't modifying the incoming dict. params = _coerce_params(params) + # Histogram doesn't work with distributed training + params['tree_method'] = 'hist' if len(fold) == 1 else 'approx' # TODO: Maybe num_rounds should just be external? 
But it's easier # to do hyperparameter optimization with a consistent dict interface kwargs = { @@ -158,7 +161,10 @@ class XGBoostModel(object): def __init__(self, j_xgb_model): self._j_xgb_model = j_xgb_model - self.summary = XGBoostSummary(self._j_xgb_model.summary()) + try: + self.summary = XGBoostSummary(self._j_xgb_model.summary()) + except py4j.protocol.Py4JJavaError: + self.summary = None @staticmethod def trainWithFiles(fold, train_matrix, params, num_rounds=100, diff --git a/mjolnir/utilities/make_folds.py b/mjolnir/utilities/make_folds.py index c7ac04d..5cbd682 100644 --- a/mjolnir/utilities/make_folds.py +++ b/mjolnir/utilities/make_folds.py @@ -64,7 +64,7 @@ write_xgb(local_input, local_output.name) # Write out as text files from scala, much faster than shuffling to python - writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc) + writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc, False) j_paths = writer.write(df._jdf, num_workers, path_format, fold_col) # Convert everything to python objects -- To view, visit https://gerrit.wikimedia.org/r/406068 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I48d7bf96a300313b6f62e3f60742345e8bd1a83f Gerrit-PatchSet: 1 Gerrit-Project: search/MjoLniR Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits