EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/406068 )

Change subject: Fixup xgboost training
......................................................................

Fixup xgboost training

* Use tree_method = hist when training on a single worker. This is
  significantly faster than the approx method used by default (see the
  first sketch after this list).

* We've always trained with dense feature matrices, and the ltr plugin
  only supports dense evaluation, but the DataWriter was writing out
  sparse matrices. This caused a degradation in NDCG.

* The txt file emitted by DataWriter has to be readable by both
  lightgbm and xgboost. It therefore starts features at idx 1 to make
  lightgbm happy (lightgbm stores the label at idx 0). This broke
  XGBoostModel.eval because it was not providing the empty feature at
  index 0 that training sees (see the second sketch after this list).

* XGBoostModel.loadModelFrom* always failed because the summary
  method on the JVM side throws an exception (metrics are not
  serialized). Wrap it in try/except and set summary to None when it
  is not available.
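
First sketch, for illustration only (it mirrors the xgboost.py hunk
below; `fold` is assumed to be the per-worker list of matrix paths):

    # Sketch: xgboost's hist tree method is much faster than approx,
    # but it does not work with distributed training, so enable it
    # only when training runs on a single worker.
    params['tree_method'] = 'hist' if len(fold) == 1 else 'approx'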
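
Second sketch, a hypothetical example (not code from this change) of
the index convention in the libsvm-style text files both trainers read:

    # One training line is "<label> <idx>:<value> ...", with features
    # starting at idx 1 because lightgbm keeps the label at idx 0:
    #   1 1:0.53 2:12.0 3:0.7
    # xgboost therefore sees an always-empty feature at index 0, so a
    # dense vector handed to XGBoostModel.eval must be padded to match:
    features = [0.53, 12.0, 0.7]
    padded = [0.0] + features  # empty feature at idx 0, as in training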

Change-Id: I48d7bf96a300313b6f62e3f60742345e8bd1a83f
---
M jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
M jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala
M mjolnir/training/xgboost.py
M mjolnir/utilities/make_folds.py
4 files changed, 10 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR refs/changes/68/406068/1

diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
index 8d6976b..01fcf6d 100644
--- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
+++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
@@ -28,7 +28,7 @@
 ) extends Serializable {
 
  // Accepting JavaSparkContext for py4j compatibility
-  def this(sc: JavaSparkContext) = this(sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)))
+  def this(sc: JavaSparkContext, sparse: Boolean) = this(sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)), sparse)
 
   private def asHDFSPath(path: String): HDFSPath = if (path.charAt(0) == '/') {
     new HDFSPath(s"file://$path")
diff --git a/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala b/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala
index ca85260..06004a7 100644
--- a/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala
+++ b/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala
@@ -44,7 +44,7 @@
       try {
         val df = makeData()
         val pattern = s"$testDir/%s-fold-%s-partition-%d"
-        val writer = new DataWriter(spark.sparkContext)
+        val writer = new DataWriter(spark.sparkContext, true)
         val folds = writer.write(df, numWorkers, pattern, foldCol)
 
         assert(folds.length == expectedFolds)
diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py
index abeaabf..4d6bb9d 100644
--- a/mjolnir/training/xgboost.py
+++ b/mjolnir/training/xgboost.py
@@ -4,6 +4,7 @@
 import mjolnir.training.hyperopt
 from mjolnir.training.tuning import make_cv_objective, ModelSelection
 import numpy as np
+import py4j
 import pyspark
 import pyspark.sql
 from pyspark.sql import functions as F
@@ -114,6 +115,8 @@
     # ints, so this gets all the types right for Java. Also makes
 # a copy of params so we don't modify the incoming dict.
     params = _coerce_params(params)
+    # Histogram doesn't work with distributed training
+    params['tree_method'] = 'hist' if len(fold) == 1 else 'approx'
     # TODO: Maybe num_rounds should just be external? But it's easier
     # to do hyperparameter optimization with a consistent dict interface
     kwargs = {
@@ -158,7 +161,10 @@
 class XGBoostModel(object):
     def __init__(self, j_xgb_model):
         self._j_xgb_model = j_xgb_model
-        self.summary = XGBoostSummary(self._j_xgb_model.summary())
+        try:
+            self.summary = XGBoostSummary(self._j_xgb_model.summary())
+        except py4j.protocol.Py4JJavaError:
+            self.summary = None
 
     @staticmethod
     def trainWithFiles(fold, train_matrix, params, num_rounds=100,
diff --git a/mjolnir/utilities/make_folds.py b/mjolnir/utilities/make_folds.py
index c7ac04d..5cbd682 100644
--- a/mjolnir/utilities/make_folds.py
+++ b/mjolnir/utilities/make_folds.py
@@ -64,7 +64,7 @@
                     write_xgb(local_input, local_output.name)
 
     # Write out as text files from scala, much faster than shuffling to python
-    writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc)
+    writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc, False)
     j_paths = writer.write(df._jdf, num_workers, path_format, fold_col)
 
     # Convert everything to python objects

-- 
To view, visit https://gerrit.wikimedia.org/r/406068
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I48d7bf96a300313b6f62e3f60742345e8bd1a83f
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>
