[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Fixup xgboost training

2018-01-24 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/406068 )

Change subject: Fixup xgboost training
..

Fixup xgboost training

* Use tree_method = hist when training on a single worker. This is
  significantly faster than the approx method used by default.

* We've always trained with dense feature matrices, and the ltr plugin
  only supports dense evaluation, but the DataWriter was writing out
  sparse matrices. This caused a degradation in ndcg.

* The txt file emitted by DataWriter has to be readable by both
  lightgbm and xgboost. It therefore starts features at idx 1 to keep
  lightgbm happy (which stores the label at idx 0). This broke
  XGBoostModel.eval because it was not providing the empty feature at
  index 0 that training sees (see the sketch below).

* XGBoostModel.loadModelFrom* always failed because the summary
  method on the jvm side throws an exception (metrics are not
  serialized). Wrap in try/except and set summary to None when not
  available.
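
For illustration, a rough Python sketch of the layout described above. This is
not MjoLniR's actual writer; it assumes a LibSVM-style line format and the
helper names are hypothetical:

    # Label first, then "index:value" pairs with feature indices shifted by +1,
    # so index 0 is effectively an always-empty feature.
    def format_line(label, features):
        pairs = " ".join("%d:%f" % (i + 1, v) for i, v in enumerate(features))
        return "%d %s" % (int(label), pairs)

    # At evaluation time a dense vector needs a placeholder at index 0 to line
    # up with what the model saw during training.
    def shift_right(features):
        return [0.0] + list(features)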

Change-Id: I48d7bf96a300313b6f62e3f60742345e8bd1a83f
---
M jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
M jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala
M mjolnir/training/xgboost.py
M mjolnir/utilities/make_folds.py
4 files changed, 10 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/68/406068/1

diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala 
b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
index 8d6976b..01fcf6d 100644
--- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
+++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
@@ -28,7 +28,7 @@
 ) extends Serializable {
 
   // Accepting JavaSparkContext for py4j compatibility
-  def this(sc: JavaSparkContext) = this(sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)))
+  def this(sc: JavaSparkContext, sparse: Boolean) = this(sc.broadcast(new SerializableConfiguration(sc.hadoopConfiguration)), sparse)
 
   private def asHDFSPath(path: String): HDFSPath = if (path.charAt(0) == '/') {
 new HDFSPath(s"file://$path")
diff --git 
a/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala 
b/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala
index ca85260..06004a7 100644
--- a/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala
+++ b/jvm/src/test/scala/org/wikimedia/search/mjolnir/DataWriterSuite.scala
@@ -44,7 +44,7 @@
   try {
 val df = makeData()
 val pattern = s"$testDir/%s-fold-%s-partition-%d"
-val writer = new DataWriter(spark.sparkContext)
+val writer = new DataWriter(spark.sparkContext, true)
 val folds = writer.write(df, numWorkers, pattern, foldCol)
 
 assert(folds.length == expectedFolds)
diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py
index abeaabf..4d6bb9d 100644
--- a/mjolnir/training/xgboost.py
+++ b/mjolnir/training/xgboost.py
@@ -4,6 +4,7 @@
 import mjolnir.training.hyperopt
 from mjolnir.training.tuning import make_cv_objective, ModelSelection
 import numpy as np
+import py4j
 import pyspark
 import pyspark.sql
 from pyspark.sql import functions as F
@@ -114,6 +115,8 @@
 # ints, so this gets all the types right for Java. Also makes
 # a copy of params so we don't modify the incoming dict.
 params = _coerce_params(params)
+# Histogram doesn't work with distributed training
+params['tree_method'] = 'hist' if len(fold) == 1 else 'approx'
 # TODO: Maybe num_rounds should just be external? But it's easier
 # to do hyperparameter optimization with a consistent dict interface
 kwargs = {
@@ -158,7 +161,10 @@
 class XGBoostModel(object):
     def __init__(self, j_xgb_model):
         self._j_xgb_model = j_xgb_model
-        self.summary = XGBoostSummary(self._j_xgb_model.summary())
+        try:
+            self.summary = XGBoostSummary(self._j_xgb_model.summary())
+        except py4j.protocol.Py4JJavaError:
+            self.summary = None
 
 @staticmethod
 def trainWithFiles(fold, train_matrix, params, num_rounds=100,
diff --git a/mjolnir/utilities/make_folds.py b/mjolnir/utilities/make_folds.py
index c7ac04d..5cbd682 100644
--- a/mjolnir/utilities/make_folds.py
+++ b/mjolnir/utilities/make_folds.py
@@ -64,7 +64,7 @@
 write_xgb(local_input, local_output.name)
 
 # Write out as text files from scala, much faster than shuffling to python
-writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc)
+writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc, False)
 j_paths = writer.write(df._jdf, num_workers, path_format, fold_col)
 
 # Convert everything to python objects

-- 
To view, visit https://gerrit.wikimedia.org/r/406068
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings


[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Pull make_cv_objective outside tuner

2018-01-24 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/406067 )

Change subject: Pull make_cv_objective outside tuner
..

Pull make_cv_objective outside tuner

This really had no business in the tuner: its function is
independent and it didn't require any of the tuner's state. Adds
a test that verifies the function works roughly as expected.

Also drop the 'condition' argument from tuner stages. A standard
if condition should be used when building the stage list.
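
A minimal usage sketch of the now module-level helper, mirroring the new test
(the training function and fold contents below are toy placeholders):

    import mjolnir.training.tuning

    def train_one_fold(fold, params, **kwargs):
        # Pretend training: report per-fold train/test scores
        return {'train': [0.81], 'test': [0.79]}

    folds = [['fold-0'], ['fold-1']]
    objective = mjolnir.training.tuning.make_cv_objective(train_one_fold, folds, 1)
    per_fold_scores = objective({'max_depth': 5})  # one result dict per fold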

Change-Id: Ic3dff6a1a055cba3fc57debd4a1e3417476ddd4a
---
M mjolnir/test/training/test_tuning.py
M mjolnir/training/tuning.py
M mjolnir/training/xgboost.py
M mjolnir/utils.py
4 files changed, 72 insertions(+), 63 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/67/406067/1

diff --git a/mjolnir/test/training/test_tuning.py 
b/mjolnir/test/training/test_tuning.py
index 22402f1..15389d7 100644
--- a/mjolnir/test/training/test_tuning.py
+++ b/mjolnir/test/training/test_tuning.py
@@ -46,7 +46,7 @@
 }
 
 tuner = mjolnir.training.tuning.ModelSelection(initial_space, tune_stages)
-train_func = tuner.make_cv_objective(f, folds, num_cv_jobs, **kwargs)
+train_func = mjolnir.training.tuning.make_cv_objective(f, folds, num_cv_jobs, **kwargs)
 trials_pool = tuner.build_pool(folds, num_cv_jobs)
 result = tuner(train_func, trials_pool)
 return result, stats['called']
@@ -80,39 +80,14 @@
 assert result['params']['baz'] == 0
 
 
-def test_ModelSelection_stage_condition():
-num_iterations = 3
-result, called = run_model_selection([
-('a', {
-'condition': lambda: False,
-'iterations': num_iterations,
-'space': {
-'foo': hyperopt.hp.uniform('foo', 1, 9),
-}
-}),
-('b', {
-'iterations': num_iterations,
-'space': {
-'bar': hyperopt.hp.uniform('bar', 1, 9),
-}
-}),
-])
-# iterations * folds
-assert called == num_iterations * 2
-assert result['params']['foo'] == 10
-assert 1 <= result['params']['bar'] <= 9
-assert result['params']['baz'] == 0
-
-
 def test_ModelSelection_kwargs_pass_thru():
-tuner = mjolnir.training.tuning.ModelSelection(None, None)
 expected_kwargs = {'hi': 5, 'there': 'test'}
 
 def f(fold, params, **kwargs):
 assert kwargs == expected_kwargs
 return {'test': [fold[0]], 'train': [fold[0]]}
 
-obj = tuner.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs)
+obj = mjolnir.training.tuning.make_cv_objective(f, [[1], [2]], 1, **expected_kwargs)
 
 res = obj(None)
 assert res == [
@@ -144,3 +119,23 @@
 folds = [[1] * num_workers for i in range(num_folds)]
 pool = tuner.build_pool(folds, num_cv_jobs)
 assert (pool is not None) == expect_pool
+
+
+def test_ModelSelection_transformer():
+    stats = {'called': 0}
+
+    def transformer(result, params):
+        assert 'foo' in result
+        assert result['foo'] == 'bar'
+        assert params == 'some params'
+        stats['called'] += 1
+        return 'baz'
+
+    def f(fold, params):
+        assert params == 'some params'
+        return {'foo': 'bar'}
+
+    folds = [[1, 2, 3], [4, 5, 6]]
+    obj = mjolnir.training.tuning.make_cv_objective(f, folds, 1, transformer)
+    assert obj('some params') == ['baz', 'baz']
+    assert stats['called'] == 2
diff --git a/mjolnir/training/tuning.py b/mjolnir/training/tuning.py
index 7d2df68..81bfafe 100644
--- a/mjolnir/training/tuning.py
+++ b/mjolnir/training/tuning.py
@@ -133,11 +133,48 @@
 return with_retry
 
 
+def make_cv_objective(train_func, folds, num_cv_jobs, transformer=None, **kwargs):
+    """Create a cross-validation objective function
+
+    Parameters
+    ----------
+    train_func : callable
+        Function accepting a fold and hyperparameters to perform training
+    folds : list
+        Folds to train against
+    num_cv_jobs : int
+        The total number of folds to train in parallel
+    transformer : callable or None, optional
+        Function accepting output of train_func and hyperparameters to
+        return stats about the individual fold train/test performance
+
+    Returns
+    -------
+    callable
+        Accepts a set of hyperparameters as only argument and returns a
+        list of per-fold train/test performance.
+    """
+    train_func = _py4j_retry(train_func, None)
+    if num_cv_jobs > 1:
+        cv_pool = Pool(num_cv_jobs)
+        cv_mapper = cv_pool.map
+    else:
+        cv_mapper = map
+
+    def f(params):
+        def inner(fold):
+            return train_func(fold, params, **kwargs)
+
+        return cv_mapper(inner, folds)
+
+    if transformer is None:
+        return f
+    else:
+        return lambda params: [transformer(scores, params) for scores in f(params)]
+
+
 class ModelSelection(object):
-def __init__(self, initial_space, tune_stages, 

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: add python interface to scala dbn

2018-01-24 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/406069 )

Change subject: add python interface to scala dbn
..

add python interface to scala dbn

It turns out only the driver has a py4j connection to the JVM;
executors talk to Spark directly through sockets. To use JVM
implementations in the executors we need to trigger them from the
JVM side. Added an implementation and some basic tests.
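
Roughly, the driver-side pattern this enables looks like the sketch below: the
Python process hands the DataFrame's underlying JVM object to the Scala
implementation and lets the JVM distribute the work. The train call shown is
illustrative only, not the real signature:

    import pyspark.sql

    def train_dbn(sc, df):
        j_dbn = sc._jvm.org.wikimedia.search.mjolnir.DBN  # JVM object added in this patch
        j_result = j_dbn.train(df._jdf)                   # hypothetical method/arguments
        return pyspark.sql.DataFrame(j_result, df.sql_ctx)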

Change-Id: Iee7f79662e89bcf64cdb447aac0df5b68ee1170c
---
M jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
M jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
M jvm/src/test/scala/org/wikimedia/search/mjolnir/DBNSuite.scala
M mjolnir/dbn.py
M mjolnir/test/conftest.py
M mjolnir/test/training/test_xgboost.py
M mjolnir/utilities/data_pipeline.py
M setup.py
8 files changed, 164 insertions(+), 197 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/69/406069/1

diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala 
b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
index faac7dc..d051c7d 100644
--- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
+++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
@@ -13,7 +13,12 @@
   * implementation was ported from python clickmodels by Aleksandr Chuklin and 
the
   * notes on math were added in an attempt to understand why the 
implementation works.
   */
+import org.apache.spark.rdd.RDD
+
 import scala.collection.mutable
+import org.apache.spark.sql.{DataFrame, Row, functions => F}
+import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
+import org.apache.spark.sql.{types => T}
 import org.json4s.{JArray, JBool, JString}
 import org.json4s.jackson.JsonMethods
 
@@ -29,7 +34,7 @@
   def urlToId(queryId: Int, url: String): Int = {
 val urlToIdMap = queryIdToUrlToIdMap.getOrElseUpdate(queryId, { 
mutable.Map() })
 urlToIdMap.getOrElseUpdate(url, {
-  var nextUrlId = queryIdToNextUrlId.getOrElse(queryId, 0)
+  val nextUrlId = queryIdToNextUrlId.getOrElse(queryId, 0)
   queryIdToNextUrlId(queryId) = nextUrlId + 1
   nextUrlId
 })
@@ -79,7 +84,7 @@
   c
 }
 
-val hasClicks = allClicks.take(n).exists { x => x}
+val hasClicks = allClicks.exists { x => x }
 if (urls.length < minDocsPerQuery ||
 (discardNoClicks && !hasClicks)
 ) {
@@ -185,20 +190,14 @@
 // attractiveness and satisfaction values for each position
 class PositionRel(var a: Array[Double], var s: Array[Double])
 
-case class SessionEstimate(
-  a: (Double, Double), s: (Double, Double),
-  e: Array[(Double, Double)], C: Double,
-  clicks: Array[Double])
-
-
 class DbnModel(gamma: Double, config: Config) {
   val invGamma: Double = 1D - gamma
 
   def train(sessions: Seq[SessionItem]): Array[Array[UrlRel]] = {
 // This is basically a multi-dimensional array with queryId in the first
-// dimension and urlId in the second dimension. Because queries only 
reference
-// a subset of the known urls we use a map at the second level instead of
-// creating the entire matrix.
+// dimension and urlId in the second dimension. InputReader guarantees
+// that queryId starts at 0 and is continuous, and that per-query id urlId
+// also starts at 0 and is continuous, allowing static sized arrays to be 
used.
 val urlRelevances: Array[Array[UrlRel]] = (0 to config.maxQueryId).map { 
queryId =>
   (0 to config.maxUrlIds(queryId)).map { _ => new 
UrlRel(config.defaultRel, config.defaultRel) }.toArray
 }.toArray
@@ -267,7 +266,7 @@
   val queryUrlRelFrac = urlRelFractions(s.queryId)
   i = 0
   while (i < N) {
-var urlId = s.urlIds(i)
+val urlId = s.urlIds(i)
 // update attraction
 val rel = queryUrlRelFrac(urlId)
 val estA = sessionEstimate.a(i)
@@ -410,7 +409,7 @@
 // (alpha, beta)
   }
 
-  var sessionEstimate = new PositionRel(new Array[Double](config.serpSize), 
new Array[Double](config.serpSize))
+  val sessionEstimate = new PositionRel(new Array[Double](config.serpSize), 
new Array[Double](config.serpSize))
   // Returns
   //  a: P(A_i|C_i,G) - Probability of attractiveness at position i 
conditioned on clicked and gamma
   //  s: P(S_i|C_i,G) - Probability of satisfaction at position i conditioned 
on clicked and gamma
@@ -461,4 +460,95 @@
   }
 }
 
+private class DbnHitPage(val hitPageId: Int, val hitPosition: Double, val 
clicked: Boolean)
 
+/**
+  * Predict relevance of query/page pairs from individual user search sessions.
+  */
+object DBN {
+  // TODO: These should all be configurable? Perhaps
+  // also simplified somehow...
+  private val CLICKED = "clicked"
+  private val HITS = "hits"
+  private val HIT_PAGE_ID = "hit_page_id"
+  private val HIT_POSITION = "hit_position"
+  private val NORM_QUERY_ID = "norm_query_id"
+  private val RELEVANCE = "relevance"
+  

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Add end-to-end integration test

2018-01-24 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/406070 )

Change subject: Add end-to-end integration test
..

Add end-to-end integration test

A basic end to end run through of the training pipeline. It's
of course a bit slow, but worthwhile to see the whole operation
run from end to end.

* verifies that the general premise works
* outputs from one stage match the inputs expected by the next
* models are in expected places and loadable
* evaluations run against the models match train time metrics

Change-Id: I8ad5fe1dbbbd50b897362b44411cfc19650b0390
---
M jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala
M mjolnir/test/conftest.py
A mjolnir/test/fixtures/requests/test_integration.sqlite3
M mjolnir/test/training/test_xgboost.py
M mjolnir/training/xgboost.py
M mjolnir/utilities/data_pipeline.py
M mjolnir/utilities/make_folds.py
M mjolnir/utilities/training_pipeline.py
8 files changed, 63 insertions(+), 20 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/70/406070/1

diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala 
b/jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala
index 8c6929e..9200fe9 100644
--- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala
+++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/PythonUtils.scala
@@ -1,7 +1,7 @@
 package org.wikimedia.search.mjolnir
 
 import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
-import org.apache.spark.ml.linalg.{Vector => MLVector}
+import org.apache.spark.ml.linalg.{DenseVector, SparseVector, Vectors, Vector 
=> MLVector}
 import org.apache.spark.rdd.RDD
 import org.apache.spark.sql.functions.col
 import org.apache.spark.sql.{Dataset, Row}
@@ -13,22 +13,38 @@
   * pyspark.
   */
 object PythonUtils {
+  private def shiftVector(vec: MLVector): MLVector = vec match {
+case y: DenseVector => Vectors.dense(Array(0D) ++ y.toArray)
+case y: SparseVector => Vectors.sparse(y.size + 1, y.indices.map(_ + 1), 
y.values)
+  }
+
   /**
* There is no access to LabeledPoint from pyspark, but various methods such 
as
* trainWithRDD and eval require an RDD[MLLabeledPoint]. This offers a 
bridge to
* convert a Dataset into the required format.
*
+   * @deprecated
* @param ds Input dataframe containing features and label
* @param featureCol Name of the column containing feature vectors
* @param labelCol Name of the column containing numeric labels
+   * @param shiftRight Shift all features to index + 1. This is a disappointing hack,
+   *  but due to the way data files are created feature indices start
+   *  at 1 and the 0 feature is empty. This allows shifting to match
+   *  when evaluating a dataframe against a model trained that way.
*/
-  def toLabeledPoints(ds: Dataset[_], featureCol: String, labelCol: String): RDD[MLLabeledPoint] = {
+  def toLabeledPoints(ds: Dataset[_], featureCol: String, labelCol: String, shiftRight: Boolean): RDD[MLLabeledPoint] = {
     ds.select(col(featureCol), col(labelCol).cast(DoubleType)).rdd.map {
       case Row(feature: MLVector, label: Double) =>
-        MLLabeledPoint(label, feature)
+        val shiftedFeature = if (shiftRight) shiftVector(feature) else feature
+        MLLabeledPoint(label, shiftedFeature)
 }
   }
 
+  def toLabeledPoints(ds: Dataset[_], featureCol: String, labelCol: String): 
RDD[MLLabeledPoint] = {
+toLabeledPoints(ds, featureCol, labelCol, shiftRight = false)
+  }
+
+
   /**
* Training/evaluating a ranking model in XGBoost requires rows for the same
* query to be provided sequentially, and it needs to know for each partition
diff --git a/mjolnir/test/conftest.py b/mjolnir/test/conftest.py
index efc8441..c4c3d77 100644
--- a/mjolnir/test/conftest.py
+++ b/mjolnir/test/conftest.py
@@ -72,7 +72,8 @@
 .set('spark.jars.packages', ','.join([
 'ml.dmlc:xgboost4j-spark:0.8-wmf-1',
 'org.wikimedia.search:mjolnir:0.4-SNAPSHOT',
-'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0']))
+'org.apache.spark:spark-streaming-kafka-0-8_2.11:2.1.0',
+'org.wikimedia.analytics.refinery.hive:refinery-hive:0.0.57']))
 # By default spark will shuffle to 200 partitions, which is
 # way too many for our small test cases. This cuts execution
 # time of the tests in half.
diff --git a/mjolnir/test/fixtures/requests/test_integration.sqlite3 
b/mjolnir/test/fixtures/requests/test_integration.sqlite3
new file mode 100644
index 000..44957bf
--- /dev/null
+++ b/mjolnir/test/fixtures/requests/test_integration.sqlite3
Binary files differ
diff --git a/mjolnir/test/training/test_xgboost.py 
b/mjolnir/test/training/test_xgboost.py
index 100a3e8..ba8dc48 100644
--- a/mjolnir/test/training/test_xgboost.py
+++ b/mjolnir/test/training/test_xgboost.py
@@ 

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Support LXC in Vagrantfile

2018-01-18 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/405209 )

Change subject: Support LXC in Vagrantfile
..

Support LXC in Vagrantfile

LXC, from a Linux host, can be significantly more responsive
than VirtualBox. Support it in the Vagrantfile.

Change-Id: I2b1e2d41beea97dd216c8c75395735a472104208
---
M Vagrantfile
1 file changed, 14 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/09/405209/1

diff --git a/Vagrantfile b/Vagrantfile
index d8d71ad..1005dca 100644
--- a/Vagrantfile
+++ b/Vagrantfile
@@ -1,18 +1,23 @@
 Vagrant.configure("2") do |config|
 
+config.vm.provider :lxc do |_lxc, override|
+override.vm.box = 'LEAP/jessie'
+end
+
 config.vm.provider :virtualbox do |vb, override|
 override.vm.box = 'debian/contrib-jessie64'
 vb.customize ['modifyvm', :id, '--memory', '2048']
+
+root_share_options = { id: 'vagrant-root' }
+root_share_options[:type] = :nfs
+root_share_options[:mount_options] = ['noatime', 'rsize=32767', 
'wsize=3267', 'async', 'nolock']
+override.nfs.map_uid = Process.uid
+override.nfs.map_gid = Process.gid
+override.vm.synced_folder ".", "/vagrant", root_share_options
+
+override.vm.hostname = "MjoLniR"
+override.vm.network "private_network", type: "dhcp"
 end
 
-root_share_options = { id: 'vagrant-root' }
-root_share_options[:type] = :nfs
-root_share_options[:mount_options] = ['noatime', 'rsize=32767', 
'wsize=3267', 'async', 'nolock']
-config.nfs.map_uid = Process.uid
-config.nfs.map_gid = Process.gid
-config.vm.synced_folder ".", "/vagrant", root_share_options
-
-config.vm.hostname = "MjoLniR"
-config.vm.network "private_network", type: "dhcp"
 config.vm.provision "shell", path: "bootstrap-vm.sh"
 end

-- 
To view, visit https://gerrit.wikimedia.org/r/405209
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2b1e2d41beea97dd216c8c75395735a472104208
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Switch wiktionary sister search on enwiki to title only

2018-01-18 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/405206 )

Change subject: Switch wiktionary sister search on enwiki to title only
..

Switch wiktionary sister search on enwiki to title only

Bug: T185250
Change-Id: I43ff74472e4cdd2a925cf284905e70c318eb6468
---
M wmf-config/CirrusSearch-common.php
M wmf-config/InitialiseSettings.php
2 files changed, 19 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config 
refs/changes/06/405206/1

diff --git a/wmf-config/CirrusSearch-common.php 
b/wmf-config/CirrusSearch-common.php
index e5e45ac..572a692 100644
--- a/wmf-config/CirrusSearch-common.php
+++ b/wmf-config/CirrusSearch-common.php
@@ -241,14 +241,7 @@
 $wgCirrusSearchFetchConfigFromApi = true;
 
 // Override sister search profiles for specific projects
-$wgCirrusSearchCrossProjectProfiles = [
-   // full text wikivoyage results are often irrelevant, filter the
-   // search with title matches to improve relevance.
-   'voy' => [
-   'ftbuilder' => 'perfield_builder_title_filter',
-   'rescore' => 'wsum_inclinks',
-   ],
-];
+$wgCirrusSearchCrossProjectProfiles = $wmgCirrusSearchCrossProjectProfiles;
 
 $wgCirrusSearchCrossProjectSearchBlackList = 
$wmgCirrusSearchCrossProjectSearchBlackList;
 $wgCirrusSearchCrossProjectShowMultimedia = 
$wmgCirrusSearchCrossProjectShowMultimedia;
diff --git a/wmf-config/InitialiseSettings.php 
b/wmf-config/InitialiseSettings.php
index 39dccc7..2c9b460 100755
--- a/wmf-config/InitialiseSettings.php
+++ b/wmf-config/InitialiseSettings.php
@@ -18437,6 +18437,24 @@
'itwikivoyage' => false,
 ],
 
+'wmgCirrusSearchCrossProjectProfiles' => [
+   'default' => [
+   // full text wikivoyage results are often irrelevant, filter the
+   // search with title matches to improve relevance
+   'voy' => [
+   'ftbuilder' => 'perfield_builder_title_filter',
+   'rescore' => 'wsum_inclinks',
+   ],
+   ],
+   '+enwiki' => [
+   // T185250
+   'wikt' => [
+   'ftbuilder' => 'perfield_builder_title_filter',
+   'rescore' => 'wsum_inclinks',
+   ],
+   ],
+],
+
 'wmgCirrusSearchIgnoreOnWikiBoostTemplates' => [
'default' => false,
	// on wiki boost templates have too high boosts for enwiki

-- 
To view, visit https://gerrit.wikimedia.org/r/405206
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I43ff74472e4cdd2a925cf284905e70c318eb6468
Gerrit-PatchSet: 1
Gerrit-Project: operations/mediawiki-config
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.17]: Turn off cirrus AB test on hewiki

2018-01-16 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/404595 )

Change subject: Turn off cirrus AB test on hewiki
..

Turn off cirrus AB test on hewiki

Test has run its course. Time to turn off and reset hewiki
sampling back to the old (default) rates.

Bug: T182616
Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee
(cherry picked from commit b2495fcacf3ae7325f34f37705c2c108eb29e513)
---
M modules/all/ext.wikimediaEvents.searchSatisfaction.js
1 file changed, 1 insertion(+), 7 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/95/404595/1

diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js 
b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
index de78f11..b818261 100644
--- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js
+++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
@@ -114,9 +114,7 @@
function initialize( session ) {
 
var sessionId = session.get( 'sessionId' ),
-   validBuckets = mw.config.get( 'wgDBname' ) === 
'hewiki' ?
-   [ 'control', 'ltr-1024', 'ltr-1024-i' ] 
:
-   [],
+   validBuckets = [],
sampleSize = ( function () {
var dbName = mw.config.get( 'wgDBname' 
),
// Provides a place to handle 
wiki-specific sampling,
@@ -188,10 +186,6 @@
zhwiki: {
test: 100,
subTest: null
-   },
-   hewiki: {
-   test: 0.8112,
-   subTest: 0.8767
}
};
if ( subTests[ dbName ] ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/404595
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikimediaEvents
Gerrit-Branch: wmf/1.31.0-wmf.17
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.16]: Turn off cirrus AB test on hewiki

2018-01-16 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/404594 )

Change subject: Turn off cirrus AB test on hewiki
..

Turn off cirrus AB test on hewiki

Test has run its course. Time to turn off and reset hewiki
sampling back to the old (default) rates.

Bug: T182616
Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee
(cherry picked from commit b2495fcacf3ae7325f34f37705c2c108eb29e513)
---
M modules/all/ext.wikimediaEvents.searchSatisfaction.js
1 file changed, 1 insertion(+), 7 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/94/404594/1

diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js 
b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
index de78f11..b818261 100644
--- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js
+++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
@@ -114,9 +114,7 @@
function initialize( session ) {
 
var sessionId = session.get( 'sessionId' ),
-   validBuckets = mw.config.get( 'wgDBname' ) === 
'hewiki' ?
-   [ 'control', 'ltr-1024', 'ltr-1024-i' ] 
:
-   [],
+   validBuckets = [],
sampleSize = ( function () {
var dbName = mw.config.get( 'wgDBname' 
),
// Provides a place to handle 
wiki-specific sampling,
@@ -188,10 +186,6 @@
zhwiki: {
test: 100,
subTest: null
-   },
-   hewiki: {
-   test: 0.8112,
-   subTest: 0.8767
}
};
if ( subTests[ dbName ] ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/404594
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikimediaEvents
Gerrit-Branch: wmf/1.31.0-wmf.16
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Remove cirrus AB test config for hewiki

2018-01-16 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/404592 )

Change subject: Remove cirrus AB test config for hewiki
..

Remove cirrus AB test config for hewiki

This test is complete and the configuration is no longer
necessary.

Bug: T182616
Change-Id: Ibb55c90ecf7fc562860fb69a06d2d1f2babf49aa
---
M wmf-config/InitialiseSettings.php
1 file changed, 0 insertions(+), 24 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config 
refs/changes/92/404592/1

diff --git a/wmf-config/InitialiseSettings.php 
b/wmf-config/InitialiseSettings.php
index 9767eb5..c76cecc 100644
--- a/wmf-config/InitialiseSettings.php
+++ b/wmf-config/InitialiseSettings.php
@@ -18702,30 +18702,6 @@
 
 'wmgCirrusSearchUserTesting' => [
'default' => [],
-   'hewiki' => [
-   'ltr' => [
-   'globals' => [],
-   'buckets' => [
-   'control' => [
-   'trigger' => 'control',
-   ],
-   'ltr-1024' => [
-   'trigger' => 'ltr-1024',
-   'globals' => [
-   'wgCirrusSearchRescoreProfile' 
=> 'mlr-1024rs',
-   ]
-   ],
-   'ltr-1024-i' => [
-   'trigger' => 'ltr-1024-i',
-   'globals' => [
-   
'wgCirrusSearchInterleaveConfig' => [
-   
'CirrusSearchRescoreProfile' => 'mlr-1024rs'
-   ],
-   ],
-   ],
-   ],
-   ],
-   ],
 ],
 
 'wmgCirrusSearchLanguageDetectors' => [

-- 
To view, visit https://gerrit.wikimedia.org/r/404592
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibb55c90ecf7fc562860fb69a06d2d1f2babf49aa
Gerrit-PatchSet: 1
Gerrit-Project: operations/mediawiki-config
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[master]: Turn off hewiki AB test

2018-01-16 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/404591 )

Change subject: Turn off hewiki AB test
..

Turn off hewiki AB test

Test has run its course. Time to turn off and reset hewiki
sampling back to the old (default) rates.

Bug: T182616
Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee
---
M modules/all/ext.wikimediaEvents.searchSatisfaction.js
1 file changed, 1 insertion(+), 7 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/91/404591/1

diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js 
b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
index de78f11..b818261 100644
--- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js
+++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
@@ -114,9 +114,7 @@
function initialize( session ) {
 
var sessionId = session.get( 'sessionId' ),
-   validBuckets = mw.config.get( 'wgDBname' ) === 
'hewiki' ?
-   [ 'control', 'ltr-1024', 'ltr-1024-i' ] 
:
-   [],
+   validBuckets = [],
sampleSize = ( function () {
var dbName = mw.config.get( 'wgDBname' 
),
// Provides a place to handle 
wiki-specific sampling,
@@ -188,10 +186,6 @@
zhwiki: {
test: 100,
subTest: null
-   },
-   hewiki: {
-   test: 0.8112,
-   subTest: 0.8767
}
};
if ( subTests[ dbName ] ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/404591
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0b6d95af218379d47043c63a240ec5a7132d86ee
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikimediaEvents
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Generalize tuning pipeline

2018-01-11 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403869 )

Change subject: Generalize tuning pipeline
..

Generalize tuning pipeline

This pipeline was pretty convoluted. Push most of the complexity
up out of the pipeline into a single ModelSelection object, leaving
the rest of the model selection code (cross validation, tuning,
parameter selection, etc.) clearer and more directly implemented.
This also provides a reusable tuning implementation to share between
xgboost and lightgbm.
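
A minimal sketch of the reusable tuning flow this refactor introduces, based on
the tests added in this patch; the training function and search space below are
toy placeholders, not real training code:

    import hyperopt
    import mjolnir.training.tuning

    initial_space = {'foo': 10, 'num_rounds': 100}
    tune_stages = [
        ('stage_a', {
            'iterations': 3,
            'space': {'foo': hyperopt.hp.uniform('foo', 1, 9)},
        }),
    ]

    def train_one_fold(fold, params, **kwargs):
        # Toy training function: report per-fold train/test scores
        return {'train': [0.81], 'test': [0.79]}

    folds = [[1, 2, 3], [4, 5, 6]]
    tuner = mjolnir.training.tuning.ModelSelection(initial_space, tune_stages)
    train_func = tuner.make_cv_objective(train_one_fold, folds, 1)
    trials_pool = tuner.build_pool(folds, 1)
    result = tuner(train_func, trials_pool)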

Change-Id: I8f2a2f3aeca85fe86cb6d466622a2e83dd249172
---
M mjolnir/test/training/test_hyperopt.py
M mjolnir/test/training/test_tuning.py
M mjolnir/training/hyperopt.py
M mjolnir/training/lightgbm.py
M mjolnir/training/tuning.py
M mjolnir/training/xgboost.py
M mjolnir/utilities/training_pipeline.py
7 files changed, 289 insertions(+), 276 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/69/403869/1

diff --git a/mjolnir/test/training/test_hyperopt.py 
b/mjolnir/test/training/test_hyperopt.py
index c4c4782..1dec547 100644
--- a/mjolnir/test/training/test_hyperopt.py
+++ b/mjolnir/test/training/test_hyperopt.py
@@ -1,27 +1,19 @@
 from __future__ import absolute_import
 import hyperopt
 import mjolnir.training.hyperopt
-from pyspark.ml.linalg import Vectors
-import pytest
 
 
-def _make_q(query, n=4):
-"Generates single feature queries"
-return [('foowiki', query, query, float(f), Vectors.dense([float(f)])) for 
f in range(n)]
-
-
-@pytest.fixture
-def df_train(spark_context, hive_context):
-# TODO: Use some fixture dataset representing real-ish data? But
-# it needs to be pretty small
-return spark_context.parallelize(
-_make_q('abc') + _make_q('def') + _make_q('ghi') + _make_q('jkl')
-+ _make_q('mno') + _make_q('pqr') + _make_q('stu')
-).toDF(['wikiid', 'norm_query_id', 'query', 'label', 'features'])
-
-
-def test_minimize(folds_b):
+def test_maximize(folds_b):
 "Not an amazing test...basically sees if the happy path doesnt blow up"
+def f(params):
+assert isinstance(params, dict)
+assert 'max_depth' in params
+assert params['num_rounds'] == 50
+return [{
+'train': [0.80],
+'test': [0.79],
+}]
+
 space = {
 'num_rounds': 50,
 'max_depth': hyperopt.hp.quniform('max_depth', 1, 20, 1)
@@ -30,33 +22,11 @@
 # mostly hyperopt just calls cross_validate, of which the integration with
 # xgboost is separately tested. Instead of going all the way into xgboost
 # mock it out w/MockModel.
-best_params, trails = mjolnir.training.hyperopt.minimize(
-folds_b, MockModel, space, max_evals=5)
+best_params, trails = mjolnir.training.hyperopt.maximize(
+f, space, max_evals=5)
 assert isinstance(best_params, dict)
 # num_rounds should have been unchanged
 assert 'num_rounds' in best_params
 assert best_params['num_rounds'] == 50
 # should have max_evals evaluations
 assert len(trails.trials) == 5
-
-
-class MockSummary(object):
-def train(self):
-return [1.]
-
-def test(self):
-return [1.]
-
-
-class MockModel(object):
-def __init__(self, df, params, train_matrix=None):
-# Params that were passed to hyperopt
-assert isinstance(params, dict)
-assert 'max_depth' in params
-assert params['num_rounds'] == 50
-
-def eval(self, df_test, j_groups=None, feature_col='features', 
label_col='label'):
-return 1.0
-
-def summary(self):
-return MockSummary()
diff --git a/mjolnir/test/training/test_tuning.py 
b/mjolnir/test/training/test_tuning.py
index e14982b..22402f1 100644
--- a/mjolnir/test/training/test_tuning.py
+++ b/mjolnir/test/training/test_tuning.py
@@ -1,8 +1,8 @@
 from __future__ import absolute_import
+import hyperopt
 import mjolnir.training.tuning
 import mjolnir.training.xgboost
 from pyspark.sql import functions as F
-from pyspark.ml.linalg import Vectors
 import pytest
 
 
@@ -32,27 +32,115 @@
 assert len(queries_in_0.intersection(queries_in_1)) == 0
 
 
-def _make_q(query, n=4):
-"Generates single feature queries"
-return [('foowiki', query, query, float(f), Vectors.dense([float(f)])) for 
f in range(n)]
+def run_model_selection(tune_stages, f=None, num_cv_jobs=1, **kwargs):
+stats = {'called': 0}
+initial_space = {'foo': 10, 'bar': 20, 'baz': 0}
+folds = [[1, 2, 3], [4, 5, 6]]
+if not f:
+def f(fold, params, **kwargs):
+stats['called'] += 1
+factor = 1.0 / (6 * params['foo'])
+return {
+'test': [v * factor * 0.9 for v in fold],
+'train': [v * factor for v in fold],
+}
+
+tuner = mjolnir.training.tuning.ModelSelection(initial_space, tune_stages)
+train_func = tuner.make_cv_objective(f, folds, num_cv_jobs, 

[MediaWiki-commits] [Gerrit] mediawiki/core[wmf/1.31.0-wmf.16]: Deprecate old interwiki search result widget

2018-01-11 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403719 )

Change subject: Deprecate old interwiki search result widget
..

Deprecate old interwiki search result widget

Update the flag for the new interwiki sidebar so that unset means
enabled rather than disabled. Deprecate the old rendering widgets to be
removed at a later date per the deprecation policy.

Change-Id: I80d8375bbd3e1fabc9b2432b6875d17a96aee099
Related: I9a488438
(cherry picked from commit d95f644e80fb894ca4f22a9fcdeab53cde9dedc9)
---
M includes/specials/SpecialSearch.php
M includes/widget/search/SimpleSearchResultSetWidget.php
M includes/widget/search/SimpleSearchResultWidget.php
3 files changed, 8 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/19/403719/1

diff --git a/includes/specials/SpecialSearch.php 
b/includes/specials/SpecialSearch.php
index b3a58cb..f826844 100644
--- a/includes/specials/SpecialSearch.php
+++ b/includes/specials/SpecialSearch.php
@@ -394,7 +394,8 @@
$linkRenderer = $this->getLinkRenderer();
$mainResultWidget = new FullSearchResultWidget( $this, 
$linkRenderer );
 
-   if ( $search->getFeatureData( 'enable-new-crossproject-page' ) 
) {
+   // Default (null) on. Can be explicitly disabled.
+   if ( $search->getFeatureData( 'enable-new-crossproject-page' ) 
!== false ) {
$sidebarResultWidget = new InterwikiSearchResultWidget( 
$this, $linkRenderer );
$sidebarResultsWidget = new 
InterwikiSearchResultSetWidget(
$this,
diff --git a/includes/widget/search/SimpleSearchResultSetWidget.php 
b/includes/widget/search/SimpleSearchResultSetWidget.php
index d6583a3..d0c259f 100644
--- a/includes/widget/search/SimpleSearchResultSetWidget.php
+++ b/includes/widget/search/SimpleSearchResultSetWidget.php
@@ -13,6 +13,8 @@
  * Renders one or more SearchResultSets into a sidebar grouped by
  * interwiki prefix. Includes a per-wiki header indicating where
  * the results are from.
+ *
+ * @deprecated since 1.31. Use InterwikiSearchResultSetWidget
  */
 class SimpleSearchResultSetWidget implements SearchResultSetWidget {
/** @var SpecialSearch */
@@ -32,6 +34,7 @@
LinkRenderer $linkRenderer,
InterwikiLookup $iwLookup
) {
+   wfDeprecated( __METHOD__, '1.31' );
$this->specialSearch = $specialSearch;
$this->resultWidget = $resultWidget;
$this->linkRenderer = $linkRenderer;
diff --git a/includes/widget/search/SimpleSearchResultWidget.php 
b/includes/widget/search/SimpleSearchResultWidget.php
index fa07563..552cbaf 100644
--- a/includes/widget/search/SimpleSearchResultWidget.php
+++ b/includes/widget/search/SimpleSearchResultWidget.php
@@ -9,6 +9,8 @@
 
 /**
  * Renders a simple one-line result
+ *
+ * @deprecated since 1.31. Use other result widgets.
  */
 class SimpleSearchResultWidget implements SearchResultWidget {
/** @var SpecialSearch */
@@ -17,6 +19,7 @@
protected $linkRenderer;
 
public function __construct( SpecialSearch $specialSearch, LinkRenderer 
$linkRenderer ) {
+   wfDeprecated( __METHOD__, '1.31' );
$this->specialSearch = $specialSearch;
$this->linkRenderer = $linkRenderer;
}

-- 
To view, visit https://gerrit.wikimedia.org/r/403719
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I80d8375bbd3e1fabc9b2432b6875d17a96aee099
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: wmf/1.31.0-wmf.16
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Repoint spark in example_train.yaml

2018-01-10 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403546 )

Change subject: Repoint spark in example_train.yaml
..

Repoint spark in example_train.yaml

Mostly this makes it easier to push a branch over to stat1005 and try
something out against full-sized data. Having example_train.yaml
be "close enough" helps a good bit.

Also update spark to 2.1.2 to match, and add a 'master' template
so yarn/local can be toggled from the spark.py command line.

Change-Id: Iccd44c0c9436287ba963a3c8b2244b3fa0a46274
---
M example_train.yaml
A mjolnir/pruning.py
A mjolnir/scan_es.py
M mjolnir/test/fixtures/load_config/example_train.expect
4 files changed, 316 insertions(+), 100 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/46/403546/1

diff --git a/example_train.yaml b/example_train.yaml
index 31ce45c..7b1749c 100644
--- a/example_train.yaml
+++ b/example_train.yaml
@@ -3,10 +3,11 @@
 global:
 environment:
 PYSPARK_PYTHON: venv/bin/python
-SPARK_CONF_DIR: /etc/spark/conf
-SPARK_HOME: "%(HOME)s/spark-%(spark_version)s-bin-hadoop2.6"
+SPARK_CONF_DIR: /etc/spark2/conf
+SPARK_HOME: "/usr/lib/spark2"
 template_vars:
-spark_version: 2.1.0
+spark_version: 2.1.2
+master: yarn
 # Path to spark-submit application
 spark_submit: "%(SPARK_HOME)s/bin/spark-submit"
 # Local path to zip'd virtualenv which will be shipped to executors
@@ -50,7 +51,7 @@
 ? "%(spark_submit)s"
 ? "%(PYSPARK_PYTHON)s"
 spark_args:
-master: yarn
+master: "%(master)s"
 # TODO: When is this necessary?
 files: /usr/lib/libhdfs.so.0.0.0
 # Ship the mjolnir virtualenv to executors and decompress it to ./venv
diff --git a/mjolnir/pruning.py b/mjolnir/pruning.py
new file mode 100644
index 000..c2e78f9
--- /dev/null
+++ b/mjolnir/pruning.py
@@ -0,0 +1,134 @@
+from __future__ import absolute_import
+import json
+import math
+from pyspark.sql import functions as F
+from pyspark.sql.types import FloatType, StructField, StructType
+
+
+class Split(object):
+def __init__(self, left, right, feature, threshold):
+self.left = left
+self.right = right
+self.feature = feature
+self.threshold = threshold
+
+def isLeaf(self):
+return False
+
+def eval(self, features):
+n = self
+while not n.isLeaf():
+if n.threshold > features[n.feature]:
+n = n.left
+else:
+n = n.right
+return n.output
+
+
+class Leaf(object):
+def __init__(self, output):
+self.output = output
+
+def isLeaf(self):
+return True
+
+
+def _parse_node(json_node):
+if 'leaf' in json_node:
+return Leaf(json_node['leaf'])
+else:
+left = _parse_node(json_node['children'][0])
+right = _parse_node(json_node['children'][1])
+return Split(left, right, json_node['split'], 
json_node['split_condition'])
+
+
+def parse_xgboost(json_tree):
+return [_parse_node(tree) for tree in json.loads(json_tree)]
+
+
+def ndcg_at_k(k, predicted, actual):
+idcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in 
enumerate(actual[:k])])
+if idcg == 0:
+return 0.
+else:
+dcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in 
enumerate(predicted[:k])])
+return dcg / idcg
+
+
+# Horrible name ... it returns the ndcg for each removed tree
+def gen_per_tree_ndcg(tree_cols, removed_trees, label_col, k=10):
+def f(rows):
+# Remove trees from the sum
+cur_sum = [reduce(lambda acc, tree: acc - row[tree], removed_trees, 
row.sum) for row in rows]
+data = zip(rows, cur_sum)
+
+# TODO: actual could be pre-calculated? Actually full idcg could be 
pre-calculated
+actual = [x[0][label_col] for x in sorted(data, key=lambda x: 
x[0][label_col], reverse=True)]
+# baseline ndcg
+predicted = [x[0][label_col] for x in sorted(data, key=lambda x: x[1], 
reverse=True)]
+res = [ndcg_at_k(k, predicted, actual)]
+# Per-tree ndcgs
+for tree_pred in tree_cols:
+predicted = [x[0][label_col] for x in sorted(data, key=lambda x: 
x[1] - x[0][tree_pred], reverse=True)]
+res.append(ndcg_at_k(k, predicted, actual))
+return res
+fields = [StructField(name, FloatType()) for name in ['orig'] + tree_cols]
+return F.udf(f, StructType(fields))
+
+
+def gen_eval_tree_udf(bc_trees):
+def f(tree_id, features):
+return bc_trees.value[tree_id].eval(features)
+return F.udf(f, FloatType())
+
+
+def prune(df, trees, feature_col='features', label_col='label', 
group_cols=['wikiid', 'query']):
+# Calculate per-tree scores
+eval_tree_udf = gen_eval_tree_udf(df._sc.broadcast(trees))
+

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: JVM components to support file-based training

2018-01-10 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403545 )

Change subject: JVM components to support file-based training
..

JVM components to support file-based training

An upcoming refactor changes training_pipeline.py from dataframe-based
training to file-based training, where we emit partitioned and
formatted folds/splits to hdfs and load them into training by copying
each to a local file and pointing the C++ code at it.

This is a separate patch so we can release a new version of the
MjoLniR jar. Due to how our CI works python cannot test against new
jvm code until it has been released.

The entry points that python will be using are (see the driver-side sketch below):
* DataWriter.write
* MlrXGBoost.trainWithFiles
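
A hedged sketch of how the Python side is expected to drive the DataWriter entry
point over py4j; the argument lists are illustrative and the constructor
signature changes in a later patch in this thread:

    def write_folds(sc, df, num_workers, path_format, fold_col):
        writer = sc._jvm.org.wikimedia.search.mjolnir.DataWriter(sc._jsc)
        # Returns per-fold/per-partition paths to the emitted text files
        return writer.write(df._jdf, num_workers, path_format, fold_col)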

Change-Id: Ib5e8cd9d3e87e724f05b5ec0941c140aa5077d71
---
M .gitignore
D jvm/mjolnir.iml
M jvm/pom.xml
A jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala
A jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala
A jvm/src/main/scala/org/wikimedia/search/mjolnir/DataWriter.scala
A jvm/src/main/scala/org/wikimedia/search/mjolnir/MlrXGBoost.scala
A jvm/src/test/resources/fixtures/datasets/test.txt
A jvm/src/test/resources/fixtures/datasets/test.txt.query
A jvm/src/test/resources/fixtures/datasets/train.txt
A jvm/src/test/resources/fixtures/datasets/train.txt.query
M jvm/src/test/scala/org/wikimedia/search/mjolnir/PythonUtilsSuite.scala
12 files changed, 749 insertions(+), 163 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/45/403545/1

diff --git a/.gitignore b/.gitignore
index f2a9cf7..83930f6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -27,6 +27,7 @@
 # Editor temporary files
 .*.sw[po]
 /jvm/.idea
+/jvm/mjolnir.iml
 
 # Vagrant, and cdh stuff in vagrant
 .vagrant
diff --git a/jvm/mjolnir.iml b/jvm/mjolnir.iml
deleted file mode 100644
index b341014..000
--- a/jvm/mjolnir.iml
+++ /dev/null
@@ -1,162 +0,0 @@
-[ 162 lines of IntelliJ module XML deleted; the markup was stripped by the mail archive ]
\ No newline at end of file
diff --git a/jvm/pom.xml b/jvm/pom.xml
index 479b4ec..d1fdc13 100644
--- a/jvm/pom.xml
+++ b/jvm/pom.xml
@@ -14,7 +14,7 @@
 2.1.0
 2.11.8
 2.11
-0.7-wmf-1
+0.8-wmf-1-SNAPSHOT
 
 
 
@@ -146,6 +146,16 @@
 
jackson-module-scala_${scala.binary.version}
 2.6.5
 
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j-spark</artifactId>
+    <version>${xgboost.version}</version>
+</dependency>
+<dependency>
+    <groupId>ml.dmlc</groupId>
+    <artifactId>xgboost4j</artifactId>
+    <version>${xgboost.version}</version>
+</dependency>
 
 
 
diff --git 
a/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala 
b/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala
new file mode 100644
index 000..45a00af
--- /dev/null
+++ b/jvm/src/main/scala/ml/dmlc/xgboost4j/scala/spark/MjolnirUtils.scala
@@ -0,0 +1,28 @@
+package ml.dmlc.xgboost4j.scala.spark
+
+import ml.dmlc.xgboost4j.java.IRabitTracker
+import ml.dmlc.xgboost4j.scala.Booster
+import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
+
+/**
+  * Provide access to package-private constructs of xgboost4j-spark
+  */
+object MjolnirUtils {
+  def model(booster: Booster, metrics: Map[String, Array[Float]], trainMatrix: 
String): XGBoostModel = {
+// Arbitrarily take an 'other' matrix if available
+val xgMetrics = metrics.keys.find(!_.equals(trainMatrix)).map{ name => Map(
+  "train" -> metrics(trainMatrix),
+  "test" -> metrics(name)
+) }.getOrElse(Map(
+  "train" -> metrics(trainMatrix)
+))
+
+val model = new XGBoostRegressionModel(booster)
+model.setSummary(XGBoostTrainingSummary(xgMetrics))
+model
+  }
+
+  def scalaRabitTracker(nWorkers: Int): IRabitTracker = {
+new RabitTracker(nWorkers)
+  }
+}
diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala 
b/jvm/src/main/scala/org/wikimedia/search/mjolnir/AsLocalFile.scala
new file mode 100644
index 000..9962b3a
--- /dev/null
+++ 

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Add lightgbm support

2018-01-09 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403335 )

Change subject: Add lightgbm support
..

Add lightgbm support

Only single-executor training is supported at the moment. Distributed
training is left for another day.

Change-Id: Ia9a188ef87afc86985ac9c3e269b6665dcceca10
---
A mjolnir/training/lightgbm.py
M mjolnir/utilities/training_pipeline.py
M setup.py
3 files changed, 248 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/35/403335/1

diff --git a/mjolnir/training/lightgbm.py b/mjolnir/training/lightgbm.py
new file mode 100644
index 000..cbc8883
--- /dev/null
+++ b/mjolnir/training/lightgbm.py
@@ -0,0 +1,221 @@
+from __future__ import absolute_import
+import contextlib
+import functools
+import hyperopt
+import json
+import lightgbm as lgb
+import math
+import mjolnir.training.hyperopt
+from mjolnir.utils import as_local_paths
+from multiprocessing.dummy import Pool
+import numpy as np
+import pyspark
+
+
+def _overrideParamsAccordingToTaskCpus(sc, params):
+n_cpus = int(sc.getConf().get("spark.task.cpus", "1"))
+if 'num_threads' not in params:
+params['num_threads'] = n_cpus
+elif params['num_threads'] > n_cpus:
+raise Exception(
+"the num_threads param %d must be no larger than spark.task.cpus 
(%d)" % (
+params['num_threads'], n_cpus))
+
+
+@contextlib.contextmanager
+def load_datasets(fold, train_matrix):
+with as_local_paths(*fold.values) as local_paths:
+datasets = dict(zip(fold.keys(), local_paths))
+try:
+yield datasets
+finally:
+for ds in datasets.values():
+ds._free_handle()
+
+
+def build_distributed_boosters(rdd, params, train_matrix):
+def build_partition(rows):
+fold = rows.next()
+try:
+rows.next()
+raise Exception("Expected single row in partition but received 
more.")
+except StopIteration:
+pass
+
+num_rounds = 100
+if 'num_rounds' in params:
+num_rounds = params['num_rounds']
+del params['num_rounds']
+
+# TODO: Generalize
+with load_datasets(fold) as datasets:
+eval_results = {}
+gbm = lgb.train(
+params, datasets[train_matrix],
+num_boost_round=num_rounds,
+valid_sets=datasets.values(), valid_names=datasets.keys(),
+early_stopping_rounds=None, evals_result=eval_results)
+gbm.free_dataset()
+yield (gbm, eval_results)
+
+return rdd.mapPartitions(build_partition).cache()
+
+
+def _coerce_params(params):
+types = {
+'min_data_in_leaf': int,
+'num_leaves': int,
+}
+for k, val_type in types.items():
+if k in params:
+params[k] = val_type(params[k])
+
+
+def train(fold, paramOverrides, train_matrix=None):
+sc = pyspark.SparkContext.getOrCreate()
+params = {
+'boosting_type': 'gbdt',
+'objective': 'lambdarank',
+'metric': 'ndcg',
+'ndcg_eval_at': '1,3,5,10',
+'is_training_metric': True,
+'num_rounds': 100,
+'max_bin': 255,
+'num_leaves': 63,
+'learning_rate': 0.1,
+'feature_fraction': 1.0,
+'bagging_fraction': 0.9,
+'bagging_freq': 1,
+'verbose': 0,
+}
+params.update(paramOverrides)
+_overrideParamsAccordingToTaskCpus(sc, params)
+_coerce_params(params)
+
+if (len(fold) > 1):
+rdd = sc.parallelize(list(enumerate(fold)), 1).partitionBy(len(fold), 
lambda x: x).map(lambda x: x[1])
+raise Exception("TODO: Distributed Training")
+else:
+rdd = sc.parallelize(fold, 1)
+
+if train_matrix is None:
+train_matrix = "all" if "all" in fold else "train"
+
+booster, metrics = build_distributed_boosters(rdd, params, 
train_matrix).collect()[0]
+return LightGBMModel(booster, metrics)
+
+
+class LightGBMSummary(object):
+def __init__(self, metrics):
+self._metrics = metrics
+
+def train(self):
+return self._metrics['train']['ndcg@10']
+
+def test(self):
+return self._metrics['test']['ndcg@10']
+
+
+class LightGBMModel(object):
+def __init__(self, booster, metrics):
+self._booster = booster
+self.metrics = metrics
+
+def summary(self):
+return LightGBMSummary(self.metrics)
+
+def dump(self, features=None):
+# TODO: lightgbm needs features provided when creating the dataset
+return json.dumps(self._booster.dump_model())
+
+def saveModelAsLocalFile(self, path):
+self._booster.save_model(path)
+
+
+def tune(folds, stats, train_matrix, num_cv_jobs=5, num_workers=5, 
initial_num_trees=100, final_num_trees=500):
+cv_pool = None
+if num_cv_jobs > 1:
+cv_pool 

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Simplify hyperparameter tuning

2018-01-09 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/40 )

Change subject: Simplify hyperparameter tuning
..

Simplify hyperparameter tuning

I tested letting all the tuning happen at once instead of
the iterative approach we were using; it went quicker and
gave comparable results. This will also make it easier to
add in lightgbm as an alternate training algo.

Also removed the use_external_memory parameter from xgboost. This
is specialized and won't be necessary anymore after an
upcoming refactor for file-based training.

Change-Id: I8cc4ee504d0e49bc61ffc5d2781e131fabe4372c
---
M example_train.yaml
A mjolnir/pruning.py
A mjolnir/scan_es.py
M mjolnir/test/fixtures/load_config/example_train.expect
M mjolnir/test/training/test_hyperopt.py
M mjolnir/test/training/test_tuning.py
M mjolnir/training/hyperopt.py
M mjolnir/training/tuning.py
M mjolnir/training/xgboost.py
M mjolnir/utilities/training_pipeline.py
10 files changed, 306 insertions(+), 330 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/33/40/1

diff --git a/example_train.yaml b/example_train.yaml
index 183ea6e..31ce45c 100644
--- a/example_train.yaml
+++ b/example_train.yaml
@@ -138,7 +138,6 @@
 cv-jobs: 22
 folds: 3
 final-trees: 100
-use-external-memory: yes
 
 medium:
 # 4M to 12M observations per executor.
diff --git a/mjolnir/pruning.py b/mjolnir/pruning.py
new file mode 100644
index 000..c2e78f9
--- /dev/null
+++ b/mjolnir/pruning.py
@@ -0,0 +1,134 @@
+from __future__ import absolute_import
+import json
+import math
+from pyspark.sql import functions as F
+from pyspark.sql.types import FloatType, StructField, StructType
+
+
+class Split(object):
+def __init__(self, left, right, feature, threshold):
+self.left = left
+self.right = right
+self.feature = feature
+self.threshold = threshold
+
+def isLeaf(self):
+return False
+
+def eval(self, features):
+n = self
+while not n.isLeaf():
+if n.threshold > features[n.feature]:
+n = n.left
+else:
+n = n.right
+return n.output
+
+
+class Leaf(object):
+def __init__(self, output):
+self.output = output
+
+def isLeaf(self):
+return True
+
+
+def _parse_node(json_node):
+if 'leaf' in json_node:
+return Leaf(json_node['leaf'])
+else:
+left = _parse_node(json_node['children'][0])
+right = _parse_node(json_node['children'][1])
+return Split(left, right, json_node['split'], 
json_node['split_condition'])
+
+
+def parse_xgboost(json_tree):
+return [_parse_node(tree) for tree in json.loads(json_tree)]
+
+
+def ndcg_at_k(k, predicted, actual):
+idcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in 
enumerate(actual[:k])])
+if idcg == 0:
+return 0.
+else:
+dcg = sum([((1 << label) - 1) / math.log(i + 2.0, 2) for i, label in 
enumerate(predicted[:k])])
+return dcg / idcg
+
+
+# Horrible name ... it returns the ndcg for each removed tree
+def gen_per_tree_ndcg(tree_cols, removed_trees, label_col, k=10):
+def f(rows):
+# Remove trees from the sum
+cur_sum = [reduce(lambda acc, tree: acc - row[tree], removed_trees, 
row.sum) for row in rows]
+data = zip(rows, cur_sum)
+
+# TODO: actual could be pre-calculated? Actually full idcg could be 
pre-calculated
+actual = [x[0][label_col] for x in sorted(data, key=lambda x: 
x[0][label_col], reverse=True)]
+# baseline ndcg
+predicted = [x[0][label_col] for x in sorted(data, key=lambda x: x[1], 
reverse=True)]
+res = [ndcg_at_k(k, predicted, actual)]
+# Per-tree ndcgs
+for tree_pred in tree_cols:
+predicted = [x[0][label_col] for x in sorted(data, key=lambda x: 
x[1] - x[0][tree_pred], reverse=True)]
+res.append(ndcg_at_k(k, predicted, actual))
+return res
+fields = [StructField(name, FloatType()) for name in ['orig'] + tree_cols]
+return F.udf(f, StructType(fields))
+
+
+def gen_eval_tree_udf(bc_trees):
+def f(tree_id, features):
+return bc_trees.value[tree_id].eval(features)
+return F.udf(f, FloatType())
+
+
+def prune(df, trees, feature_col='features', label_col='label', 
group_cols=['wikiid', 'query']):
+# Calculate per-tree scores
+eval_tree_udf = gen_eval_tree_udf(df._sc.broadcast(trees))
+cols = [eval_tree_udf(F.lit(i), feature_col).alias('tree_%d' % (i)) for i 
in range(len(trees))]
+tree_cols = ['tree_%d' % (i) for i in range(len(trees))]
+
+# We should iterate until it gets worse or we hit some desired # of trees
+df_w_scores = (
+df
+.select(feature_col, label_col, 
F.concat(*group_cols).alias('group_id'))
+ 

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: [WIP] distributed training for lightgbm

2018-01-09 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/403336 )

Change subject: [WIP] distributed training for lightgbm
..

[WIP] distributed training for lightgbm

Untested. The daemon never shuts down cleanly.
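
For context, a minimal sketch of the standard LightGBM network options
involved in socket-based distributed training (values below are
placeholders; in this patch the machine list and port are assembled by
the Pyro4 client via request_machine_list, one entry per Spark
partition, and every worker must receive the same list):

# Minimal sketch, not the patch's code. 'tree_learner' is an assumption
# for data-parallel learning; the other keys are LightGBM's documented
# network parameters.
params = {
    'objective': 'lambdarank',
    'tree_learner': 'data',
    'num_machines': 2,
    'machines': '10.0.0.1:12400,10.0.0.2:12400',  # placeholder ip:port list
    'local_listen_port': 12400,
}
# Each worker then builds a Dataset from its own partition and calls
# lgb.train(params, dataset, ...); training blocks until all listed
# machines have connected to each other.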

Change-Id: Id50f4f53b221003a89555e870bb771ba26faad21
---
M mjolnir/training/lightgbm.py
1 file changed, 267 insertions(+), 81 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/36/403336/1

diff --git a/mjolnir/training/lightgbm.py b/mjolnir/training/lightgbm.py
index cbc8883..460740b 100644
--- a/mjolnir/training/lightgbm.py
+++ b/mjolnir/training/lightgbm.py
@@ -9,7 +9,11 @@
 from mjolnir.utils import as_local_paths
 from multiprocessing.dummy import Pool
 import numpy as np
+import Pyro4
 import pyspark
+import socket
+import threading
+import time
 
 
 def _overrideParamsAccordingToTaskCpus(sc, params):
@@ -33,7 +37,9 @@
 ds._free_handle()
 
 
-def build_distributed_boosters(rdd, params, train_matrix):
+def build_distributed_boosters(rdd, params, train_matrix, client):
+num_partitions = rdd.getNumPartitions()
+
 def build_partition(rows):
 fold = rows.next()
 try:
@@ -47,7 +53,11 @@
 num_rounds = params['num_rounds']
 del params['num_rounds']
 
-# TODO: Generalize
+if client is not None:
+machines, listen_port = client.request_machine_list(num_partitions)
+params['machines'] = machines
+params['local_listen_port'] = listen_port
+
 with load_datasets(fold) as datasets:
 eval_results = {}
 gbm = lgb.train(
@@ -71,7 +81,7 @@
 params[k] = val_type(params[k])
 
 
-def train(fold, paramOverrides, train_matrix=None):
+def train(fold, paramOverrides, train_matrix=None, client=None):
 sc = pyspark.SparkContext.getOrCreate()
 params = {
 'boosting_type': 'gbdt',
@@ -95,13 +105,15 @@
 if (len(fold) > 1):
 rdd = sc.parallelize(list(enumerate(fold)), 1).partitionBy(len(fold), 
lambda x: x).map(lambda x: x[1])
 raise Exception("TODO: Distributed Training")
+if client is None:
+raise Exception("client required for distributed training")
 else:
 rdd = sc.parallelize(fold, 1)
 
 if train_matrix is None:
 train_matrix = "all" if "all" in fold else "train"
 
-booster, metrics = build_distributed_boosters(rdd, params, 
train_matrix).collect()[0]
+booster, metrics = build_distributed_boosters(rdd, params, train_matrix, 
client).collect()[0]
 return LightGBMModel(booster, metrics)
 
 
@@ -132,90 +144,264 @@
 self._booster.save_model(path)
 
 
+DAEMON_PORT = 6827
+
+
 def tune(folds, stats, train_matrix, num_cv_jobs=5, num_workers=5, 
initial_num_trees=100, final_num_trees=500):
 cv_pool = None
 if num_cv_jobs > 1:
 cv_pool = Pool(num_cv_jobs)
 
-# Configure the trials pool large enough to keep cv_pool full
-num_folds = len(folds)
-num_workers = len(folds[0])
-trials_pool_size = int(math.floor(num_cv_jobs / (num_workers * num_folds)))
-if trials_pool_size > 1:
-trials_pool = Pool(trials_pool_size)
-else:
-trials_pool = None
+with Daemon(socket.gethostname(), DAEMON_PORT) as daemon:
+while not daemon.ready:
+time.sleep(1)
 
-train_func = functools.partial(train, train_matrix=train_matrix)
+# Configure the trials pool large enough to keep cv_pool full
+num_folds = len(folds)
+num_workers = len(folds[0])
+trials_pool_size = int(math.floor(num_cv_jobs / (num_workers * 
num_folds)))
+if trials_pool_size > 1:
+trials_pool = Pool(trials_pool_size)
+else:
+trials_pool = None
 
-def eval_space(space, max_evals):
-max_evals = 2  # TODO: remove
-best, trials = mjolnir.training.hyperopt.minimize(
-folds, train_func, space, max_evals=max_evals,
-cv_pool=cv_pool, trials_pool=trials_pool)
-for k, v in space.items():
-if not np.isscalar(v):
-print 'best %s: %f' % (k, best[k])
-return best, trials
+kwargs = {'train_matrix': train_matrix}
+if num_workers > 1:
+kwargs['client'] = Client(daemon.url)
+train_func = functools.partial(train, **kwargs)
 
-space = {
-'boosting_type': 'gbdt',
-'objective': 'lambdarank',
-'metric': 'ndcg',
-'ndcg_eval_at': '1,3,10',
-'is_training_metric': True,
-'num_rounds': initial_num_trees,
-'max_bin': 255,
-'num_leaves': 63,
-'learning_rate': 0.1,
-'feature_fraction': 1.0,
-'bagging_fraction': 0.9,
-'bagging_freq': 1,
-}
-tune_spaces = [
-('initial', {
-'iterations': 5,
-'space': {
-

[MediaWiki-commits] [Gerrit] search/xgboost[master]: [WIP] Merge remote-tracking branch 'upstream/master'

2018-01-04 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/402114 )

Change subject: [WIP] Merge remote-tracking branch 'upstream/master'
..

[WIP] Merge remote-tracking branch 'upstream/master'

Conflicts:
   jvm-packages/pom.xml
   jvm-packages/xgboost4j-spark/pom.xml
   
jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
   jvm-packages/xgboost4j/pom.xml

Change-Id: I1ae675ee924579623f2cf5d5fc4b797c84e56d0c
---
M jvm-packages/pom.xml
M jvm-packages/xgboost4j-spark/pom.xml
M 
jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
M jvm-packages/xgboost4j/pom.xml
4 files changed, 15 insertions(+), 101 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/xgboost 
refs/changes/14/402114/1

diff --git a/jvm-packages/pom.xml b/jvm-packages/pom.xml
index 130505d..0fab33d 100644
--- a/jvm-packages/pom.xml
+++ b/jvm-packages/pom.xml
@@ -4,11 +4,7 @@
 
 ml.dmlc
 xgboost-jvm
-<<< HEAD   (9bdbdc Add unique tag to log instances in RabitTracker)
-0.7-wmf-2-SNAPSHOT
-===
-0.8-SNAPSHOT
->>> BRANCH (14c639 [jvm-packages] add dev script to update version and 
update v)
+0.8-wmf-1-SNAPSHOT
 pom
 
 UTF-8
diff --git a/jvm-packages/xgboost4j-spark/pom.xml 
b/jvm-packages/xgboost4j-spark/pom.xml
index 3532a91..5f02dd7 100644
--- a/jvm-packages/xgboost4j-spark/pom.xml
+++ b/jvm-packages/xgboost4j-spark/pom.xml
@@ -4,11 +4,7 @@
 
 ml.dmlc
 xgboost-jvm
-<<< HEAD   (9bdbdc Add unique tag to log instances in RabitTracker)
-0.7-wmf-2-SNAPSHOT
-===
-0.8-SNAPSHOT
->>> BRANCH (14c639 [jvm-packages] add dev script to update version and 
update v)
+0.8-wmf-1-SNAPSHOT
 
 xgboost4j-spark
 
diff --git 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 053fbbb..2ff1ddf 100644
--- 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -16,17 +16,12 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
-<<< HEAD   (9bdbdc Add unique tag to log instances in RabitTracker)
 import java.io.ByteArrayInputStream
 import java.util.concurrent.TimeUnit
-
-import scala.collection.mutable
-import scala.concurrent.duration.Duration
-===
 import java.io.File
 
 import scala.collection.mutable
->>> BRANCH (14c639 [jvm-packages] add dev script to update version and 
update v)
+import scala.concurrent.duration.Duration
 import scala.util.Random
 import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, 
RabitTracker => PyRabitTracker}
 import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
@@ -38,7 +33,6 @@
 import org.apache.spark.sql.Dataset
 import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
 import org.apache.spark.{SparkContext, SparkParallelismTracker, TaskContext}
-
 
 
 /**
@@ -121,23 +115,11 @@
   obj: ObjectiveTrait,
   eval: EvalTrait,
   useExternalMemory: Boolean,
-<<< HEAD   (9bdbdc Add unique tag to log instances in RabitTracker)
-  missing: Float): RDD[Array[Byte]] = {
-val partitionedData = if (data.getNumPartitions != numWorkers) {
-  logger.info(s"repartitioning training set to $numWorkers partitions")
-  data.repartition(numWorkers)
-} else {
-  data
-}
-val partitionedBaseMargin = partitionedData.map(_.baseMargin)
-val appName = partitionedData.context.appName
-===
   missing: Float,
   prevBooster: Booster
-): RDD[(Booster, Map[String, Array[Float]])] = {
+): RDD[(Int, Array[Byte], Map[String, Array[Float]])] = {
 
 val partitionedBaseMargin = data.map(_.baseMargin)
->>> BRANCH (14c639 [jvm-packages] add dev script to update version and 
update v)
 // to workaround the empty partitions in training dataset,
 // this might not be the best efficient implementation, see
 // (https://github.com/dmlc/xgboost/issues/1277)
@@ -157,42 +139,28 @@
   } else {
 None
   }
-<<< HEAD   (9bdbdc Add unique tag to log instances in RabitTracker)
 
   // Yes it's odd to access this but not do anything. We are ensuring the 
lazily
   // initialized resource monitor is setup before we enter training.
   monitor
 
-  rabitEnv.put("DMLC_TASK_ID", TaskContext.getPartitionId().toString)
-===
   rabitEnv.put("DMLC_TASK_ID", taskId)
->>> BRANCH (14c639 [jvm-packages] add dev script to update version and 
update v)
   Rabit.init(rabitEnv)
   val watches = Watches(params,
-<<< HEAD   (9bdbdc Add unique tag to log instances in RabitTracker)
-fromDenseToSparseLabeledPoints(labeledPoints, missing),
-

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Revert "Add backend support for the new crossproject result ...

2018-01-03 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/401795 )

Change subject: Revert "Add backend support for the new crossproject result 
page A/B test"
..

Revert "Add backend support for the new crossproject result page A/B test"

This test is over; we no longer need the ability to request interwiki
results only to throw them away for recall estimation. The related
feature marker, enable-new-crossproject-page, is removed from core
in the related patch.

This reverts commit b43efa1dbe8d257c6d31a4e350dbcc6d1723e9fb.

Related: I80d8375b
Change-Id: I9a4884386a5c15852af15942ff51de60d9355858
---
M CirrusSearch.php
M includes/CirrusSearch.php
2 files changed, 0 insertions(+), 31 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/95/401795/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index a2d3461..3357283 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -1062,22 +1062,6 @@
 $wgCirrusSearchCrossProjectProfiles = [];
 
 /**
- * When wgCirrusSearchEnableCrossProjectSearch is true
- * Setting wgCirrusSearchHideCrossProjectResults will
- * tell SpecialSearch to run normally without displaying
- * interwiki results.
- * Useful to report how many results we could have been
- * displayed (For analytics purpose).
- */
-$wgCirrusSearchHideCrossProjectResults = false;
-
-/**
- * Informs SpeciaSearch in core that we want
- * to use the new cross project result page
- */
-$wgCirrusSearchNewCrossProjectPage = false;
-
-/**
  * Enables the explore similar feature for search results
  * which adds links to related pages (morelike), categories and
  * languages beside each search result on the SERP.
diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index 40e759b..f41c9b3 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -407,34 +407,19 @@
$iwSearch = new InterwikiSearcher( $this->connection, 
$config, $this->namespaces, null, $highlightingConfig );
$iwSearch->setOptionsFromRequest( $this->request );
$interwikiResults = $iwSearch->getInterwikiResults( 
$term );
-
if ( $interwikiResults !== null ) {
// If we are dumping we need to convert into an 
array that can be appended to
-   $recallMetrics = [];
if ( $iwSearch->isReturnRaw() ) {
$result = [ $result ];
}
foreach ( $interwikiResults as $interwiki => 
$interwikiResult ) {
-   $recallMetrics[$interwiki] = 
"$interwiki:0";
if ( $iwSearch->isReturnRaw() ) {
$result[] = $interwikiResult;
} elseif ( $interwikiResult && 
$interwikiResult->numRows() > 0 ) {
-   $recallMetrics[$interwiki] = 
"$interwiki:" . $interwikiResult->getTotalHits();
-   // Hide the search results, we 
are only
-   // running the query for 
analytic purposes
-   if ( $this->config->get( 
'CirrusSearchHideCrossProjectResults' ) ) {
-   continue;
-   }
$result->addInterwikiResults(
$interwikiResult, 
SearchResultSet::SECONDARY_RESULTS, $interwiki
);
}
-   }
-   
$this->extraSearchMetrics['wgCirrusSearchCrossProjectRecall'] = implode( '|', 
$recallMetrics );
-   if ( $this->config->get( 
'CirrusSearchNewCrossProjectPage' ) &&
-   !$this->config->get( 
'CirrusSearchHideCrossProjectResults' ) ) {
-   
$this->features['enable-new-crossproject-page'] = true;
-   
$this->features['show-multimedia-search-results'] = $this->config->get( 
'CirrusSearchCrossProjectShowMultimedia' );
}
}
}

-- 
To view, visit https://gerrit.wikimedia.org/r/401795
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9a4884386a5c15852af15942ff51de60d9355858
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 


[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Remove old interwiki search result widget

2018-01-03 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/401794 )

Change subject: Remove old interwiki search result widget
..

Remove old interwiki search result widget

This code is no longer necessary; if interwiki search is on it should
use the new default, which includes highlights.

Change-Id: I80d8375bbd3e1fabc9b2432b6875d17a96aee099
Related: I9a488438
---
M includes/specials/SpecialSearch.php
D includes/widget/search/SimpleSearchResultSetWidget.php
D includes/widget/search/SimpleSearchResultWidget.php
3 files changed, 8 insertions(+), 210 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/94/401794/1

diff --git a/includes/specials/SpecialSearch.php 
b/includes/specials/SpecialSearch.php
index b3a58cb..789ab4d 100644
--- a/includes/specials/SpecialSearch.php
+++ b/includes/specials/SpecialSearch.php
@@ -28,8 +28,6 @@
 use MediaWiki\Widget\Search\FullSearchResultWidget;
 use MediaWiki\Widget\Search\InterwikiSearchResultWidget;
 use MediaWiki\Widget\Search\InterwikiSearchResultSetWidget;
-use MediaWiki\Widget\Search\SimpleSearchResultWidget;
-use MediaWiki\Widget\Search\SimpleSearchResultSetWidget;
 
 /**
  * implements Special:Search - Run text & title search and display the output
@@ -394,24 +392,14 @@
$linkRenderer = $this->getLinkRenderer();
$mainResultWidget = new FullSearchResultWidget( $this, 
$linkRenderer );
 
-   if ( $search->getFeatureData( 'enable-new-crossproject-page' ) 
) {
-   $sidebarResultWidget = new InterwikiSearchResultWidget( 
$this, $linkRenderer );
-   $sidebarResultsWidget = new 
InterwikiSearchResultSetWidget(
-   $this,
-   $sidebarResultWidget,
-   $linkRenderer,
-   
MediaWikiServices::getInstance()->getInterwikiLookup(),
-   $search->getFeatureData( 
'show-multimedia-search-results' )
-   );
-   } else {
-   $sidebarResultWidget = new SimpleSearchResultWidget( 
$this, $linkRenderer );
-   $sidebarResultsWidget = new SimpleSearchResultSetWidget(
-   $this,
-   $sidebarResultWidget,
-   $linkRenderer,
-   
MediaWikiServices::getInstance()->getInterwikiLookup()
-   );
-   }
+   $sidebarResultWidget = new InterwikiSearchResultWidget( $this, 
$linkRenderer );
+   $sidebarResultsWidget = new InterwikiSearchResultSetWidget(
+   $this,
+   $sidebarResultWidget,
+   $linkRenderer,
+   MediaWikiServices::getInstance()->getInterwikiLookup(),
+   $search->getFeatureData( 
'show-multimedia-search-results' )
+   );
 
$widget = new BasicSearchResultSetWidget( $this, 
$mainResultWidget, $sidebarResultsWidget );
 
diff --git a/includes/widget/search/SimpleSearchResultSetWidget.php 
b/includes/widget/search/SimpleSearchResultSetWidget.php
deleted file mode 100644
index d6583a3..000
--- a/includes/widget/search/SimpleSearchResultSetWidget.php
+++ /dev/null
@@ -1,130 +0,0 @@
-specialSearch = $specialSearch;
-   $this->resultWidget = $resultWidget;
-   $this->linkRenderer = $linkRenderer;
-   $this->iwLookup = $iwLookup;
-   }
-
-   /**
-* @param string $term User provided search term
-* @param SearchResultSet|SearchResultSet[] $resultSets List of 
interwiki
-*  results to render.
-* @return string HTML
-*/
-   public function render( $term, $resultSets ) {
-   if ( !is_array( $resultSets ) ) {
-   $resultSets = [ $resultSets ];
-   }
-
-   $this->loadCustomCaptions();
-
-   $iwResults = [];
-   foreach ( $resultSets as $resultSet ) {
-   $result = $resultSet->next();
-   while ( $result ) {
-   if ( !$result->isBrokenTitle() ) {
-   
$iwResults[$result->getTitle()->getInterwiki()][] = $result;
-   }
-   $result = $resultSet->next();
-   }
-   }
-
-   $out = '';
-   foreach ( $iwResults as $iwPrefix => $results ) {
-   $out .= $this->headerHtml( $iwPrefix, $term );
-   $out .= "";
-   // TODO: Assumes interwiki results are never paginated
-   $position = 0;
-   foreach ( $results as $result ) {
- 

[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[master]: Bring back human search relevance survey

2018-01-02 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/401631 )

Change subject: Bring back human search relevance survey
..

Bring back human search relevance survey

This survey was temporarily turned off while evaluating the results of
the first round. We are getting ready to run a second survey, this
time with queries we don't know the relevance of (the final goal of
the surveys), and need this code available to be updated before
shipping out to users.

Bug: T184019
Change-Id: I036c611d93e0f3f20f992ab89ab79b8e8d738826
---
A modules/all/ext.wikimediaEvents.humanSearchRelevance.css
A modules/all/ext.wikimediaEvents.humanSearchRelevance.js
2 files changed, 150 insertions(+), 0 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/31/401631/1

diff --git a/modules/all/ext.wikimediaEvents.humanSearchRelevance.css 
b/modules/all/ext.wikimediaEvents.humanSearchRelevance.css
new file mode 100644
index 000..41594a0
--- /dev/null
+++ b/modules/all/ext.wikimediaEvents.humanSearchRelevance.css
@@ -0,0 +1,10 @@
+/* Needs some extra specificity to override `.mw-body p` */
+.mw-notification-content .mw-wme-humanrel-question {
+   margin: 0;
+}
+
+.mw-wme-humanrel-question > span,
+.mw-wme-humanrel-question > small {
+   display: inline-block;
+   margin-top: 0.5em;
+}
diff --git a/modules/all/ext.wikimediaEvents.humanSearchRelevance.js 
b/modules/all/ext.wikimediaEvents.humanSearchRelevance.js
new file mode 100644
index 000..5803354
--- /dev/null
+++ b/modules/all/ext.wikimediaEvents.humanSearchRelevance.js
@@ -0,0 +1,140 @@
+( function ( mw, $ ) {
+   'use strict';
+
+   var config;
+
+   function sample( acceptPercentage ) {
+   var rand = mw.user.generateRandomSessionId(),
+   // take the first 52 bits of the rand value to match js
+   // integer precision
+   parsed = parseInt( rand.slice( 0, 13 ), 16 );
+   if ( acceptPercentage >= 1 ) {
+   return true;
+   }
+   return parsed / Math.pow( 2, 52 ) < acceptPercentage;
+   }
+
+   function chooseOne( options ) {
+   var rand = mw.user.generateRandomSessionId(),
+   parsed = parseInt( rand.slice( 0, 13 ), 16 ),
+   step = Math.pow( 2, 52 ) / options.length;
+   return options[ Math.floor( parsed / step ) ];
+   }
+
+   // See 
https://developer.mozilla.org/en-US/docs/Web/API/Navigator/doNotTrack
+   // Taken from https://www.npmjs.com/package/dnt-polyfill
+   if ( window.doNotTrack === '1' ||
+   navigator.doNotTrack === '1' ||
+   navigator.doNotTrack === 'yes' ||
+   navigator.msDoNotTrack === '1'
+   ) {
+   return;
+   }
+
+   // Page is not part of this test
+   if ( !mw.config.exists( 'wgWMESearchRelevancePages' ) ) {
+   return;
+   }
+
+   // The config value is coded into the page output and cached in varnish.
+   // That means any changes to sampling rates or pages chosen will take 
up to
+   // a week to propagate into the wild.
+   config = mw.config.get( 'wgWMESearchRelevancePages' );
+
+   // bad configuration
+   if ( !config.hasOwnProperty( 'sampleRate' ) || !config.hasOwnProperty( 
'queries' ) ) {
+   return;
+   }
+
+   // This page view not chosen for sampling
+   if ( !sample( config.sampleRate ) ) {
+   return;
+   }
+
+   function askQuestion() {
+   mw.loader.using( [
+   'oojs-ui-core',
+   'mediawiki.notification',
+   'ext.wikimediaEvents.humanSearchRel'
+   ] ).then( function () {
+   var notification, originalClose,
+   closed = false,
+   query = chooseOne( config.queries ),
+   question = 'wikimediaevents-humanrel-question-' 
+ chooseOne( [ 'a', 'b', 'c', 'd' ] ),
+   logEvent = function ( choice ) {
+   if ( !closed ) {
+   closed = true;
+   notification.close();
+   }
+   mw.loader.using( [ 
'schema.HumanSearchRelevance' ] ).then( function () {
+   mw.eventLog.logEvent( 
'HumanSearchRelevance', {
+   articleId: 
mw.config.get( 'wgArticleId' ),
+   query: query,
+   choice: choice,
+  

[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Push pagination decision for prefix search into SearchEngine

2017-12-19 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/399312 )

Change subject: Push pagination decision for prefix search into SearchEngine
..

Push pagination decision for prefix search into SearchEngine

Various code using the search engine shouldn't need to implement its
own methods, such as over-fetching, to determine whether more results
are available. This should be knowledge internal to search that is
exposed by a boolean. Full-text search unfortunately does the same
thing, but fixing it is delegated to some future patch.
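
The idea, sketched in Python-like form (hypothetical helper names, not
the real SearchEngine API; the actual implementation is the PHP below):

def completion_search(backend, query, limit, offset):
    # Over-fetch by one inside the search engine so callers only need a
    # boolean to know whether another page of results exists.
    hits = backend.search(query, limit=limit + 1, offset=offset)
    has_more = len(hits) > limit
    return hits[:limit], has_more

# A caller (e.g. an API module) then just checks the boolean:
#   results, has_more = completion_search(backend, 'foo', limit=10, offset=0)
#   if has_more: set_continue(offset + limit)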

Change-Id: Ica094428700637dfdedb723b03f6aeadfe12b9f4
---
M includes/api/ApiQueryPrefixSearch.php
M includes/api/SearchApi.php
M includes/search/SearchEngine.php
M includes/search/SearchSuggestionSet.php
M tests/phpunit/includes/search/SearchEnginePrefixTest.php
5 files changed, 116 insertions(+), 47 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/12/399312/1

diff --git a/includes/api/ApiQueryPrefixSearch.php 
b/includes/api/ApiQueryPrefixSearch.php
index 2fbc518..aaf81b0 100644
--- a/includes/api/ApiQueryPrefixSearch.php
+++ b/includes/api/ApiQueryPrefixSearch.php
@@ -51,7 +51,12 @@
$offset = $params['offset'];
 
$searchEngine = $this->buildSearchEngine( $params );
-   $titles = $searchEngine->extractTitles( 
$searchEngine->completionSearchWithVariants( $search ) );
+   $suggestions = $searchEngine->completionSearchWithVariants( 
$search );
+   $titles = $searchEngine->extractTitles( $suggestions );
+
+   if ( $suggestions->hasMoreResults() ) {
+   $this->setContinueEnumParameter( 'offset', $offset + 
$limit );
+   }
 
if ( $resultPageSet ) {
$resultPageSet->setRedirectMergePolicy( function ( 
array $current, array $new ) {
@@ -60,10 +65,6 @@
}
return $current;
} );
-   if ( count( $titles ) > $limit ) {
-   $this->setContinueEnumParameter( 'offset', 
$offset + $limit );
-   array_pop( $titles );
-   }
$resultPageSet->populateFromTitles( $titles );
foreach ( $titles as $index => $title ) {
$resultPageSet->setGeneratorData( $title, [ 
'index' => $index + $offset + 1 ] );
@@ -72,10 +73,6 @@
$result = $this->getResult();
$count = 0;
foreach ( $titles as $title ) {
-   if ( ++$count > $limit ) {
-   $this->setContinueEnumParameter( 
'offset', $offset + $limit );
-   break;
-   }
$vals = [
'ns' => intval( $title->getNamespace() 
),
'title' => $title->getPrefixedText(),
diff --git a/includes/api/SearchApi.php b/includes/api/SearchApi.php
index f7c6471..fb6b635 100644
--- a/includes/api/SearchApi.php
+++ b/includes/api/SearchApi.php
@@ -157,15 +157,7 @@
$searchEngine = 
MediaWikiServices::getInstance()->getSearchEngineFactory()->create( $type );
$limit = $params['limit'];
$searchEngine->setNamespaces( $params['namespace'] );
-   $offset = null;
-   if ( isset( $params['offset'] ) ) {
-   // If the API supports offset then it probably
-   // wants to fetch limit+1 so it can check if
-   // more results are available to properly set
-   // the continue param
-   $offset = $params['offset'];
-   $limit += 1;
-   }
+   $offset = isset( $params['offset'] ) ? 
$params['offset'] : null;
$searchEngine->setLimitOffset( $limit, $offset );
 
// Initialize requested search profiles.
diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php
index 3c8fe60..6876099 100644
--- a/includes/search/SearchEngine.php
+++ b/includes/search/SearchEngine.php
@@ -517,7 +517,15 @@
return SearchSuggestionSet::emptySuggestionSet(); // 
Return empty result
}
$search = $this->normalizeNamespaces( $search );
-   return $this->processCompletionResults( $search, 
$this->completionSearchBackend( $search ) );
+   // Over-fetch results so we can determine if pagination is 
possible in
+   // 

[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Dont discount file searches on commonswiki

2017-12-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/398394 )

Change subject: Dont discount file searches on commonswiki
..

Dont discount file searches on commonswiki

Change-Id: I5c1d7493ce68c5afe4d6f21c25b25112283924be
---
M wmf-config/InitialiseSettings.php
1 file changed, 3 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config 
refs/changes/94/398394/1

diff --git a/wmf-config/InitialiseSettings.php 
b/wmf-config/InitialiseSettings.php
index c355b31..81bab5d 100644
--- a/wmf-config/InitialiseSettings.php
+++ b/wmf-config/InitialiseSettings.php
@@ -18679,6 +18679,9 @@
104 => 0.9,
106 => 0.9,
],
+   'commonswiki' => [
+   6 => 1.0,
+   ],
'wikisource' => [
'author' => 1,
],

-- 
To view, visit https://gerrit.wikimedia.org/r/398394
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I5c1d7493ce68c5afe4d6f21c25b25112283924be
Gerrit-PatchSet: 1
Gerrit-Project: operations/mediawiki-config
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 



[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Silently drop unknown titles in completion search

2017-12-13 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/398192 )

Change subject: Silently drop unknown titles in completion search
..

Silently drop unknown titles in completion search

This mimics how full-text search works by silently dropping results
returned from search that no longer exist. This could be because the
search index is slightly out of sync with reality, or the search
engine could simply be broken.

Only silent from the user's perspective. We maintain a count in statsd
of the number of titles dropped. This can be monitored over time to
recognize any increases.

Bug: T115756
Change-Id: I2f29d73e258cd448a14d35a2b4902a4fb6f61c68
---
M includes/search/SearchEngine.php
M includes/search/SearchSuggestionSet.php
2 files changed, 31 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/92/398192/1

diff --git a/includes/search/SearchEngine.php b/includes/search/SearchEngine.php
index 3c8fe60..94e0d80 100644
--- a/includes/search/SearchEngine.php
+++ b/includes/search/SearchEngine.php
@@ -580,6 +580,16 @@
$lb->setCaller( __METHOD__ );
$lb->execute();
 
+   $before = $suggestions->count();
+   $suggestions = $suggestions->filter( function ( 
SearchSuggestion $sugg ) {
+   return $sugg->getSuggestedTitle()->isKnown();
+   } );
+   $after = $suggestions->count();
+   if ( $before !== $after ) {
+   MediaWikiServices::getInstance()->getStatsdDataFactory()
+   ->updateCount( 'search.completion.missing', 
$before - $after );
+   }
+
$results = $suggestions->map( function ( SearchSuggestion $sugg 
) {
return $sugg->getSuggestedTitle()->getPrefixedText();
} );
diff --git a/includes/search/SearchSuggestionSet.php 
b/includes/search/SearchSuggestionSet.php
index aced5e1..7c4b484 100644
--- a/includes/search/SearchSuggestionSet.php
+++ b/includes/search/SearchSuggestionSet.php
@@ -23,7 +23,7 @@
  * A set of search suggestions.
  * The set is always ordered by score, with the best match first.
  */
-class SearchSuggestionSet {
+class SearchSuggestionSet implements Countable {
/**
 * @var SearchSuggestion[]
 */
@@ -73,6 +73,19 @@
return array_map( $callback, $this->suggestions );
}
 
+   /**
+* Filter the suggestions array
+* @param callback $callback
+* @return self
+*/
+   public function filter( $callback ) {
+   $suggestions = array_filter( $this->suggestions, $callback );
+   if ( count( $suggestions ) === count( $this->suggestions ) ) {
+   return $this;
+   } else {
+   return new self( $suggestions );
+   }
+   }
/**
 * Add a new suggestion at the end.
 * If the score of the new suggestion is greater than the worst one,
@@ -171,6 +184,13 @@
}
 
/**
+* @return int The number of suggestions held
+*/
+   public function count() {
+   return count( $this->suggestions );
+   }
+
+   /**
 * Builds a new set of suggestion based on a title array.
 * Useful when using a backend that supports only Titles.
 *

-- 
To view, visit https://gerrit.wikimedia.org/r/398192
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I2f29d73e258cd448a14d35a2b4902a4fb6f61c68
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 



[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Remove duplicate uploads in integration tests

2017-12-13 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/398129 )

Change subject: Remove duplicate uploads in integration tests
..

Remove duplicate uploads in integration tests

The same files were being uploaded, in slightly different ways, from
two different hooks. Unify them into a single hook. I think the recent
errors on cindy were due to the ordering of the execution of these two
hooks; if one came before the other, things would fail. I'm not
entirely sure why, though; there is something odd with file uploads,
but it's not clear to me what it is ...

Change-Id: I5b780033ff61b1c06016d9b9c8840cdf0a0b9fbb
---
M tests/integration/features/support/hooks.js
1 file changed, 20 insertions(+), 34 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/29/398129/1

diff --git a/tests/integration/features/support/hooks.js 
b/tests/integration/features/support/hooks.js
index c6cf80c..fa7d0ac 100644
--- a/tests/integration/features/support/hooks.js
+++ b/tests/integration/features/support/hooks.js
@@ -280,21 +280,27 @@
}
} ) );
 
-   BeforeOnce( { tags: "@filesearch" }, Promise.coroutine( function* () {
-   // Unfortunatly the current deduplication between wikis 
requires a file
-   // be uploaded to commons before it's uploaded to any other 
wiki, or the
-   // other wiki isn't tagged.
-   yield runBatch( this, 'commons', [
-   job.upload( "DuplicatedLocally.svg", "File stored on 
commons and duplicated locally" ),
-   job.upload( "OnCommons.svg", "File stored on commons 
for test purposes" ),
-   ] );
+   BeforeOnce( { tags: "@filesearch or @setup_main or @commons" }, 
Promise.coroutine( function* () {
+   yield runBatch( this, 'commons', {
+   delete: [
+   'File:OnCommons.svg',
+   'File:DuplicatedLocally.svg',
+   ]
+   } );
+   yield runBatch( this, false, {
+   delete: [ 'File:DuplicatedLocally.svg' ]
+   } );
 
+   yield runBatch( this, 'commons', [
+   // TODO: Why is overwrite necessary here? Otherwise the 
upload is rejected
+   // with was-deleted or some such?
+   job.uploadOverwrite( 'OnCommons.svg', "File stored on 
commons for test purposes" ),
+   job.uploadOverwrite( 'DuplicatedLocally.svg', 'File 
stored on commons and duplicated locally' ),
+   ] );
+   // For duplications to track correctly commons has to be 
uploaded first. This is a bug
+   // in cirrus, but no current plans to fix.
yield runBatch( this, false, [
-   job.upload( 'No_SVG.svg', "[[Category:Red circle with 
left slash]]" ),
-   job.upload( 'Somethingelse_svg_SVG.svg', 
"[[Category:Red circle with left slash]]" ),
-   job.upload( 'Savepage-greyed.png', "Screenshot, for 
test purposes, associated with 
https://bugzilla.wikimedia.org/show_bug.cgi?id=52908 ." ),
-   job.upload( 'DuplicatedLocally.svg', "Locally stored 
file duplicated on commons" ),
-   job.delete( 'File:Frozen.svg' ),
+   job.uploadOverwrite( 'DuplicatedLocally.svg','Locally 
stored file duplicated on commons' )
] );
 
} ) );
@@ -490,6 +496,7 @@
yield runBatch(this, false, {
delete: [

'File:Linux_Distribution_Timeline_text_version.pdf',
+   'File:Frozen.svg',
]
});
yield runBatch(this, false, [
@@ -635,27 +642,6 @@
} ) );
 
BeforeOnce( { tags: "@setup_main or @commons" }, Promise.coroutine( 
function* () {
-   yield runBatch( this, 'commons', {
-   delete: [
-   'File:OnCommons.svg',
-   'File:DuplicatedLocally.svg',
-   ]
-   } );
-   yield runBatch( this, false, {
-   delete: [ 'File:DuplicatedLocally.svg' ]
-   } );
-
-   yield runBatch( this, 'commons', [
-   // TODO: Why is overwrite necessary here? Otherwise the 
upload is rejected
-   // with was-deleted or some such?
-   job.uploadOverwrite( 'OnCommons.svg', "File stored on 
commons for test purposes" ),
-   job.uploadOverwrite( 'DuplicatedLocally.svg', 'File 
stored on commons and duplicated locally' ),
-   ] );
-   // For duplications to 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Resolve redirect namespaces from source docs in fancy title ...

2017-12-13 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/398124 )

Change subject: Resolve redirect namespaces from source docs in fancy title 
results type
..

Resolve redirect namespaces from source docs in fancy title results type

Rather than figuring out an appropriate namespace for redirects, we
used the namespace of the document redirected to, which is regularly
wrong, especially for 'shorthand' redirects on wikis such as CAT:PROD
on enwiki, which lives in NS_MAIN but redirects to Category:Proposed
Deletion (and many other similar shortcuts). Dig through the redirects
stored with the document and figure out what the likely namespace of
the redirect is.
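
A rough sketch of the lookup in Python-like pseudocode (only the
'redirect' source field and the highlight markers come from the diff
below; everything else here is an assumption for illustration):

PRE_MARKER, POST_MARKER = '\ue000', '\ue001'  # placeholder highlight markers

def resolve_redirect_namespace(highlighted, source, fallback_ns):
    # Strip the highlight markers, then look for the matched redirect
    # among those stored with the document and use that entry's namespace
    # rather than the namespace of the redirect target.
    title = highlighted.replace(PRE_MARKER, '').replace(POST_MARKER, '')
    for redirect in source.get('redirect', []):
        if redirect.get('title') == title:
            return redirect.get('namespace', fallback_ns)
    return fallback_ns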

Change-Id: I8de5c9d35ed709ee100cc3ad8093e49a7a5476d3
---
M includes/CompletionSuggester.php
M includes/Search/ResultsType.php
M tests/unit/Search/ResultsTypeTest.php
3 files changed, 171 insertions(+), 17 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/24/398124/1

diff --git a/includes/CompletionSuggester.php b/includes/CompletionSuggester.php
index e553aca..bfcf045 100644
--- a/includes/CompletionSuggester.php
+++ b/includes/CompletionSuggester.php
@@ -283,9 +283,10 @@
// they'll be forgotten in client response
$score = $collector->getMinScore() !== null ? 
$collector->getMinScore() - 1 : count( $prefixResults->getResults() );
 
+   $namespaces = $this->searchContext->getNamespaces();
foreach ( $prefixResults->getResults() as $res ) {
$pageId = $this->config->makePageId( $res->getId() );
-   $title = 
FancyTitleResultsType::chooseBestTitleOrRedirect( 
$rType->transformOneElasticResult( $res ) );
+   $title = 
FancyTitleResultsType::chooseBestTitleOrRedirect( 
$rType->transformOneElasticResult( $res, $namespaces ) );
if ( $title === false ) {
continue;
}
diff --git a/includes/Search/ResultsType.php b/includes/Search/ResultsType.php
index 372ef36..39faeff 100644
--- a/includes/Search/ResultsType.php
+++ b/includes/Search/ResultsType.php
@@ -128,6 +128,10 @@
$this->matchedAnalyzer = $matchedAnalyzer;
}
 
+   public function getSourceFiltering() {
+   return [ 'namespace', 'title', 'namespace_text', 'wiki', 
'redirect' ];
+   }
+
/**
 * @param array $highlightSource
 * @return array|null
@@ -223,11 +227,12 @@
 * Transform a result from elastic into an array of Titles.
 *
 * @param \Elastica\Result $r
+* @param int[] $namespaces Prefer
 * @return \Title[] with the following keys :
 *   titleMatch => a title if the title matched
 *   redirectMatches => an array of redirect matches, one per matched 
redirect
 */
-   public function transformOneElasticResult( \Elastica\Result $r ) {
+   public function transformOneElasticResult( \Elastica\Result $r, array 
$namespaces = [] ) {
$title = TitleHelper::makeTitle( $r );
$highlights = $r->getHighlights();
$resultForTitle = [];
@@ -251,21 +256,16 @@

$highlights["redirect.title.{$this->matchedAnalyzer}_asciifolding"] );
}
if ( count( $redirectHighlights ) !== 0 ) {
-   foreach ( $redirectHighlights as $redirectTitle ) {
-   // The match was against a redirect so we 
should replace the $title with one that
-   // represents the redirect.
-   // The first step is to strip the actual 
highlighting from the title.
-   $redirectTitle = str_replace( [ 
Searcher::HIGHLIGHT_PRE, Searcher::HIGHLIGHT_POST ],
-   '', $redirectTitle );
-
-   // Instead of getting the redirect's real 
namespace we're going to just use the namespace
-   // of the title.  This is not great but OK 
given that we can't find cross namespace
-   // redirects properly any way.
-   // TODO: ask the highlighter to return the 
namespace for this kind of matches
-   // this would perhaps help to partially fix 
T115756
-   $redirectTitle =
-   TitleHelper::makeRedirectTitle( $r, 
$redirectTitle, $r->namespace );
-   $resultForTitle['redirectMatches'][] = 
$redirectTitle;
+   $source = $r->getSource();
+   $docRedirects = [];
+   if ( isset( $source['redirect'] ) ) {
+   foreach ( 

[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Enable Cirrus MLR for 4 more wikis

2017-12-13 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/398093 )

Change subject: Enable Cirrus MLR for 4 more wikis
..

Enable Cirrus MLR for 4 more wikis

Change-Id: I26051ee0b2356c2dc7a9f3959a596f7aaa32028f
---
M tests/cirrusTest.php
M wmf-config/InitialiseSettings.php
2 files changed, 5 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config 
refs/changes/93/398093/1

diff --git a/tests/cirrusTest.php b/tests/cirrusTest.php
index 2c8edb3..8a57bda 100644
--- a/tests/cirrusTest.php
+++ b/tests/cirrusTest.php
@@ -241,7 +241,7 @@
'zhwiki' => [ 'zhwiki', 'wiki',
[
'wmgCirrusSearchSimilarityProfile' => 
'wmf_defaults',
-   'wmgCirrusSearchRescoreProfile' => 
'classic',
+   'wmgCirrusSearchRescoreProfile' => 
'mlr-1024rs',

'wmgCirrusSearchFullTextQueryBuilderProfile' => 'perfield_builder',
'wmgCirrusSearchMaxPhraseTokens' => 10,
],
diff --git a/wmf-config/InitialiseSettings.php 
b/wmf-config/InitialiseSettings.php
index c61f97b..c355b31 100644
--- a/wmf-config/InitialiseSettings.php
+++ b/wmf-config/InitialiseSettings.php
@@ -18297,7 +18297,7 @@
'default' => 'wsum_inclinks',
'commonswiki' => 'classic_noboostlinks',
'enwiki' => 'mlr-1024rs',
-   // 'arwiki' => 'mlr-1024rs',
+   'arwiki' => 'mlr-1024rs',
'dewiki' => 'mlr-1024rs',
'fawiki' => 'mlr-1024rs',
'fiwiki' => 'mlr-1024rs',
@@ -18306,14 +18306,14 @@
'itwiki' => 'mlr-1024rs',
'jawiki' => 'mlr-1024rs',
'kowiki' => 'mlr-1024rs',
-   // 'nlwiki' => 'mlr-1024rs',
+   'nlwiki' => 'mlr-1024rs',
'nowiki' => 'mlr-1024rs',
-   // 'plwiki' => 'mlr-1024rs',
+   'plwiki' => 'mlr-1024rs',
'ptwiki' => 'mlr-1024rs',
'ruwiki' => 'mlr-1024rs',
'svwiki' => 'mlr-1024rs',
'viwiki' => 'mlr-1024rs',
-   // 'zhwiki' => 'mlr-1024rs',
+   'zhwiki' => 'mlr-1024rs',
// Uses the lang tag, list of spaceless languages
// (see 
https://www.mediawiki.org/wiki/User:TJones_(WMF)/Notes/Spaceless_Writing_Systems_and_Wiki-Projects)
"bo" => "classic",

-- 
To view, visit https://gerrit.wikimedia.org/r/398093
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I26051ee0b2356c2dc7a9f3959a596f7aaa32028f
Gerrit-PatchSet: 1
Gerrit-Project: operations/mediawiki-config
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 



[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Turn off a couple search ranking models that arnt ready

2017-12-12 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397988 )

Change subject: Turn off a couple search ranking models that arnt ready
..

Turn off a couple search ranking models that arnt ready

It seems a few of the wikis' ranking models haven't been uploaded to
the search clusters yet. Turn those off, enabling only the wikis with
live ranking models.

Change-Id: I28a269c940290518467f99e6003c6eb13774b633
---
M wmf-config/InitialiseSettings.php
1 file changed, 4 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config 
refs/changes/88/397988/1

diff --git a/wmf-config/InitialiseSettings.php 
b/wmf-config/InitialiseSettings.php
index ccf3f56..652fbb9 100644
--- a/wmf-config/InitialiseSettings.php
+++ b/wmf-config/InitialiseSettings.php
@@ -18253,7 +18253,7 @@
'default' => 'wsum_inclinks',
'commonswiki' => 'classic_noboostlinks',
'enwiki' => 'mlr-1024rs',
-   'arwiki' => 'mlr-1024rs',
+   //'arwiki' => 'mlr-1024rs',
'dewiki' => 'mlr-1024rs',
'fawiki' => 'mlr-1024rs',
'fiwiki' => 'mlr-1024rs',
@@ -18262,14 +18262,14 @@
'itwiki' => 'mlr-1024rs',
'jawiki' => 'mlr-1024rs',
'kowiki' => 'mlr-1024rs',
-   'nlwiki' => 'mlr-1024rs',
+   //'nlwiki' => 'mlr-1024rs',
'nowiki' => 'mlr-1024rs',
-   'plwiki' => 'mlr-1024rs',
+   //'plwiki' => 'mlr-1024rs',
'ptwiki' => 'mlr-1024rs',
'ruwiki' => 'mlr-1024rs',
'svwiki' => 'mlr-1024rs',
'viwiki' => 'mlr-1024rs',
-   'zhwiki' => 'mlr-1024rs',
+   //'zhwiki' => 'mlr-1024rs',
// Uses the lang tag, list of spaceless languages
// (see 
https://www.mediawiki.org/wiki/User:TJones_(WMF)/Notes/Spaceless_Writing_Systems_and_Wiki-Projects)
"bo" => "classic",

-- 
To view, visit https://gerrit.wikimedia.org/r/397988
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I28a269c940290518467f99e6003c6eb13774b633
Gerrit-PatchSet: 1
Gerrit-Project: operations/mediawiki-config
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 



[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.12]: Turn on second mlr test for hewiki

2017-12-12 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397985 )

Change subject: Turn on second mlr test for hewiki
..

Turn on second mlr test for hewiki

Re-uses the same sampling rates as last time around on hewiki. Depends
on I681e1e724 being deployed first to set up the appropriate backend
triggers.

Bug: T182616
Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39
(cherry picked from commit 4d6d5e905de945647fc24795d172c787bbb33128)
---
M modules/all/ext.wikimediaEvents.searchSatisfaction.js
1 file changed, 7 insertions(+), 4 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/85/397985/1

diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js 
b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
index 7517df7..de78f11 100644
--- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js
+++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
@@ -114,7 +114,9 @@
function initialize( session ) {
 
var sessionId = session.get( 'sessionId' ),
-   validBuckets = [],
+   validBuckets = mw.config.get( 'wgDBname' ) === 
'hewiki' ?
+   [ 'control', 'ltr-1024', 'ltr-1024-i' ] 
:
+   [],
sampleSize = ( function () {
var dbName = mw.config.get( 'wgDBname' 
),
// Provides a place to handle 
wiki-specific sampling,
@@ -135,9 +137,6 @@
test: 350,
subTest: null
},
-   // .0005 works out to 
~2.7k sessions per week.
-   // .15 increases that 
to 810k per week. Giving
-   // 160k sessions per 
bucket per week.
enwiki: {
test: 2000,
subTest: null
@@ -189,6 +188,10 @@
zhwiki: {
test: 100,
subTest: null
+   },
+   hewiki: {
+   test: 0.8112,
+   subTest: 0.8767
}
};
if ( subTests[ dbName ] ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/397985
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikimediaEvents
Gerrit-Branch: wmf/1.31.0-wmf.12
Gerrit-Owner: EBernhardson 



[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.11]: Turn on second mlr test for hewiki

2017-12-12 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397984 )

Change subject: Turn on second mlr test for hewiki
..

Turn on second mlr test for hewiki

Re-uses the same sampling rates as last time around on hewiki. Depends
on I681e1e724 being deployed first to set up the appropriate backend
triggers.

Bug: T182616
Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39
---
M modules/all/ext.wikimediaEvents.searchSatisfaction.js
1 file changed, 7 insertions(+), 4 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/84/397984/1

diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js 
b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
index 7517df7..de78f11 100644
--- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js
+++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
@@ -114,7 +114,9 @@
function initialize( session ) {
 
var sessionId = session.get( 'sessionId' ),
-   validBuckets = [],
+   validBuckets = mw.config.get( 'wgDBname' ) === 
'hewiki' ?
+   [ 'control', 'ltr-1024', 'ltr-1024-i' ] 
:
+   [],
sampleSize = ( function () {
var dbName = mw.config.get( 'wgDBname' 
),
// Provides a place to handle 
wiki-specific sampling,
@@ -135,9 +137,6 @@
test: 350,
subTest: null
},
-   // .0005 works out to 
~2.7k sessions per week.
-   // .15 increases that 
to 810k per week. Giving
-   // 160k sessions per 
bucket per week.
enwiki: {
test: 2000,
subTest: null
@@ -189,6 +188,10 @@
zhwiki: {
test: 100,
subTest: null
+   },
+   hewiki: {
+   test: 0.8112,
+   subTest: 0.8767
}
};
if ( subTests[ dbName ] ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/397984
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikimediaEvents
Gerrit-Branch: wmf/1.31.0-wmf.11
Gerrit-Owner: EBernhardson 



[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[master]: Turn on second mlr test for hewiki

2017-12-12 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397973 )

Change subject: Turn on second mlr test for hewiki
..

Turn on second mlr test for hewiki

Resets enwiki to its default sampling of 1:2000. Re-uses the same
sampling rates as last time around on hewiki. Depends on
I681e1e724 being deployed first to set up the appropriate backend
triggers.

Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39
---
M modules/all/ext.wikimediaEvents.searchSatisfaction.js
1 file changed, 8 insertions(+), 7 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/73/397973/1

diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js 
b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
index a48355d..de78f11 100644
--- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js
+++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
@@ -114,8 +114,8 @@
function initialize( session ) {
 
var sessionId = session.get( 'sessionId' ),
-   validBuckets = mw.config.get( 'wgDBname' ) === 
'enwiki' ?
-   [ 'control', 'dbn20', 'dbn20-i', 
'dbn35', 'dbn35-i' ] :
+   validBuckets = mw.config.get( 'wgDBname' ) === 
'hewiki' ?
+   [ 'control', 'ltr-1024', 'ltr-1024-i' ] 
:
[],
sampleSize = ( function () {
var dbName = mw.config.get( 'wgDBname' 
),
@@ -137,12 +137,9 @@
test: 350,
subTest: null
},
-   // .0005 works out to 
~2.7k sessions per week.
-   // .15 increases that 
to 810k per week. Giving
-   // 160k sessions per 
bucket per week.
enwiki: {
-   test: 0.15,
-   subTest: 0.996
+   test: 2000,
+   subTest: null
},
enwiktionary: {
test: 40,
@@ -191,6 +188,10 @@
zhwiki: {
test: 100,
subTest: null
+   },
+   hewiki: {
+   test: 0.8112,
+   subTest: 0.8767
}
};
if ( subTests[ dbName ] ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/397973
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7bc0ddd86da966a8f49b27a4ddf9aa93074a6b39
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikimediaEvents
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 



[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Turn on MLR for most wikis with >1% of search traffic

2017-12-12 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397970 )

Change subject: Turn on MLR for most wikis with >1% of search traffic
..

Turn on MLR for most wikis with >1% of search traffic

Based on the results of our A/B test showing improved search relevance
for machine-learned ranking, turn it on for these wikis. Also update to
models trained in the last week.

Change-Id: I657d80bd1fbda61d5fd84fcdd0e29383a1d857cd
---
M tests/cirrusTest.php
M wmf-config/InitialiseSettings.php
2 files changed, 36 insertions(+), 19 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config 
refs/changes/70/397970/1

diff --git a/tests/cirrusTest.php b/tests/cirrusTest.php
index 5bccd87..8a57bda 100644
--- a/tests/cirrusTest.php
+++ b/tests/cirrusTest.php
@@ -241,7 +241,7 @@
'zhwiki' => [ 'zhwiki', 'wiki',
[
'wmgCirrusSearchSimilarityProfile' => 
'wmf_defaults',
-   'wmgCirrusSearchRescoreProfile' => 
'wsum_inclinks',
+   'wmgCirrusSearchRescoreProfile' => 
'mlr-1024rs',

'wmgCirrusSearchFullTextQueryBuilderProfile' => 'perfield_builder',
'wmgCirrusSearchMaxPhraseTokens' => 10,
],
diff --git a/wmf-config/InitialiseSettings.php 
b/wmf-config/InitialiseSettings.php
index 9ab29bf..ccf3f56 100644
--- a/wmf-config/InitialiseSettings.php
+++ b/wmf-config/InitialiseSettings.php
@@ -18253,6 +18253,23 @@
'default' => 'wsum_inclinks',
'commonswiki' => 'classic_noboostlinks',
'enwiki' => 'mlr-1024rs',
+   'arwiki' => 'mlr-1024rs',
+   'dewiki' => 'mlr-1024rs',
+   'fawiki' => 'mlr-1024rs',
+   'fiwiki' => 'mlr-1024rs',
+   'frwiki' => 'mlr-1024rs',
+   'idwiki' => 'mlr-1024rs',
+   'itwiki' => 'mlr-1024rs',
+   'jawiki' => 'mlr-1024rs',
+   'kowiki' => 'mlr-1024rs',
+   'nlwiki' => 'mlr-1024rs',
+   'nowiki' => 'mlr-1024rs',
+   'plwiki' => 'mlr-1024rs',
+   'ptwiki' => 'mlr-1024rs',
+   'ruwiki' => 'mlr-1024rs',
+   'svwiki' => 'mlr-1024rs',
+   'viwiki' => 'mlr-1024rs',
+   'zhwiki' => 'mlr-1024rs',
// Uses the lang tag, list of spaceless languages
// (see 
https://www.mediawiki.org/wiki/User:TJones_(WMF)/Notes/Spaceless_Writing_Systems_and_Wiki-Projects)
"bo" => "classic",
@@ -19595,25 +19612,25 @@
 
 'wmgCirrusSearchMLRModel' => [
'default' => false,
-   'enwiki' => '20171101_enwiki_v1',
-   'arwiki' => '20170905_arwiki_v1',
-   'fawiki' => '20170905_fawiki_v1',
-   'jawiki' => '20170905_jawiki_v1',
-   'svwiki' => '20170905_svwiki_v1',
-   'frwiki' => '20170905_frwiki_v1',
-   'itwiki' => '20170905_itwiki_v1',
-   'ptwiki' => '20170905_ptwiki_v1',
-   'ruwiki' => '20170905_ruwiki_v1',
-   'dewiki' => '20170905_dewiki_v1',
-   'fiwiki' => '20170908_fiwiki_v1',
+   'enwiki' => '20171130_enwiki_v1',
+   'arwiki' => '20171130_arwiki_v1',
+   'fawiki' => '20171130_fawiki_v1',
+   'jawiki' => '20171130_jawiki_v1',
+   'svwiki' => '20171130_svwiki_v1',
+   'frwiki' => '20171130_frwiki_v1',
+   'itwiki' => '20171130_itwiki_v1',
+   'ptwiki' => '20171130_ptwiki_v1',
+   'ruwiki' => '20171130_ruwiki_v1',
+   'dewiki' => '20171130_dewiki_v1',
+   'fiwiki' => '20171130_fiwiki_v1',
'hewiki' => '20171130_hewiki_v1',
-   'idwiki' => '20170908_idwiki_v1',
-   'kowiki' => '20170908_kowiki_v1',
-   'nlwiki' => '20170908_nlwiki_v1',
-   'nowiki' => '20170908_nowiki_v1',
-   'plwiki' => '20170908_plwiki_v1',
-   'viwiki' => '20170908_viwiki_v1',
-   'zhwiki' => '20170908_zhwiki_v1',
+   'idwiki' => '20171130_idwiki_v1',
+   'kowiki' => '20171130_kowiki_v1',
+   'nlwiki' => '20171130_nlwiki_v1',
+   'nowiki' => '20171130_nowiki_v1',
+   'plwiki' => '20171130_plwiki_v1',
+   'viwiki' => '20171130_viwiki_v1',
+   'zhwiki' => '20171130_zhwiki_v1',
 ],
 
 'wmgWMESearchRelevancePages' => [

-- 
To view, visit https://gerrit.wikimedia.org/r/397970
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I657d80bd1fbda61d5fd84fcdd0e29383a1d857cd
Gerrit-PatchSet: 1
Gerrit-Project: operations/mediawiki-config
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 



[MediaWiki-commits] [Gerrit] wikimedia...relevanceForge[master]: Add sanity checks for more languages

2017-12-12 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397968 )

Change subject: Add sanity checks for more languages
..

Add sanity checks for more languages

I'm not completely sure these are very good. I didn't really try to
figure out what any of the pages were; I just chose random pages from
the bottom 2/3 of the top 100 most popular pages (by page views) over
the last month, then used sometimes a redirect and sometimes the
primary title as the query string.

We can probably iterate on these if we get complaints about search
results that users expect at the top but that are not there.
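
Rough sketch of what each entry means: for a given query string, the
listed page title(s) should appear near the top of full-text search
results. This sketch uses the public MediaWiki search API; the real
sanityCheck.py may differ in how it queries and scores.

import requests

def check(api, query, expected_titles, top_n=5):
    # Run a full-text search against the wiki's API and confirm every
    # expected title shows up in the top results.
    resp = requests.get(api, params={
        'action': 'query',
        'list': 'search',
        'srsearch': query,
        'format': 'json',
    })
    top = [hit['title'] for hit in resp.json()['query']['search'][:top_n]]
    return all(title in top for title in expected_titles)

# e.g. check('https://de.wikipedia.org/w/api.php', 'FIFA WM 2018',
#            ['Fußball-Weltmeisterschaft 2018'])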

Change-Id: I39b47b47f03d02907f5d291050465c14ae596287
---
M sanityCheck.py
A sanityCheck/arwiki.json
A sanityCheck/dewiki.json
A sanityCheck/fawiki.json
A sanityCheck/fiwiki.json
A sanityCheck/frwiki.json
A sanityCheck/hewiki.json
A sanityCheck/idwiki.json
A sanityCheck/itwiki.json
A sanityCheck/jawiki.json
A sanityCheck/kowiki.json
A sanityCheck/nlwiki.json
A sanityCheck/nowiki.json
A sanityCheck/plwiki.json
A sanityCheck/ptwiki.json
A sanityCheck/ruwiki.json
A sanityCheck/svwiki.json
A sanityCheck/viwiki.json
A sanityCheck/zhwiki.json
19 files changed, 256 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge 
refs/changes/68/397968/1

diff --git a/sanityCheck.py b/sanityCheck.py
old mode 100644
new mode 100755
index 19aae72..341f476
--- a/sanityCheck.py
+++ b/sanityCheck.py
@@ -1,3 +1,4 @@
+#!/usr/bin/env python
 from __future__ import print_function
 import argparse
 import json
@@ -37,6 +38,9 @@
 print('')
 else:
 print("PASSED\n")
+
+print("OVERALL: %s" % ("PASSED" if ok else "FAILED"))
+
 return ok
 
 
diff --git a/sanityCheck/arwiki.json b/sanityCheck/arwiki.json
new file mode 100644
index 000..1f27498
--- /dev/null
+++ b/sanityCheck/arwiki.json
@@ -0,0 +1,11 @@
+{
+"api": "https://ar.wikipedia.org/w/api.php;,
+"queries": {
+"ابن_الهيثم": [
+"ابن الهيثم"
+],
+"مؤسسة محمد الخامس": [
+"مؤسسة محمد الخامس للتضامن"
+]
+}
+}
diff --git a/sanityCheck/dewiki.json b/sanityCheck/dewiki.json
new file mode 100644
index 000..4480dc3
--- /dev/null
+++ b/sanityCheck/dewiki.json
@@ -0,0 +1,14 @@
+{
+"api": "https://de.wikipedia.org/w/api.php;,
+"queries": {
+"Bundestagswahl 2017": [
+"Bundestagswahl 2017"
+],
+"FIFA WM 2018": [
+"Fußball-Weltmeisterschaft 2018"
+],
+"Rechtswesen Österreichs": [
+"Österreich"
+]
+}
+}
diff --git a/sanityCheck/fawiki.json b/sanityCheck/fawiki.json
new file mode 100644
index 000..7397c81
--- /dev/null
+++ b/sanityCheck/fawiki.json
@@ -0,0 +1,14 @@
+{
+"api": "https://fa.wikipedia.org/w/api.php;,
+"queries": {
+"سید محمدحسین شهریار": [
+"سید محمدحسین شهریار"
+],
+"شادمهر عقيلي": [
+"شادمهر عقیلی"
+],
+"ساكر": [
+"آمیزش جنسی دهانی"
+]
+}
+}
diff --git a/sanityCheck/fiwiki.json b/sanityCheck/fiwiki.json
new file mode 100644
index 000..2be778b
--- /dev/null
+++ b/sanityCheck/fiwiki.json
@@ -0,0 +1,14 @@
+{
+"api": "https://fi.wikipedia.org/w/api.php;,
+"queries": {
+"Happamuus": [
+"Happamuus"
+],
+"Veljessota": [
+"Suomen sisällissota"
+],
+"Mannerheim": [
+"Carl Gustaf Emil Mannerheim"
+]
+}
+}
diff --git a/sanityCheck/frwiki.json b/sanityCheck/frwiki.json
new file mode 100644
index 000..4528194
--- /dev/null
+++ b/sanityCheck/frwiki.json
@@ -0,0 +1,14 @@
+{
+"api": "https://fr.wikipedia.org/w/api.php;,
+"queries": {
+"U Arena": [
+"U Arena"
+],
+"DALS": [
+"Danse avec les stars"
+],
+"Tableau périodique des éléments": [
+"Tableau périodique des éléments"
+]
+}
+}
diff --git a/sanityCheck/hewiki.json b/sanityCheck/hewiki.json
new file mode 100644
index 000..c3a014d
--- /dev/null
+++ b/sanityCheck/hewiki.json
@@ -0,0 +1,14 @@
+{
+"api": "https://he.wikipedia.org/w/api.php;,
+"queries": {
+"שלמה ארצי": [
+"שלמה ארצי"
+],
+"רחל בלובשטיין סלע": [
+"רחל המשוררת"
+],
+"Jerusalem": [
+"ירושלים"
+]
+}
+}
diff --git a/sanityCheck/idwiki.json b/sanityCheck/idwiki.json
new file mode 100644
index 000..5e9380e
--- /dev/null
+++ b/sanityCheck/idwiki.json
@@ -0,0 +1,14 @@
+{
+"api": "https://id.wikipedia.org/w/api.php;,
+"queries": {
+"Pemerintahan daerah di Indonesia": [
+"Pemerintahan daerah di Indonesia"
+],
+"Undang-undang dasar": [
+"Konstitusi"
+],
+"Srivijaya": [
+"Sriwijaya"
+ 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: [WIP] Additional integration test features for cindy

2017-12-11 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397735 )

Change subject: [WIP] Additional integration test features for cindy
..

[WIP] Additional integration test features for cindy

WIP because the full integration with the barry script to automate
running hasn't been finished, so there is probably still more to do.

* Cindy didn't like that we were using npm dependencies that
 weren't explicitly called out. Perhaps due to a difference in
 npm version or something. Regardless, it's good practice to call
 out our dependencies explicitly.
* Add a --tag=... option to request only specific tags are run.
 This can, for example, specify an or of tags with '@foo or @bar'.
 All tests except those marked frozen can be selected with "not
 @frozen". See cucumber-tag-expressions lib for more details.
* Add a configuration file specifically for mwv in labs that generates
 appropriate urls. Triggered by setting MWV_LABS_HOSTNAME environment
 variable to the hostname of the machine. ex:
 MWV_LABS_HOSTNAME=cirrus-browser-bot
* Increase max parallelism of chrome to 8. This will still be limited
 by the top level maxInstances to 1 by default (necessary when mixing
 frozen index tests with the others).
* Add a grunt cli parameter to set parallelism from the command line.

Change-Id: I4d8837b2c56b018f682429756a2ba6efd106969d
---
M Gruntfile.js
M package.json
M tests/integration/config/wdio.conf.js
A tests/integration/config/wdio.conf.mwvlabs.js
A tests/integration/log/.gitkeep
5 files changed, 50 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/35/397735/1

diff --git a/Gruntfile.js b/Gruntfile.js
index b3841f7..16637ca 100644
--- a/Gruntfile.js
+++ b/Gruntfile.js
@@ -19,6 +19,8 @@
 
if ( process.env.JENKINS_HOME ) {
WebdriverIOconfigFile = 
'./tests/integration/config/wdio.conf.jenkins.js';
+   } else if ( process.env.MWV_LABS_HOSTNAME ) {
+   WebdriverIOconfigFile = 
'./tests/integration/config/wdio.conf.mwvlabs.js';
} else {
WebdriverIOconfigFile = 
'./tests/integration/config/wdio.conf.js';
}
@@ -59,6 +61,15 @@
webdriver: {
test: {
configFile: WebdriverIOconfigFile,
+   cucumberOpts: {
+   tagExpression: ( () => {
+   return grunt.option( 'tags' );
+   } )()
+   },
+   maxInstances: ( () => {
+   let max = grunt.option( 'maxInstances' 
);
+   return max ? parseInt( max, 10 ) : 1;
+   } )(),
spec: ( () => {
let spec = grunt.option( 'spec' );
if ( !spec ) {
diff --git a/package.json b/package.json
index cf4a6b3..b6855f5 100644
--- a/package.json
+++ b/package.json
@@ -8,8 +8,10 @@
 "selenium": "killall -0 chromedriver 2>/dev/null || chromedriver 
--url-base=/wd/hub --port= & grunt webdriver:test; killall chromedriver"
   },
   "devDependencies": {
+"bluebird": "3.5.1",
 "chai": "^4.1.2",
 "cucumber": "^3.0.1",
+"deepmerge": "2.0.1",
 "grunt": "1.0.1",
 "grunt-banana-checker": "0.5.0",
 "grunt-contrib-jshint": "1.0.0",
@@ -21,8 +23,11 @@
 "stylelint-config-wikimedia": "0.4.1",
 "wdio-cucumber-framework": "^1.0.1",
 "webdriverio": "^4.8.0",
+"wdio-spec-reporter": "1.2.0",
+"wdio-junit-reporter": "1.1.3",
 "restify": "^6.3.4",
 "request": "^2.83.0",
-"request-promise-native": "^1.0.5"
+"request-promise-native": "^1.0.5",
+"semlog": "0.6.10"
   }
 }
diff --git a/tests/integration/config/wdio.conf.js 
b/tests/integration/config/wdio.conf.js
index 82601f7..89bb9a3 100644
--- a/tests/integration/config/wdio.conf.js
+++ b/tests/integration/config/wdio.conf.js
@@ -124,7 +124,7 @@
// maxInstances can get overwritten per capability. So if you 
have an in-house Selenium
// grid with only 5 firefox instances available you can make 
sure that not more than
// 5 instances get started at a time.
-   maxInstances: 1,
+   maxInstances: 8,
//
browserName: 'chrome',
// Since Chrome v57 
https://bugs.chromium.org/p/chromedriver/issues/detail?id=1625
diff --git a/tests/integration/config/wdio.conf.mwvlabs.js 
b/tests/integration/config/wdio.conf.mwvlabs.js
new file mode 100644
index 000..7b3a7a7
--- /dev/null
+++ b/tests/integration/config/wdio.conf.mwvlabs.js
@@ -0,0 +1,32 @@
+/*jshint esversion: 6,  node:true */
+
+/* eslint 

[MediaWiki-commits] [Gerrit] operations/mediawiki-config[master]: Setup MLR AB test for hewiki

2017-12-11 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397582 )

Change subject: Setup MLR AB test for hewiki
..

Setup MLR AB test for hewiki

Last time around we trained the hewiki model on an analysis chain that
was different than the one used when running the test so the results
were invalid. Re-run the test with a new model trained against the new
analysis chain.

This reverts commit 9bfa5214657476b410399ecb90a62a6b8afd3196.

Change-Id: I681e1e724201337d73867e518fe806cbf5f89636
---
M wmf-config/CirrusSearch-common.php
M wmf-config/InitialiseSettings.php
2 files changed, 13 insertions(+), 60 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/mediawiki-config 
refs/changes/82/397582/1

diff --git a/wmf-config/CirrusSearch-common.php 
b/wmf-config/CirrusSearch-common.php
index 5ead601..35ae235 100644
--- a/wmf-config/CirrusSearch-common.php
+++ b/wmf-config/CirrusSearch-common.php
@@ -296,40 +296,7 @@
],
],
];
-
-   $wgCirrusSearchRescoreProfiles['dbn20'] = 
$wgCirrusSearchRescoreProfiles['mlr-1024rs'];
-   $wgCirrusSearchRescoreProfiles['dbn20']['rescore'][2]['model'] = 
'dbn20_enwiki_v1';
-
-   $wgCirrusSearchRescoreProfiles['dbn35'] = 
$wgCirrusSearchRescoreProfiles['mlr-1024rs'];
-   $wgCirrusSearchRescoreProfiles['dbn35']['rescore'][2]['model'] = 
'dbn35_enwiki_v1';
-
 }
-
-# needed for recall A/B test (T177502)
-$wgCirrusSearchFullTextQueryBuilderProfiles['rec_3t_80_66'] = 
$wgCirrusSearchFullTextQueryBuilderProfiles['perfield_builder'];
-$wgCirrusSearchFullTextQueryBuilderProfiles['rec_3t_80_66']['settings']['filter']
 = [
-   'type' => 'default',
-   'settings' => [
-   'all' => [
-   'minimum_should_match' => '3<80%'
-   ],
-   'all.plain' => [
-   'minimum_should_match' => '3<66%'
-   ],
-   ]
-];
-$wgCirrusSearchFullTextQueryBuilderProfiles['rec_4t_80_66'] = 
$wgCirrusSearchFullTextQueryBuilderProfiles['perfield_builder'];
-$wgCirrusSearchFullTextQueryBuilderProfiles['rec_4t_80_66']['settings']['filter']
 = [
-   'type' => 'default',
-   'settings' => [
-   'all' => [
-   'minimum_should_match' => '4<80%'
-   ],
-   'all.plain' => [
-   'minimum_should_match' => '4<66%'
-   ],
-   ]
-];
 
 $wgCirrusSearchUserTesting = $wmgCirrusSearchUserTesting;
 
diff --git a/wmf-config/InitialiseSettings.php 
b/wmf-config/InitialiseSettings.php
index 8da9e21..9ab29bf 100644
--- a/wmf-config/InitialiseSettings.php
+++ b/wmf-config/InitialiseSettings.php
@@ -18623,44 +18623,30 @@
 
 'wmgCirrusSearchUserTesting' => [
'default' => [],
-   // DBN sizing AB test
-   'enwiki' => [
-   'dbn_sizing' => [
+   'hewiki' => [
+   'ltr' => [
+   'globals' => [],
'buckets' => [
'control' => [
'trigger' => 'control',
],
-   'dbn20' => [
-   'trigger' => 'dbn20',
+   'ltr-1024' => [
+   'trigger' => 'ltr-1024',
'globals' => [
-   'wgCirrusSearchRescoreProfile' 
=> 'dbn20',
+   'wgCirrusSearchRescoreProfile' 
=> 'mlr-1024rs',
]
],
-   'dbn20-i' => [
-   'trigger' => 'dbn20-i',
+   'ltr-1024-i' => [
+   'trigger' => 'ltr-1024-i',
'globals' => [

'wgCirrusSearchInterleaveConfig' => [
-   
'CirrusSearchRescoreProfile' => 'dbn20',
+   
'CirrusSearchRescoreProfile' => 'mlr-1024rs'
],
-   ]
-   ],
-   'dbn35' => [
-   'trigger' => 'dbn35',
-   'globals' => [
-   'wgCirrusSearchRescoreProfile' 
=> 'dbn35',
-   ]
-   ],
-   'dbn35-i' => [
-   'trigger' => 'dbn35-i',
-   'globals' => [
-  

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[wmf/1.31.0-wmf.11]: Simple hack to override mlr model from query string

2017-12-11 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/397569 )

Change subject: Simple hack to override mlr model from query string
..

Simple hack to override mlr model from query string

This is a rather naive attempt to allow us to do a sanity
check on an MLR model before we roll it out. With the new
cirrusMLRModel query parameter we can upload a model to
elasticsearch and try a couple queries by specifying the
model name before we ship a config change to enable the
model for everyone.
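
For example (hypothetical URL; substitute whichever model name was
uploaded, e.g. 20171130_hewiki_v1 from the config change earlier):

  https://he.wikipedia.org/w/index.php?search=something&fulltext=1&cirrusMLRModel=20171130_hewiki_v1

Rescoring for that one request then uses the named model instead of the
configured one.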

Change-Id: Id258c4ad295eab8cac543f14e9135a1bdb87533a
(cherry picked from commit 8a51602c1e92f32e736a6f101986fd79f9fa0bad)
---
M includes/Search/RescoreBuilders.php
1 file changed, 11 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/69/397569/1

diff --git a/includes/Search/RescoreBuilders.php 
b/includes/Search/RescoreBuilders.php
index 76a4fac..3119a01 100644
--- a/includes/Search/RescoreBuilders.php
+++ b/includes/Search/RescoreBuilders.php
@@ -127,6 +127,17 @@
 * @return AbstractQuery
 */
private function buildLtrQuery( $model ) {
+   // This is a bit fragile, and makes the bold assumption
+   // only a single level of rescore will be used. This is
+   // strictly for debugging/testing before shipping a model
+   // live so shouldn't be a big deal.
+   $override = \RequestContext::getMain()
+   ->getRequest()
+   ->getVal( 'cirrusMLRModel' );
+   if ( $override ) {
+   $model = $override;
+   }
+
$bool = new \Elastica\Query\BoolQuery();
// the ltr query can return negative scores, which mucks with 
elasticsearch
// sorting as that will put these results below documents set 
to 0. Fix

-- 
To view, visit https://gerrit.wikimedia.org/r/397569
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id258c4ad295eab8cac543f14e9135a1bdb87533a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: wmf/1.31.0-wmf.11
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port remaining dump_* features to nodejs

2017-12-08 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/396553 )

Change subject: Port remaining dump_* features to nodejs
..

Port remaining dump_* features to nodejs

I changed these up a bit to make them simpler. Basically each one
now has a specific Then( ... ) test for it. The previous tests
were just string includes against the stringified JSON; this seems
better at verifying we have somewhat reasonable output.

Change-Id: I4d2a0789ae17e880304cc42baeb58735e9b2c66b
---
A tests/integration/features/dump_config.feature
A tests/integration/features/dump_mapping.feature
A tests/integration/features/dump_query.feature
A tests/integration/features/dump_settings.feature
M tests/integration/features/step_definitions/page_steps.js
5 files changed, 120 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/53/396553/1

diff --git a/tests/integration/features/dump_config.feature 
b/tests/integration/features/dump_config.feature
new file mode 100644
index 000..fc85cf1
--- /dev/null
+++ b/tests/integration/features/dump_config.feature
@@ -0,0 +1,8 @@
+@clean @dump_config @phantomjs
+Feature: You can dump CirrusSearch's configuration
+  Scenario: You can dump CirrusSearch's configuration
+When I dump the cirrus config
+Then the config dump contains CirrusSearchPhraseSuggestMaxErrors
+ And the config dump contains CirrusSearchNamespaceWeights
+ And the config dump text does not contain Password
+ And the config dump text does not contain password
diff --git a/tests/integration/features/dump_mapping.feature 
b/tests/integration/features/dump_mapping.feature
new file mode 100644
index 000..a0f1776
--- /dev/null
+++ b/tests/integration/features/dump_mapping.feature
@@ -0,0 +1,5 @@
+@clean @dump_mapping @phantomjs
+Feature: You can dump the mapping CirrusSearch set on Elasticsearch's indexes
+  Scenario: You can dump the mapping CirrusSearch set on Elasticsearch's 
indexes
+When I dump the cirrus mapping
+Then A valid mapping dump is produced
diff --git a/tests/integration/features/dump_query.feature 
b/tests/integration/features/dump_query.feature
new file mode 100644
index 000..49aa588
--- /dev/null
+++ b/tests/integration/features/dump_query.feature
@@ -0,0 +1,5 @@
+@clean @dump_query @phantomjs
+Feature: Can dump the query syntax
+  Scenario: Can dump the query syntax
+Given I request a query dump for main page
+ Then A valid query dump for main page is produced
diff --git a/tests/integration/features/dump_settings.feature 
b/tests/integration/features/dump_settings.feature
new file mode 100644
index 000..d0d9c43
--- /dev/null
+++ b/tests/integration/features/dump_settings.feature
@@ -0,0 +1,5 @@
+@clean @dump_settings @phantomjs
+Feature: You can dump the settings CirrusSearch set on Elasticsearch's indexes
+  Scenario: You can dump the settings CirrusSearch set on Elasticsearch's 
indexes
+When I dump the cirrus settings
+Then A valid settings dump is produced
diff --git a/tests/integration/features/step_definitions/page_steps.js 
b/tests/integration/features/step_definitions/page_steps.js
index bb3eec6..dd1a56d 100644
--- a/tests/integration/features/step_definitions/page_steps.js
+++ b/tests/integration/features/step_definitions/page_steps.js
@@ -496,4 +496,101 @@
} );
} );
} );
+
+   When( /^I dump the cirrus config$/, Promise.coroutine( function* () {
+   let client = yield this.onWiki();
+   try {
+   let response = yield client.request( {
+   action: 'cirrus-config-dump',
+   } );
+   this.setApiResponse( response );
+   } catch ( err ) {
+   this.setApiError( err );
+   }
+   } ) );
+
+   Then( /^the config dump contains (.+)$/, function ( key ) {
+   return withApi( this, () => {
+   expect( this.apiResponse ).to.have.any.keys( key );
+   } );
+   } );
+
+   Then( /^the config dump text does not contain (.+)$/, function ( key ) {
+   return withApi( this, () => {
+   let text = JSON.stringify( this.apiResponse );
+   expect( text ).to.not.include( key );
+   } );
+   } );
+
+   When( /^I dump the cirrus mapping$/, Promise.coroutine( function* () {
+   let client = yield this.onWiki();
+   try {
+   let response = yield client.request( {
+   action: 'cirrus-mapping-dump',
+   } );
+   this.setApiResponse( response );
+   } catch ( err ) {
+   this.setApiError( err );
+   }
+   } ) );
+

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port update_weight_api.feature to nodejs

2017-12-08 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/396552 )

Change subject: Port update_weight_api.feature to nodejs
..

Port update_weight_api.feature to nodejs

Replace 'within ...' calls with a new step that waits for incoming_links
to be appropriately updated. This makes our expectations explicit
instead of waiting on a secondary thing that hopefully updates based on
the first. Additionally to save some time (this test is pretty slow)
skip waiting on most of the edits and just do the final wait for
incoming_links.

I'm not really sure these tests even need to be performing searches;
it looks like they are mostly checking that incoming links are updated
and counted appropriately.

While working up this patch I noticed multiple steps now all use a
'wait for elasticsearch document to have some value' type step, so the
implementation was moved into stepHelpers and the uses adjusted to have
similar wording and share the same implementation with different check
functions.

Change-Id: I20b13236e2139026d542de5e376392d6c5a67e47
---
M tests/integration/features/commons.feature
M tests/integration/features/step_definitions/page_step_helpers.js
M tests/integration/features/step_definitions/page_steps.js
M tests/integration/features/update_redirect_api.feature
A tests/integration/features/update_weight_api.feature
5 files changed, 142 insertions(+), 43 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/52/396552/1

diff --git a/tests/integration/features/commons.feature 
b/tests/integration/features/commons.feature
index b36867b..764f74d 100644
--- a/tests/integration/features/commons.feature
+++ b/tests/integration/features/commons.feature
@@ -5,7 +5,7 @@
 Then File:OnCommons.svg is the first api search result
 
   Scenario: A file that exists on commons and the local wiki returns the local 
result
-When within 20 seconds File:DuplicatedLocally.svg has cirrustestwiki as 
local_sites_with_dupe
+When I wait for File:DuplicatedLocally.svg on commons to include 
cirrustestwiki in local_sites_with_dupe
 Then I api search in namespace 6 for duplicated
 Then File:DuplicatedLocally.svg is the first api search result
 And Locally stored file *duplicated* on commons is the highlighted snippet 
of the first api search result
diff --git a/tests/integration/features/step_definitions/page_step_helpers.js 
b/tests/integration/features/step_definitions/page_step_helpers.js
index b5f39cc..0979e90 100644
--- a/tests/integration/features/step_definitions/page_step_helpers.js
+++ b/tests/integration/features/step_definitions/page_step_helpers.js
@@ -171,6 +171,29 @@
} ).call( this );
}
 
+   waitForDocument( title, check ) {
+   return Promise.coroutine( function* () {
+   let timeoutMs = 2;
+   let start = new Date();
+   let lastError;
+   while ( true ) {
+   let doc = yield this.getCirrusIndexedContent( 
title );
+   if ( doc.cirrusdoc && doc.cirrusdoc.length > 0 
) {
+   try {
+   check( doc.cirrusdoc[0] );
+   break;
+   } catch ( err ) {
+   lastError = err;
+   }
+   }
+   if ( new Date() - start >= timeoutMs ) {
+   throw lastError || new Error( `Timeout 
out waiting for ${title}` );
+   }
+   yield this.waitForMs( 200 );
+   }
+   } ).call( this );
+   }
+
waitForMs( ms ) {
return new Promise( ( resolve ) => setTimeout( resolve, ms ) );
}
diff --git a/tests/integration/features/step_definitions/page_steps.js 
b/tests/integration/features/step_definitions/page_steps.js
index f3d7a83..bb3eec6 100644
--- a/tests/integration/features/step_definitions/page_steps.js
+++ b/tests/integration/features/step_definitions/page_steps.js
@@ -386,27 +386,6 @@
return stepHelpers.uploadFile( title, fileName, description );
} );
 
-   Then(/^within (\d+) seconds (.+) has (.+) as local_sites_with_dupe$/, 
function (seconds, title, value) {
-   return Promise.coroutine( function* () {
-   let stepHelpers = this.stepHelpers.onWiki( 'commons' );
-   let time = new Date();
-   let found = false;
-   main: do {
-   let page = yield 
stepHelpers.getCirrusIndexedContent( title );
-   

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: port update_redirect_loop.feature to nodejs

2017-12-08 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/396472 )

Change subject: port update_redirect_loop.feature to nodejs
..

port update_redirect_loop.feature to nodejs

Almost a straight copy from ruby, just changed the steps to not use the
new code that waits for updates to hit elasticsearch. This is necessary
because we don't ever index redirect loops.

Change-Id: I8463ef4e4aef9e272554bcf8bc316c4f3df486a8
---
A tests/integration/features/update_redirect_loop.feature
1 file changed, 12 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/72/396472/1

diff --git a/tests/integration/features/update_redirect_loop.feature 
b/tests/integration/features/update_redirect_loop.feature
new file mode 100644
index 000..893646d
--- /dev/null
+++ b/tests/integration/features/update_redirect_loop.feature
@@ -0,0 +1,12 @@
+@clean @phantomjs @update @redirect_loop
+Feature: Search backend updates containing redirect loops
+  Scenario: Pages that redirect to themself don't throw errors
+Then I don't wait for a page named IAmABad RedirectSelf%{epoch} to exist 
with contents #REDIRECT [[IAmABad RedirectSelf%{epoch}]]
+
+  # The actual creation of the pages will fail if redirect loop handling fails
+  Scenario: Pages that form a redirect chain don't throw errors
+When I don't wait for a page named IAmABad RedirectChain%{epoch} A to 
exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} B]]
+  And I don't wait for a page named IAmABad RedirectChain%{epoch} B to 
exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} C]]
+  And I don't wait for a page named IAmABad RedirectChain%{epoch} C to 
exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} D]]
+Then I don't wait for a page named IAmABad RedirectChain%{epoch} D to 
exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} A]]
+  And I don't wait for a page named IAmABad RedirectChain%{epoch} B to 
exist with contents #REDIRECT [[IAmABad RedirectChain%{epoch} D]]

-- 
To view, visit https://gerrit.wikimedia.org/r/396472
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8463ef4e4aef9e272554bcf8bc316c4f3df486a8
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port update_redirect_api.feature to nodejs

2017-12-08 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/396471 )

Change subject: Port update_redirect_api.feature to nodejs
..

Port update_redirect_api.feature to nodejs

Mostly a straight copy from ruby. While most of the 'within ...' steps
could be removed, unfortunately the one that waits for a redirect being
converted back into a regular page couldn't be: it has to wait for the
previously redirected-to page to be updated. Rather than indirectly
waiting on search results, added a new step that directly waits for the
redirect to be removed from the previously redirected-to document.

Change-Id: I1febfdad9eac1f8e3577b545274378dd50ae5de0
---
M tests/integration/features/step_definitions/page_step_helpers.js
M tests/integration/features/step_definitions/page_steps.js
A tests/integration/features/update_redirect_api.feature
3 files changed, 44 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/71/396471/1

diff --git a/tests/integration/features/step_definitions/page_step_helpers.js 
b/tests/integration/features/step_definitions/page_step_helpers.js
index 7281da6..b5f39cc 100644
--- a/tests/integration/features/step_definitions/page_step_helpers.js
+++ b/tests/integration/features/step_definitions/page_step_helpers.js
@@ -296,7 +296,8 @@
// Is the requested page and the returned document dont 
have the same
// title that means we have a redirect. In that case 
the revision id
// wont match, but the backend api ensures the redirect 
is now contained
-   // within the document.
+   // within the document. Unfortunately if the page was 
just edited to
+   // now be a redirect anymore this is wrong ...
if ( isOk && revisionId && content[0].source.title === 
page.title ) {
isOk = parseInt( content[0].source.version, 10 
) === revisionId;
}
diff --git a/tests/integration/features/step_definitions/page_steps.js 
b/tests/integration/features/step_definitions/page_steps.js
index 9db07e3..522c8ac 100644
--- a/tests/integration/features/step_definitions/page_steps.js
+++ b/tests/integration/features/step_definitions/page_steps.js
@@ -484,4 +484,24 @@
let found = snippets.reduce( ( a, b ) => a || b.indexOf( within 
) > -1, false );
expect( found ).to.equal( !should_not );
} );
+
+   Then( /^I wait for (.+) to not be included in the redirects of (.+)$/, 
function ( source, redirect ) {
+   return Promise.coroutine( function* () {
+   let timeoutMs = 2;
+   let start = new Date();
+   while (true) {
+   let doc = yield 
this.stepHelpers.getCirrusIndexedContent( redirect );
+   if ( doc.cirrusdoc.length > 0 ) {
+   let exists = 
doc.cirrusdoc[0].source.redirect.reduce( ( a, b ) => a || b.title === source, 
false );
+   if ( !exists ) {
+   break;
+   }
+   }
+   if (new Date() - start >= timeoutMs) {
+   throw new Error( `Timed out waiting for 
${source} to not exist in document of ${redirect}` );
+   }
+   yield this.stepHelpers.waitForMs( 200 );
+   }
+   } ).call( this );
+   } );
 });
diff --git a/tests/integration/features/update_redirect_api.feature 
b/tests/integration/features/update_redirect_api.feature
new file mode 100644
index 000..50171ea
--- /dev/null
+++ b/tests/integration/features/update_redirect_api.feature
@@ -0,0 +1,22 @@
+@clean @api @redirect @update
+Feature: Updating a page from or to a redirect
+  Scenario: Turning a page into a redirect removes it from the search index
+Given a page named RedirectTarget exists
+ When a page named ToBeRedirect%{epoch} exists
+  And I api search for ToBeRedirect%{epoch}
+ Then ToBeRedirect%{epoch} is the first api search result
+ When a page named ToBeRedirect%{epoch} exists with contents #REDIRECT 
[[RedirectTarget]]
+  And I api search for ToBeRedirect%{epoch}
+ Then RedirectTarget is the first api search result
+  And ToBeRedirect%{epoch} is not in the api search results
+
+  Scenario: Turning a page from a redirect to a regular page puts it in the 
index
+Given a page named RedirectTarget exists
+ When a page named StartsAsRedirect%{epoch} exists with contents #REDIRECT 
[[RedirectTarget]]
+  And I api search for StartsAsRedirect%{epoch}
+   

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: port update_non_existent_api.feature to nodejs

2017-12-08 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/396470 )

Change subject: port update_non_existent_api.feature to nodejs
..

port update_non_existent_api.feature to nodejs

Change-Id: I04b7f5a75c05aa8c3ff59d081ffe085c49d0a601
---
M tests/integration/features/step_definitions/page_steps.js
A tests/integration/features/update_non_existent_api.feature
2 files changed, 76 insertions(+), 24 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/70/396470/1

diff --git a/tests/integration/features/step_definitions/page_steps.js 
b/tests/integration/features/step_definitions/page_steps.js
index a22f2f8..9db07e3 100644
--- a/tests/integration/features/step_definitions/page_steps.js
+++ b/tests/integration/features/step_definitions/page_steps.js
@@ -43,28 +43,32 @@
 // TODO: We might need to share this epoch between wdio runner processes?
 const epoch = +new Date();
 const searchVars = {};
-defineParameterType( {
-   // Quite annoyingly this isn't a regexp to match in the step name, 
rather
-   // it is a string literal to match a capture group of the step 
definition.
-   // So basically this only replaces epochs in parameters defined as (.+).
-   regexp: /.+/,
-   transformer: (s) => {
-   if ( s === undefined ) {
-   return s;
-   }
-   if ( s === 'the empty string' ) {
-   return '';
-   }
-   s = s.replace( /%{epoch}/g, epoch );
-   s = s.replace( /%ideographic_whitspace%/g, "\u3000" );
+// These expressions are string matches against capture groups in steps. Yes, 
.+)
+// is intentional. Cucumber's matching of capture groups is broken so (?:foo 
(.+))
+// has to be matched as .+). That broken matching also means (?:foo (.+) bar)
+// would have to be matched as '.+) bar' but we don't bother.
+let expressions = [ '.+', '.+?', '.+)' ];
+for ( let expression of expressions ) {
+   defineParameterType( {
+   regexp: expression,
+   transformer: (s) => {
+   if ( s === undefined ) {
+   return s;
+   }
+   if ( s === 'the empty string' ) {
+   return '';
+   }
+   s = s.replace( /%{epoch}/g, epoch );
+   s = s.replace( /%ideographic_whitspace%/g, "\u3000" );
 
-   // Replace %{\u}% with the appropriate unicode code point
-   s = s.replace(/%\{\\u([\dA-Fa-f]{4,6})\}%/g, ( match, codepoint 
) => JSON.parse( `"\\u${codepoint}"` ) );
-   s = Object.keys(searchVars).reduce( ( str, pattern ) => 
str.replace( pattern, searchVars[pattern] ), s );
-   return s.replace( /%{exact:([^}]*)}/g, '$1' );
-   },
-   name: 'replacements',
-} );
+   // Replace %{\u}% with the appropriate unicode code 
point
+   s = s.replace(/%\{\\u([\dA-Fa-f]{4,6})\}%/g, ( match, 
codepoint ) => JSON.parse( `"\\u${codepoint}"` ) );
+   s = Object.keys(searchVars).reduce( ( str, pattern ) => 
str.replace( pattern, searchVars[pattern] ), s );
+   return s.replace( /%{exact:([^}]*)}/g, '$1' );
+   },
+   typeName: 'replacements_' + expression ,
+   } );
+}
 
 defineSupportCode( function( {Given, When, Then} ) {
 
@@ -244,19 +248,19 @@
return stepHelpers.searchFor( search, options );
} );
 
-   Then( /there are no errors reported by the api/, function () {
+   Then( /^there are no errors reported by the api$/, function () {
return withApi( this, () => {
expect( this.apiError ).to.equal(undefined);
} );
} );
 
-   Then( /there is an api search result/, function () {
+   Then( /^there is an api search result$/, function () {
return withApi( this, () => {
expect( this.apiResponse.query.search 
).to.not.have.lengthOf( 0 );
} );
} );
 
-   Then( /there are no api search results/, function () {
+   Then( /^there are no api search results$/, function () {
return withApi( this, () => {
expect( this.apiResponse.query.search 
).to.have.lengthOf( 0 );
} );
@@ -474,4 +478,10 @@
Then ( /^the page text contains (.+)$/, function( text ) {
expect(browser.getSource()).to.contains(text);
} );
+
+   Then( /^there are( no)? api search results with (.+) in the data$/, 
function ( should_not, within ) {
+   let snippets = this.apiResponse.query.search.map( ( result ) => 
result.snippet );
+   let found = snippets.reduce( ( a, b ) => a 

[MediaWiki-commits] [Gerrit] search/xgboost[master]: Specialize single-node training

2017-12-07 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/396098 )

Change subject: Specialize single-node training
..

Specialize single-node training

XGBoost has a faster training method, tree_method -> hist, which is
currently not implemented for distributed training. We actually train
quite a few models on a single node (but with many models being trained
in parallel) so it would be nice to be able to utilize this where
possible.

This is perhaps not implemented in the most optimal way if we were going
to upstream the patch, but upstreaming is unlikely as upstream does not
support training multiple models in parallel (we do through a custom
hack). Rather than refactoring existing code this mostly adds new
functions for specialized single-node training so that pulling in
upstream changes will be as pain free as possible.
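
As a rough caller-side sketch (not part of this patch; it assumes the
usual xgboost4j-spark entry point XGBoost.trainWithRDD and an
RDD[MLLabeledPoint] named trainingData, with illustrative parameter
values), requesting a single worker now routes through the new
single-node path and permits the faster tree method:

  import ml.dmlc.xgboost4j.scala.spark.XGBoost

  val params: Map[String, Any] = Map(
    "objective" -> "rank:ndcg",
    "eta" -> 0.3,
    "max_depth" -> 6,
    // hist is only accepted here because nWorkers == 1 trains locally
    "tree_method" -> "hist"
  )
  val model = XGBoost.trainWithRDD(trainingData, params, round = 100, nWorkers = 1)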

Change-Id: I2760127edd2c3c4ad26abd23e621059ac9609950
---
M 
jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
M 
jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
2 files changed, 87 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/xgboost 
refs/changes/98/396098/1

diff --git 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index ea18ff2..bc052e2 100644
--- 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -105,6 +105,57 @@
 }
   }
 
+  private[spark] def buildLocalBoosters(
+  data: RDD[XGBLabeledPoint],
+  params: Map[String, Any],
+  round: Int,
+  obj: ObjectiveTrait,
+  eval: EvalTrait,
+  useExternalMemory: Boolean,
+  missing: Float): RDD[Array[Byte]] = {
+val partitionedData = if (data.getNumPartitions != 1) {
+  logger.info(s"repartitioning training set to 1 partitions")
+  data.coalesce(1)
+} else {
+  data
+}
+val partitionedBaseMargin = partitionedData.map(_.baseMargin)
+val appName = partitionedData.context.appName
+partitionedData.zipPartitions(partitionedBaseMargin) { (labeledPoints, 
baseMargins) =>
+  if (labeledPoints.isEmpty) {
+throw new XGBoostError(
+  s"detected an empty partition in the training data, partition ID:" +
+s" ${TaskContext.getPartitionId()}")
+  }
+  val cacheFileName = if (useExternalMemory) {
+s"$appName-${TaskContext.get().stageId()}-" +
+  s"dtrain_cache-${TaskContext.getPartitionId()}"
+  } else {
+null
+  }
+
+  // Yes it's odd to access this but not do anything. We are ensuring the 
lazily
+  // initialized resource monitor is setup before we enter training.
+  monitor
+
+  val watches = Watches(params,
+fromDenseToSparseLabeledPoints(labeledPoints, missing),
+fromBaseMarginsToArray(baseMargins), cacheFileName)
+  try {
+val numEarlyStoppingRounds = params.get("numEarlyStoppingRounds")
+  .map(_.toString.toInt).getOrElse(0)
+val booster = SXGBoost.train(watches.train, params, round,
+  watches = watches.toMap, obj = obj, eval = eval,
+  earlyStoppingRound = numEarlyStoppingRounds)
+val bytes = booster.toByteArray
+booster.dispose
+Iterator(bytes)
+  } finally {
+watches.delete()
+  }
+}
+  }
+
   private[spark] def buildDistributedBoosters(
   data: RDD[XGBLabeledPoint],
   params: Map[String, Any],
@@ -302,8 +353,40 @@
 val xgbTrainingData = trainingData.map { case MLLabeledPoint(label, 
features) =>
   features.asXGB.copy(label = label.toFloat)
 }
-trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval,
-  useExternalMemory, missing)
+if (nWorkers == 1) {
+  trainLocal(xgbTrainingData, params, round, obj, eval, useExternalMemory, 
missing)
+} else {
+  trainDistributed(xgbTrainingData, params, round, nWorkers, obj, eval,
+useExternalMemory, missing)
+}
+  }
+
+  @throws(classOf[XGBoostError])
+  private[spark] def trainLocal(
+  trainingData: RDD[XGBLabeledPoint],
+  params: Map[String, Any],
+  round: Int,
+  obj: ObjectiveTrait = null,
+  eval: EvalTrait = null,
+  useExternalMemory: Boolean = false,
+  missing: Float = Float.NaN): XGBoostModel = {
+if (obj != null) {
+  require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not 
defined," +
+" you have to specify the objective type as classification or 
regression with a" +
+" customized objective function")
+}
+val overriddenParams = overrideParamsAccordingToTaskCPUs(params, 
trainingData.sparkContext)
+

[MediaWiki-commits] [Gerrit] operations...cdh[master]: Enable more accurate smaps based rss checking

2017-12-06 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395923 )

Change subject: Enable more accurate smaps based rss checking
..

Enable more accurate smaps based rss checking

Training xgboost models in the hadoop cluster is running into some
issues where yarn regularly kills containers, but only some of them.
Based on review of yarn's code it appears this is because we are using
the default RSS calculation which is documented as less accurate.
Specifically, it includes pages that the kernel is free to evict, and
double- (triple-, etc.) counts read-only memory shared by many processes.

A custom implementation of that algorithm was injected into a background
task while training mlr models, and it showed that the more accurate
algorithm reports constant memory usage. Enabling this will allow us to
stop over-allocating memory to account for this discrepancy, and require
250GB less memory for the 9 hour training process.
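
For reference, the smaps-based computation yarn switches to (quoted in
the property description added below) works out to roughly the
following; this is an illustrative Scala sketch, not yarn's
implementation:

  // One record per mapping parsed from /proc/<pid>/smaps, values in kB.
  case class SmapsEntry(permission: String, pss: Long, sharedDirty: Long,
                        privateClean: Long, privateDirty: Long)

  def smapsRss(entries: Seq[SmapsEntry]): Long =
    entries
      // read-only shared mappings (r--s, r-xs) are excluded entirely
      .filterNot(e => e.permission == "r--s" || e.permission == "r-xs")
      // Min(Shared_Dirty, Pss) + Private_Clean + Private_Dirty per mapping
      .map(e => math.min(e.sharedDirty, e.pss) + e.privateClean + e.privateDirty)
      .sum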

Bug: T182276
Change-Id: I0f8223db4d4abc26eb9d04ff106b7e49602f504e
---
M templates/hadoop/yarn-site.xml.erb
1 file changed, 6 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet/cdh 
refs/changes/23/395923/1

diff --git a/templates/hadoop/yarn-site.xml.erb 
b/templates/hadoop/yarn-site.xml.erb
index 5657577..913a028 100644
--- a/templates/hadoop/yarn-site.xml.erb
+++ b/templates/hadoop/yarn-site.xml.erb
@@ -169,6 +169,12 @@
 org.apache.spark.network.yarn.YarnShuffleService
   
 
+  
+RSS usage of a process computed via /proc/pid/stat is not 
very accurate as it includes shared pages of a process. /proc/pid/smaps 
provides useful information like Private_Dirty, Private_Clean, Shared_Dirty, 
Shared_Clean which can be used for computing more accurate RSS. When this flag 
is enabled, RSS is computed as Min(Shared_Dirty, Pss) + Private_Clean + 
Private_Dirty. It excludes read-only shared mappings in RSS 
computation.
+
yarn.nodemanager.container-monitor.procfs-tree.smaps-based-rss.enabled
+true
+  
+
 <% if @datanode_mounts -%>
   
 List of directories to store localized files in.

-- 
To view, visit https://gerrit.wikimedia.org/r/395923
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0f8223db4d4abc26eb9d04ff106b7e49602f504e
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet/cdh
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/xgboost[master]: [DNM] more debugging of RssFile explosion

2017-12-06 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395915 )

Change subject: [DNM] more debugging of RssFile explosion
..

[DNM] more debugging of RssFile explosion

Change-Id: Ia5cdb36f31dd4512b048de74f2b2d769d2fa7acf
---
M 
jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
M 
jvm-packages/xgboost4j-spark/src/test/scala/ml/dmlc/xgboost4j/scala/spark/XGBoostGeneralSuite.scala
2 files changed, 117 insertions(+), 8 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/xgboost 
refs/changes/15/395915/1

diff --git 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
index 22ed87c..e8f55b4 100644
--- 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
+++ 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
@@ -16,13 +16,15 @@
 
 package ml.dmlc.xgboost4j.scala.spark
 
+import java.io.{BufferedReader, File, FileReader}
 import java.lang.management.ManagementFactory
 import java.util.concurrent.atomic.AtomicBoolean
+import java.util.regex.Pattern
 
 import org.apache.commons.logging.LogFactory
 
 import scala.concurrent.duration.Duration
-import scala.io.Source
+import scala.io.{BufferedSource, Source}
 
 class ResourceMonitorThread(reportEvery: Duration) extends Thread {
   super.setDaemon(true)
@@ -37,18 +39,120 @@
   return
 }
 while (keepChecking.get()) {
-  report()
+  report().foreach(logger.info)
   Thread.sleep(reportEvery.toMillis)
 }
   }
 
   def stopChecking(): Unit = keepChecking.set(false)
 
-  private def report(): Unit = {
+  def report(): Seq[String] = {
 val rss = Source.fromFile(s"/proc/$pid/status").getLines()
   .filter(_.startsWith("Rss"))
   .mkString(", ")
-logger.info(rss)
-logger.info(memoryBean.getHeapMemoryUsage)
+Seq(rss,
+  memoryBean.getHeapMemoryUsage.toString,
+  // 5 largest contributors to RSSFile
+  collectSMapInfo().take(5).map({ info =>
+s"${info.mem()}: ${info.name}"
+  }).mkString("\n")
+).filter(_.length > 0)
   }
+
+  private val ADDRESS_PATTERN = 
raw"^([a-f0-9]*)-([a-f0-9]*)(\s)*([rxwps\-]*).*".r
+  private val MEM_INFO_PATTERN = raw"^([A-Z].*):[\s ]*(.*).*".r
+  private val KB = "kB"
+  private val READ_ONLY_WITH_SHARED_PERMISSION = "r--s"
+  private val READ_EXECUTE_WITH_SHARED_PERMISSION = "r-xs"
+
+  val file = new File(s"/proc/$pid/smaps")
+
+  private def collectSMapInfo(): List[ProcessSmapMemoryInfo] = {
+if (!file.exists()) {
+  return Nil
+}
+val lines = Source.fromFile(s"/proc/$pid/smaps").getLines()
+lines.map(_.trim).foldLeft(List[ProcessSmapMemoryInfo]()) { (acc, line) =>
+  line match {
+case ADDRESS_PATTERN(startAddr, endAddr, space, permission) =>
+  new ProcessSmapMemoryInfo(line, permission) :: acc
+case MEM_INFO_PATTERN(key, value) =>
+  acc match {
+case memInfo :: xs => memInfo.setMemInfo(key.trim, 
value.replace(KB, "").trim) :: xs
+case Nil => Nil
+  }
+case _ => acc
+  }
+}.filter { memInfo =>
+  
!memInfo.permission.trim.equalsIgnoreCase(READ_ONLY_WITH_SHARED_PERMISSION) &&
+  
!memInfo.permission.trim.equalsIgnoreCase(READ_EXECUTE_WITH_SHARED_PERMISSION)
+}.sortBy(_.mem()).reverse
+  }
+}
+
+class ProcessSmapMemoryInfo(val name: String, val permission: String) {
+  var size: Int = 0
+  var rss: Int = 0
+  var pss: Int = 0
+  var sharedClean: Int = 0
+  var sharedDirty: Int = 0
+  var privateClean: Int = 0
+  var privateDirty: Int = 0
+  var referenced: Int = 0
+  var regionName: String = ""
+
+  def setMemInfo(key: String, value: String): ProcessSmapMemoryInfo = {
+try {
+  val intval = value.trim.toInt
+  MemInfo(key) match {
+case MemInfo.SIZE => size = intval
+case MemInfo.RSS => rss = intval
+case MemInfo.PSS => pss = intval
+case MemInfo.SHARED_CLEAN => sharedClean = intval
+case MemInfo.SHARED_DIRTY => sharedDirty = intval
+case MemInfo.PRIVATE_CLEAN => privateClean = intval
+case MemInfo.PRIVATE_DIRTY => privateDirty = intval
+case MemInfo.REFERENCED => referenced = intval
+case _ => None
+  }
+} catch {
+  case e: NumberFormatException => Nil
+}
+this
+  }
+
+  def mem(): Int = {
+// Math.min(sharedDirty, pss) + privateDirty + privateClean
+rss
+  }
+}
+
+object MemInfo {
+  sealed abstract class MemInfoVal(val name: String) {
+override def toString: String = name
+  }
+
+  def apply(name: String): MemInfoVal = {
+values.collectFirst { case i if i.name.equalsIgnoreCase(name.trim) => i 

[MediaWiki-commits] [Gerrit] search/xgboost[master]: [DNM] Test xgboost4j-spark with fast hist tree maker

2017-12-06 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395856 )

Change subject: [DNM] Test xgboost4j-spark with fast hist tree maker
..

[DNM] Test xgboost4j-spark with fast hist tree maker

Change-Id: If8cda596953182a62485df9d8f370f7e6d800b51
---
M 
jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
1 file changed, 4 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/xgboost 
refs/changes/56/395856/1

diff --git 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index ea18ff2..a0024a7 100644
--- 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -316,10 +316,10 @@
   eval: EvalTrait = null,
   useExternalMemory: Boolean = false,
   missing: Float = Float.NaN): XGBoostModel = {
-if (params.contains("tree_method")) {
-  require(params("tree_method") != "hist", "xgboost4j-spark does not 
support fast histogram" +
-  " for now")
-}
+// if (params.contains("tree_method")) {
+//   require(params("tree_method") != "hist", "xgboost4j-spark does not 
support fast" +
+//   " histogram for now")
+// }
 require(nWorkers > 0, "you must specify more than 0 workers")
 if (obj != null) {
   require(params.get("obj_type").isDefined, "parameter \"obj_type\" is not 
defined," +

-- 
To view, visit https://gerrit.wikimedia.org/r/395856
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If8cda596953182a62485df9d8f370f7e6d800b51
Gerrit-PatchSet: 1
Gerrit-Project: search/xgboost
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Simple hack to override mlr model from query string

2017-12-06 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395826 )

Change subject: Simple hack to override mlr model from query string
..

Simple hack to override mlr model from query string

This is a rather naive attempt to allow us to do a sanity
check on an MLR model before we roll it out. With the new
cirrusMLRModel query parameter we can upload a model to
elasticsearch and try a couple queries by specifying the
model name before we ship a config change to enable the
model for everyone.

Change-Id: Id258c4ad295eab8cac543f14e9135a1bdb87533a
---
M includes/Search/RescoreBuilders.php
1 file changed, 11 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/26/395826/1

diff --git a/includes/Search/RescoreBuilders.php 
b/includes/Search/RescoreBuilders.php
index 76a4fac..3119a01 100644
--- a/includes/Search/RescoreBuilders.php
+++ b/includes/Search/RescoreBuilders.php
@@ -127,6 +127,17 @@
 * @return AbstractQuery
 */
private function buildLtrQuery( $model ) {
+   // This is a bit fragile, and makes the bold assumption
+   // only a single level of rescore will be used. This is
+   // strictly for debugging/testing before shipping a model
+   // live so shouldn't be a big deal.
+   $override = \RequestContext::getMain()
+   ->getRequest()
+   ->getVal( 'cirrusMLRModel' );
+   if ( $override ) {
+   $model = $override;
+   }
+
$bool = new \Elastica\Query\BoolQuery();
// the ltr query can return negative scores, which mucks with 
elasticsearch
// sorting as that will put these results below documents set 
to 0. Fix

-- 
To view, visit https://gerrit.wikimedia.org/r/395826
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Id258c4ad295eab8cac543f14e9135a1bdb87533a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: AtLeastNDistinct returns wrong value on merge

2017-12-05 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395617 )

Change subject: AtLeastNDistinct returns wrong value on merge
..

AtLeastNDistinct returns wrong value on merge

The merge operation wasn't correctly taking buf2 into
account. Add some tests to verify how this should work
and update merge to correctly integrate buf2 into buf1.
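
A toy model of the intended merge semantics (plain Scala sets, not the
actual Spark UDAF buffers): if either side has already reached the limit
the merged buffer has too, otherwise the distinct values are unioned and
the limit re-checked.

  case class Buf(reached: Boolean, seen: Set[String])

  def merge(limit: Int, a: Buf, b: Buf): Buf =
    if (a.reached || b.reached) a.copy(reached = true)
    else {
      val union = a.seen ++ b.seen            // integrate buf2 into buf1
      Buf(reached = union.size >= limit, seen = union)
    }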

Change-Id: Ib37b60e4f4ae2354d1d1181460e1b511c0c13cc2
---
M jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala
A jvm/src/test/scala/org/wikimedia/search/mjolnir/AtLeastNDistinctSuite.scala
2 files changed, 72 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/17/395617/1

diff --git 
a/jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala 
b/jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala
index 1b49e25..74d7467 100644
--- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala
+++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/AtLeastNDistinct.scala
@@ -57,7 +57,9 @@
   }
 
   override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
-if (!buffer1.getAs[Boolean](buffer_reached)) {
+if (buffer2.getAs[Boolean](buffer_reached)) {
+  buffer1(buffer_reached) = true
+} else if (!buffer1.getAs[Boolean](buffer_reached)) {
   getSet(buffer1) ++= getSet(buffer2)
   checkReached(buffer1)
 }
diff --git 
a/jvm/src/test/scala/org/wikimedia/search/mjolnir/AtLeastNDistinctSuite.scala 
b/jvm/src/test/scala/org/wikimedia/search/mjolnir/AtLeastNDistinctSuite.scala
new file mode 100644
index 000..962dbb9
--- /dev/null
+++ 
b/jvm/src/test/scala/org/wikimedia/search/mjolnir/AtLeastNDistinctSuite.scala
@@ -0,0 +1,69 @@
+package org.wikimedia.search.mjolnir
+
+import org.apache.spark.sql.expressions.MutableAggregationBuffer
+import org.scalatest.FunSuite
+
+class DummyBuffer(init: Array[Any]) extends MutableAggregationBuffer {
+  val values: Array[Any] = init
+  def update(i: Int, value: Any): Unit = values(i) = value
+  def get(i: Int) = values(i)
+  def length: Int = init.length
+  def copy() = new DummyBuffer(init.clone())
+}
+
+class AtLeastNDistinctSuite extends FunSuite {
+  import org.scalatest.prop.TableDrivenPropertyChecks._
+
+  test("basic operation") {
+val udaf = new AtLeastNDistinct
+val buf = new DummyBuffer(new Array(udaf.bufferSchema.length))
+val row = new DummyBuffer(new Array(udaf.inputSchema.length))
+
+forAll(Table(
+  ("limit", "expected", "values"),
+  (1, false, Seq()),
+  (1, true, Seq("zomg")),
+  (1, true, Seq("hi", "hi", "hi")),
+  (2, false, Seq("hi", "hi", "hi")),
+  (2, true, Seq("hi", "there", "hi"))
+)) { (limit: Int, expect: Boolean, values: Seq[String]) =>
+  udaf.initialize(buf)
+  row(udaf.input_limit) = limit
+  values.foreach { value =>
+row(udaf.input_value) = value
+udaf.update(buf, row)
+  }
+  assert(udaf.evaluate(buf) == expect)
+}
+  }
+
+  test("merge") {
+val udaf = new AtLeastNDistinct
+val buf1 = new DummyBuffer(new Array(udaf.bufferSchema.length))
+val buf2 = new DummyBuffer(new Array(udaf.bufferSchema.length))
+val row = new DummyBuffer(new Array(udaf.inputSchema.length))
+
+forAll(Table(
+  ("limit", "expected", "a", "b"),
+  (1, true, Set("a"), Set[String]()),
+  (1, true, Set[String](), Set("a")),
+  (2, false, Set("a"), Set("a")),
+  (2, true, Set("a"), Set("b"))
+)) { (limit: Int, expect: Boolean, a: Set[String], b: Set[String]) =>
+  udaf.initialize(buf1)
+  udaf.initialize(buf2)
+  row(udaf.input_limit) = limit
+  a.foreach { value =>
+row(udaf.input_value) = value
+udaf.update(buf1, row)
+  }
+  b.foreach { value =>
+row(udaf.input_value) = value
+udaf.update(buf2, row)
+  }
+
+  udaf.merge(buf1, buf2)
+  assert(udaf.evaluate(buf1) == expect)
+}
+  }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/395617
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib37b60e4f4ae2354d1d1181460e1b511c0c13cc2
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/xgboost[master]: Add unique tag to log instances in RabitTracker

2017-12-05 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395611 )

Change subject: Add unique tag to log instances in RabitTracker
..

Add unique tag to log instances in RabitTracker

We often have between 15 and 100 separate RabitTracker instances running
at the same time, and when one errors out it's incredibly difficult to
figure out which other logs are related to the one that failed. This
doesn't completely solve the problem of associating non-tracker logs
(like executor kills by yarn), but it at least helps distinguish the
output of the separate trackers.
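
A minimal sketch of the idea (Scala here for brevity; the patch itself
is Java): each tracker instance derives its logger name from a random
hex tag, so interleaved log lines can be grepped back apart.

  import org.apache.commons.logging.LogFactory
  import scala.util.Random

  class TaggedTracker {
    // e.g. "...TaggedTracker@3e7a91c2"; grep the tag to collect one tracker's output
    private val logTag = Integer.toHexString(Random.nextInt())
    private val logger = LogFactory.getLog(getClass.getName + "@" + logTag)

    def announce(): Unit = logger.info("tracker started")
  }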

Change-Id: Ic4189ae318316be405b3be499d95b2849b0e6f61
---
M jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java
1 file changed, 9 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/xgboost 
refs/changes/11/395611/1

diff --git 
a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java 
b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java
index 888d501..4927466 100644
--- 
a/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java
+++ 
b/jvm-packages/xgboost4j/src/main/java/ml/dmlc/xgboost4j/java/RabitTracker.java
@@ -5,7 +5,7 @@
 import java.io.*;
 import java.util.HashMap;
 import java.util.Map;
-import java.util.concurrent.TimeUnit;
+import java.util.Random;
 import java.util.concurrent.atomic.AtomicReference;
 
 import org.apache.commons.logging.Log;
@@ -23,8 +23,10 @@
  * The tracker must be started on driver node before running distributed jobs.
  */
 public class RabitTracker implements IRabitTracker {
-  // Maybe per tracker logger?
-  private static final Log logger = LogFactory.getLog(RabitTracker.class);
+  private static final Log classLogger = LogFactory.getLog(RabitTracker.class);
+  private static final Random random = new Random();
+  private final String logTag = Integer.toHexString(random.nextInt());
+  private final Log logger = LogFactory.getLog(RabitTracker.class.getName() + 
'@' + logTag);
   // tracker python file.
   private static String tracker_py = null;
   // environment variable to be pased.
@@ -37,8 +39,8 @@
 try {
   initTrackerPy();
 } catch (IOException ex) {
-  logger.error("load tracker library failed.");
-  logger.error(ex);
+  classLogger.error("load tracker library failed.");
+  classLogger.error(ex);
 }
   }
 
@@ -48,7 +50,7 @@
   private class TrackerProcessLogger implements Runnable {
 public void run() {
 
-  Log trackerProcessLogger = LogFactory.getLog(TrackerProcessLogger.class);
+  Log trackerProcessLogger = LogFactory.getLog(TrackerProcessLogger.class.getName() + '@' + logTag);
   BufferedReader reader = new BufferedReader(new InputStreamReader(
   trackerProcess.get().getErrorStream()));
   String line;
@@ -73,7 +75,7 @@
 try {
   tracker_py = NativeLibLoader.createTempFileFromResource("/tracker.py");
 } catch (IOException ioe) {
-  logger.trace("cannot access tracker python script");
+  classLogger.trace("cannot access tracker python script");
   throw ioe;
 }
   }

-- 
To view, visit https://gerrit.wikimedia.org/r/395611
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic4189ae318316be405b3be499d95b2849b0e6f61
Gerrit-PatchSet: 1
Gerrit-Project: search/xgboost
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/xgboost[master]: Add background resource monitor task to training

2017-12-05 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395592 )

Change subject: Add background resource monitor task to training
..

Add background resource monitor task to training

We have executors getting killed for overrunning their memory
allocations, but no clue why that is happening. Training an entire
35M observation set on a single jvm (local spark mode) works, and training
that same 35M observation set in yarn split between three executors usually
works too, but sometimes yarn comes along and kills our process.

Add a thread on executors that perform training to regularly report
both heap usage and Rss info from /proc/$pid/status. While this won't
tell us exactly what is happening, it will at least hopefully give
some insight into how memory usage develops over time up to the point
that yarn decides to kill our executors.

This is intentionally implemented in a "once per jvm" way, which is a bit
odd but gives us the most information. Basically, the first time an
executor performs training the thread is spun up, and that thread keeps
running after the current task is complete, up until the executor itself
exits.
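
Roughly, the reporting loop looks like the following Python sketch (the
real implementation is Scala; the /proc layout and the interval are the
only assumptions here):

    import logging
    import threading
    import time

    def start_rss_monitor(interval_sec=60):
        log = logging.getLogger('resource-monitor')

        def loop():
            while True:
                # RssAnon/RssFile/RssShmem lines from the kernel status file.
                with open('/proc/self/status') as f:
                    rss = ', '.join(l.strip() for l in f if l.startswith('Rss'))
                log.info(rss)
                time.sleep(interval_sec)

        t = threading.Thread(target=loop, daemon=True)
        t.start()
        return t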

Change-Id: I71c121055ea94b997bc018da4fc0d4d86d63bf66
---
A 
jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
M 
jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
2 files changed, 64 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/xgboost 
refs/changes/92/395592/1

diff --git 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
new file mode 100644
index 000..64309e6
--- /dev/null
+++ 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/ResourceMonitorThread.scala
@@ -0,0 +1,52 @@
+/*
+ Copyright (c) 2014 by Contributors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+ */
+
+package ml.dmlc.xgboost4j.scala.spark
+
+import java.lang.management.ManagementFactory
+import java.util.concurrent.atomic.AtomicBoolean
+
+import org.apache.commons.logging.LogFactory
+
+import scala.concurrent.duration.Duration
+import scala.io.Source
+
+class ResourceMonitorThread(reportEvery: Duration) extends Thread {
+  private val keepChecking = new AtomicBoolean(true)
+  private val pid = ManagementFactory.getRuntimeMXBean.getName.split('@')(0).toInt
+  private val memoryBean = ManagementFactory.getMemoryMXBean
+  private val logger = LogFactory.getLog(this.getClass)
+
+  override def run(): Unit = {
+    if (!logger.isInfoEnabled) {
+      return
+    }
+    while (keepChecking.get()) {
+      report()
+      Thread.sleep(reportEvery.toMillis)
+    }
+  }
+
+  def stopChecking(): Unit = keepChecking.set(false)
+
+  private def report(): Unit = {
+    val rss = Source.fromFile(s"/proc/$pid/status").getLines()
+      .filter(_.startsWith("Rss"))
+      .mkString(", ")
+    logger.info(rss)
+    logger.info(memoryBean.getHeapMemoryUsage)
+  }
+}
diff --git 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
index 2f64e15..cce063d 100644
--- 
a/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
+++ 
b/jvm-packages/xgboost4j-spark/src/main/scala/ml/dmlc/xgboost4j/scala/spark/XGBoost.scala
@@ -17,8 +17,10 @@
 package ml.dmlc.xgboost4j.scala.spark
 
 import java.io.ByteArrayInputStream
+import java.util.concurrent.TimeUnit
 
 import scala.collection.mutable
+import scala.concurrent.duration.Duration
 import scala.util.Random
 import ml.dmlc.xgboost4j.java.{IRabitTracker, Rabit, XGBoostError, 
RabitTracker => PyRabitTracker}
 import ml.dmlc.xgboost4j.scala.rabit.RabitTracker
@@ -30,6 +32,7 @@
 import org.apache.spark.sql.Dataset
 import org.apache.spark.ml.feature.{LabeledPoint => MLLabeledPoint}
 import org.apache.spark.{SparkContext, SparkParallelismTracker, TaskContext}
+
 
 object TrackerConf {
   def apply(): TrackerConf = TrackerConf(0L, "python")
@@ -51,6 +54,10 @@
 
 object XGBoost extends Serializable {
   private val logger = LogFactory.getLog("XGBoostSpark")
+
+  // By using a lazy val on an object (singleton) we ensure this is only performed
+  // once per-jvm. It is 

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Add option to train using external memory

2017-12-04 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395062 )

Change subject: Add option to train using external memory
..

Add option to train using external memory

I'm not sure what exactly has changed, but I'm unable to complete a full
round of training on wikis with large (~35M) numbers of observations;
the executors keep getting killed by spark. I tried increasing memory
overhead from 9G to 12G but it still keeps dying. I'm wary of allocating
even more memory than that, as we are asking for a significant % of
cluster memory.

Take advantage of xgboost's external memory implementation to prevent
the memory explosion. This basically writes out the feature matrix to
disk and memory maps it, depending on the kernel disk cache to keep it
in memory where possible. This is likely a little slower, but still
faster than killing executors and regularly restarting training.
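
For reference, xgboost's external memory mode is normally enabled by
pointing DMatrix at an on-disk file with a cache suffix; a minimal
plain-Python sketch (file names are placeholders, not what the Spark
wrapper uses internally):

    import xgboost as xgb

    # The '#' suffix asks xgboost to build an external-memory cache next to
    # the data instead of holding the whole matrix in RAM. For ranking you
    # would also provide query group sizes via dtrain.set_group(...).
    dtrain = xgb.DMatrix('features.txt#dtrain.cache')
    params = {'objective': 'rank:ndcg', 'eta': 0.1, 'max_depth': 6}
    booster = xgb.train(params, dtrain, num_boost_round=100)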

Change-Id: Ie283c1c58d8395054164f1c0157e1a709d14
---
M example_train.yaml
M mjolnir/test/fixtures/load_config/example_train.expect
M mjolnir/training/xgboost.py
M mjolnir/utilities/training_pipeline.py
4 files changed, 18 insertions(+), 3 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/62/395062/1

diff --git a/example_train.yaml b/example_train.yaml
index 5784421..9cfb8a3 100644
--- a/example_train.yaml
+++ b/example_train.yaml
@@ -138,6 +138,7 @@
 cv-jobs: 22
 folds: 3
 final-trees: 100
+use-external-memory: yes
 
 medium:
 # 4M to 12M observations per executor.
diff --git a/mjolnir/test/fixtures/load_config/example_train.expect 
b/mjolnir/test/fixtures/load_config/example_train.expect
index 23e536f..75233f6 100644
--- a/mjolnir/test/fixtures/load_config/example_train.expect
+++ b/mjolnir/test/fixtures/load_config/example_train.expect
@@ -243,6 +243,7 @@
   folds: '3'
   input: hdfs://analytics-hadoop/user/pytest/mjolnir/marker
   output: /home/pytest/training_size/marker_large
+  use-external-memory: 'True'
   workers: '3'
 environment:
   HOME: /home/pytest
diff --git a/mjolnir/training/xgboost.py b/mjolnir/training/xgboost.py
index 6d1f70b..03e8599 100644
--- a/mjolnir/training/xgboost.py
+++ b/mjolnir/training/xgboost.py
@@ -108,7 +108,7 @@
 return retval
 
 
-def train(df, params, num_workers=None):
+def train(df, params, num_workers=None, use_external_memory=False):
 """Train a single xgboost ranking model.
 
 df : pyspark.sql.DataFrame
@@ -168,6 +168,7 @@
 try:
 return XGBoostModel.trainWithDataFrame(df_grouped, params, num_rounds,
num_workers, 
feature_col='features',
+   
use_external_memory=use_external_memory,
label_col='label')
 finally:
 if unpersist:
diff --git a/mjolnir/utilities/training_pipeline.py 
b/mjolnir/utilities/training_pipeline.py
index 3ee6bd2..dae13ab 100644
--- a/mjolnir/utilities/training_pipeline.py
+++ b/mjolnir/utilities/training_pipeline.py
@@ -51,7 +51,7 @@
 
 
 def run_pipeline(sc, sqlContext, input_dir, output_dir, wikis, 
initial_num_trees, final_num_trees,
- num_workers, num_cv_jobs, num_folds, test_dir, zero_features):
+ num_workers, num_cv_jobs, num_folds, test_dir, zero_features, 
use_external_memory):
 for wiki in wikis:
 print 'Training wiki: %s' % (wiki)
 df_hits_with_features = (
@@ -98,7 +98,8 @@
 df_grouped, j_groups = mjolnir.training.xgboost.prep_training(
 df_hits_with_features, num_workers)
 best_params['groupData'] = j_groups
-model = mjolnir.training.xgboost.train(df_grouped, best_params)
+model = mjolnir.training.xgboost.train(
+df_grouped, best_params, 
use_external_memory=use_external_memory)
 
 tune_results['metrics']['train'] = model.eval(df_grouped, j_groups)
 df_grouped.unpersist()
@@ -142,6 +143,14 @@
 print 'Wrote xgboost binary model to %s' % (xgb_model_output)
 print ''
 
+def str_to_bool(value):
+if value.lower() in ['true', 'yes', '1']:
+return True
+elif value.lower() in ['false', 'no', '0']:
+return False
+else:
+raise ValueError("Unknown boolean string: " + value)
+
 
 def parse_arguments(argv):
 parser = argparse.ArgumentParser(description='Train XGBoost ranking 
models')
@@ -168,6 +177,9 @@
 '--initial-trees', dest='initial_num_trees', default=100, type=int,
 help='Number of trees to perform hyperparamter tuning with.  (Default: 
100)')
 parser.add_argument(
+'-e', '--use-external-memory', dest='use_external_memory', 
default=False,
+type=str_to_bool, help='Use external memory for feature matrix')
+

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Allow spark to keep the full data pipeline in memory

2017-12-04 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/395063 )

Change subject: Allow spark to keep the full data pipeline in memory
..

Allow spark to keep the full data pipeline in memory

Something in the upgrade to spark 2.1.2 has caused us to
recompute lots of data over and over again in the data pipeline.
This is particularly egregious for the normalization and feature collection
steps, which take an hour each on a full run of data.

I tested out simply not unpersisting our data and everything seems to
work fine. We have ~100G of memory available for caching and only
end up using 50G when nothing is unpersisted. Figuring out what data is
available is also much easier if we don't have to think about when
to unpersist what data.
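
The pattern being dropped is the explicit materialize-then-unpersist dance;
with enough cache memory it is simpler to cache and let Spark evict under
pressure. A small standalone pyspark sketch of the difference (illustrative
only, not the pipeline code):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.getOrCreate()
    df = spark.range(1000000).withColumnRenamed('id', 'hit_page_id')

    # Old style: cache, count() to materialize, then unpersist parents by hand.
    # New style: just cache; Spark's LRU eviction handles memory pressure.
    df = df.cache()
    print(df.count())  # first action materializes the cache for later stages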

Change-Id: Iedf259e481055444f369c528a56bee372e57595e
---
M mjolnir/sampling.py
M mjolnir/utilities/data_pipeline.py
2 files changed, 1 insertion(+), 19 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/63/395063/1

diff --git a/mjolnir/sampling.py b/mjolnir/sampling.py
index d7d1f2b..88a93c7 100644
--- a/mjolnir/sampling.py
+++ b/mjolnir/sampling.py
@@ -196,8 +196,5 @@
 df
 .join(df_queries_sampled, how='inner', on=['wikiid', 'norm_query_id'])
 .cache())
-df_sampled.count()
-df.unpersist()
-df_queries_unique.unpersist()
 
 return hit_page_id_counts, df_sampled
diff --git a/mjolnir/utilities/data_pipeline.py 
b/mjolnir/utilities/data_pipeline.py
index 20b06f0..5d26065 100644
--- a/mjolnir/utilities/data_pipeline.py
+++ b/mjolnir/utilities/data_pipeline.py
@@ -69,11 +69,6 @@
 seed=54321,
 samples_per_wiki=samples_per_wiki)
 
-# This should already be cached from sample, but lets be explicit
-# to prevent future problems with refactoring.
-df_sampled_raw.cache().count()
-df_norm.unpersist()
-
 # Transform our dataframe into the shape expected by the DBN
 df_sampled = (
 df_sampled_raw
@@ -85,13 +80,11 @@
 .drop('click_page_ids')
 .cache())
 
-# materialize df_sampled and unpersist df_norm
-nb_samples = df_sampled.count()
-df_sampled_raw.unpersist()
 
 # Target around 125k rows per partition. Note that this isn't
 # how many the dbn will see, because it gets collected up. Just
 # a rough guess.
+nb_samples = df_sampled.count()
 dbn_partitions = int(max(200, min(2000, nb_samples / 125000)))
 
 # Learn relevances
@@ -114,10 +107,6 @@
 .join(df_rel, how='inner', on=['wikiid', 'norm_query_id', 
'hit_page_id'])
 .cache())
 
-# materialize df_all_hits and drop df_sampled, df_norm
-df_all_hits.count()
-df_sampled.unpersist()
-
 # TODO: Training is per-wiki, should this be as well?
 weightedNdcgAt10 = mjolnir.metrics.ndcg(df_all_hits, 10, 
query_cols=['wikiid', 'query', 'session_id'])
 print 'weighted ndcg@10: %.4f' % (weightedNdcgAt10)
@@ -133,10 +122,6 @@
  F.first('label').alias('label'),
  F.first('relevance').alias('relevance'))
 .cache())
-
-# materialize df_hits and drop df_all_hits
-df_hits.count()
-df_all_hits.unpersist()
 
 actual_samples_per_wiki = 
df_hits.groupby('wikiid').agg(F.count(F.lit(1)).alias('n_obs')).collect()
 actual_samples_per_wiki = {row.wikiid: row.n_obs for row in 
actual_samples_per_wiki}

-- 
To view, visit https://gerrit.wikimedia.org/r/395063
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iedf259e481055444f369c528a56bee372e57595e
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: [WIP] Bad ideas for improved DBN performance

2017-12-02 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394741 )

Change subject: [WIP] Bad ideas for improved DBN performance
..

[WIP] Bad ideas for improved DBN performance

I'm not sure this is a particularly great idea, but I wanted to explore
the performance limits of the JVM-based DBN implementation. This brings
the original benchmark (90s in java, 3-4s in prior patch) to ~900ms. To
get a better idea of performance I increased the size of the benchmark:

* python: 616s
  - only ran once
* orig jvm: min: 21.7s, max: 24.1s, mean: 23.5s
  - 5 runs
  - 25x - 28x faster than python
* optimized jvm: min: 5.0s, max: 5.3s, mean: 5.2s
  - 5 runs
  - 116x - 123x faster than python
  - 4x - 5x faster than orig jvm

The improvements made were guided by profiling in visualvm and aren't
all that numerous:

* We were thrashing memory pretty hard at >1GB/sec. To reduce this, add
  caches of our intermediate arrays. We are still thrashing memory
  pretty hard, but not as badly.

* The caches of the intermediate arrays in scala Maps brought those
  maps up high in the profiler. Replace with arrays of queues. The
  backing linked list still shows up in profiling, but not as bad.

* DefaultMap.apply gets hit *a lot* and was showing up in profiling.
  Replacing inner scala maps with java maps helped some. Further
  replacing java maps with trove4j primitive maps helped significantly.

* Find places where we were repeatedly hitting an array for the same
  item (for example getting something by s.queryId in a loop on the
  urls) and fetch it into a local var. Not sure this made much
  difference.

visualvm now reports that 80% of cpu time is spent in our own functions,
whereas before it was significantly lower. Mostly I just kept looking
for places where the supporting machinery was taking up cpu instead
of our calculations, and kept replacing them until things got better.
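
The array-cache idea above is essentially a small object pool keyed by array
length, so hot loops reuse buffers instead of allocating new ones. A rough
Python sketch of the shape of it (the real code is Scala and uses bounded
queues):

    from collections import defaultdict, deque

    class ArrayPool(object):
        def __init__(self):
            self._free = defaultdict(deque)  # length -> reusable buffers

        def acquire(self, n):
            q = self._free[n]
            return q.popleft() if q else [0.0] * n

        def release(self, buf):
            # Return the buffer so the next iteration can reuse it.
            self._free[len(buf)].append(buf)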

Change-Id: I08b72b98f515a820675e1ef9b45dd8724cbd070e
---
M jvm/pom.xml
M jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
M jvm/src/test/scala/org/wikimedia/search/mjolnir/DBNSuite.scala
3 files changed, 246 insertions(+), 58 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/41/394741/1

diff --git a/jvm/pom.xml b/jvm/pom.xml
index b2a7f71..f405975 100644
--- a/jvm/pom.xml
+++ b/jvm/pom.xml
@@ -141,6 +141,11 @@
 3.0.1
 test
 
+
+net.sf.trove4j
+trove4j
+3.0.3
+
 
 
 
diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala 
b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
index 12ef975..cda6778 100644
--- a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
+++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
@@ -9,6 +9,9 @@
   * A Dynamic Bayesian Network Click Model for Web Search Ranking - Olivier 
Chapelle and
   * Ya Zang - http://olivier.chapelle.cc/pub/DBN_www2009.pdf
   */
+import gnu.trove.iterator.TIntObjectIterator
+import gnu.trove.map.hash.TIntObjectHashMap
+
 import scala.collection.mutable
 import scala.util.parsing.json.JSON
 
@@ -19,15 +22,22 @@
 
   // This bit maps input queryies/results to array indexes to be used while 
calculating
   private var currentUrlId: Int = 0 // TODO: Why is first returned value 1 
instead of 0?
+  private val urlToIdMap: mutable.Map[String, Int] = mutable.Map()
+  def urlToId(key: String): Int = {
+urlToIdMap.getOrElseUpdate(key, {
+  currentUrlId += 1
+  currentUrlId
+})
+  }
+
   private var currentQueryId: Int = -1
-  private val urlToId: DefaultMap[String, Int] = new DefaultMap({ _ =>
-currentUrlId += 1
-currentUrlId
-  })
-  private val queryToId: DefaultMap[(String, String), Int] = new DefaultMap({ 
_ =>
-currentQueryId += 1
-currentQueryId
-  })
+  private val queryToIdMap: mutable.Map[(String, String), Int] = mutable.Map()
+  def queryToId(key: (String, String)): Int = {
+queryToIdMap.getOrElseUpdate(key, {
+  currentQueryId += 1
+  currentQueryId
+})
+  }
 
   def maxQueryId: Int = currentQueryId + 2
 
@@ -91,8 +101,8 @@
   }
 
   def toRelevances(urlRelevances: Array[Map[Int, UrlRel]]): 
Seq[RelevanceResult] = {
-val idToUrl = urlToId.asMap.map(_.swap)
-val idToQuery = queryToId.asMap.map(_.swap)
+val idToUrl = urlToIdMap.map(_.swap)
+val idToQuery = queryToIdMap.map(_.swap)
 
 urlRelevances.zipWithIndex.flatMap { case (d, queryId) =>
   val (query, region) = idToQuery(queryId)
@@ -101,6 +111,127 @@
 RelevanceResult(query, region, url, urlRel.a * urlRel.s)
   }
 }
+  }
+}
+
+class ArrayCache {
+  val QUEUE_1D_MAX = 20
+  private val queueMap1d: Array[mutable.Queue[Array[Double]]] = 
Array.fill(QUEUE_1D_MAX + 1){ mutable.Queue() }
+
+  def get1d(n: Int): Array[Double] = {
+if (n > QUEUE_1D_MAX) {
+  new Array[Double](n)
+} else {
+  

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Port DBN from clickmodels to scala

2017-11-30 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394509 )

Change subject: Port DBN from clickmodels to scala
..

Port DBN from clickmodels to scala

DBN takes quite some time when running against a full run of wikis. On
one run that resulted in 90M observations across 20 wikis it took over
20 minutes. In the naive benchmark (included) this scala version is >
20x faster than the python implementation, and is perhaps easier to
follow since we remove unused functionality (the intents and layouts).

This does not yet include the python side of calling this, because of the
way the CI works we will need to publish a new mjolnir jar with this
code, and I wanted to let it get reviewed first.

For performance reasons this almost exclusively uses arrays, and most
inner loops use while instead of the more idiomatic map or fold. This
conversion gave an ~3x speedup, which seems worthwhile. It is probably
very allocation heavy, but optimizing out the allocations seemed like a
big pain.

Change-Id: I7231590a18b7f8fe2552997bc4c702ee635d06e5
---
A jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
A jvm/src/test/resources/dbn.data
A jvm/src/test/scala/org/wikimedia/search/mjolnir/DBNSuite.scala
3 files changed, 463 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/09/394509/1

diff --git a/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala 
b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
new file mode 100644
index 000..51ea651
--- /dev/null
+++ b/jvm/src/main/scala/org/wikimedia/search/mjolnir/DBN.scala
@@ -0,0 +1,312 @@
+package org.wikimedia.search.mjolnir
+
+import scala.collection.mutable
+import scala.util.parsing.json.JSON
+
+case class SessionItem(queryId: Int, urlIds: Array[Int], clicks: 
Array[Boolean])
+case class RelevanceResult(query: String, region: String, url: String, 
relevance: Double)
+
+class InputReader(
+  minDocsPerQuery: Int, maxDocsPerQuery: Int, serpSize: Int,
+  discardNoClicks: Boolean
+) {
+
+  private val urlToId: mutable.Map[String, Int] = mutable.Map()
+  private val queryToId: mutable.Map[(String, String), Int] = mutable.Map()
+  private var currentUrlId: Int = 1
+  private var currentQueryId: Int = 0
+
+  def maxQueryId: Int = currentQueryId + 1
+
+  private def getQueryId(query: String, region: String): Int = {
+val key = (query, region)
+queryToId.get(key) match {
+  case Some(queryId) => queryId
+  case None =>
+val queryId = currentQueryId
+currentQueryId += 1
+queryToId.put(key, queryId)
+queryId
+}
+  }
+
+  private def getUrlId(url: String): Int = {
+urlToId.get(url) match {
+  case Some(urlId) => urlId
+  case None =>
+val urlId = currentUrlId
+currentUrlId += 1
+urlToId.put(url, urlId)
+urlId
+}
+  }
+
+
+  private def parseJsonBooleanArray(json: String): Option[Array[Boolean]] = {
+JSON.parseFull(json) match {
+  case Some(x: List[Any]) =>
+if (x.forall(_.isInstanceOf[Boolean])) {
+  Some(x.asInstanceOf[List[Boolean]].toArray)
+} else {
+  None
+}
+  case _ => None
+}
+  }
+
+  private def parseJsonStringArray(json: String): Option[Array[String]] = {
+JSON.parseFull(json) match {
+  case Some(x: List[Any]) =>
+if (x.forall(_.isInstanceOf[String])) {
+  Some(x.asInstanceOf[List[String]].toArray)
+} else {
+  None
+}
+  case _ => None
+}
+  }
+
+  def makeSessionItem(query: String, region: String, urls: Array[String], 
clicks: Array[Boolean]): Option[SessionItem] = {
+
+val n = math.min(serpSize, urls.length)
+val hasClicks = clicks.take(n).foldLeft(false)(_ || _)
+if (urls.length < minDocsPerQuery ||
+(discardNoClicks && !hasClicks)
+) {
+  None
+} else {
+  val queryId = getQueryId(query, region)
+  val urlIds = urls.map(getUrlId)
+  Some(SessionItem(queryId, urlIds, clicks.take(n)))
+}
+  }
+
+  val PIECE_HASH_DIGEST = 0
+  val PIECE_QUERY = 1
+  val PIECE_REGION = 2
+  val PIECE_INTENT_WEIGHT = 3
+  val PIECE_URLS = 4
+  val PIECE_LAYOUT = 5
+  val PIECE_CLICKS = 6
+
+  // TODO: Ideally dont use this and make session items directly without extra 
ser/deser overhead
+  def read(f: Iterator[String]): Seq[SessionItem] = {
+f.flatMap { line => {
+  val pieces = line.split("\t")
+  val query: String = pieces(PIECE_QUERY)
+  val region = pieces(PIECE_REGION)
+  val urls = parseJsonStringArray(pieces(PIECE_URLS)) match {
+case Some(x: Array[String]) => x
+case None => Array[String]()
+  }
+  val clicks = parseJsonBooleanArray(pieces(PIECE_CLICKS)) match {
+case Some(x: Array[Boolean]) => x
+case None => Array[Boolean]()
+  }
+
+  makeSessionItem(query, region, urls, clicks)
+  

[MediaWiki-commits] [Gerrit] mediawiki/core[master]: Include highlight snippets when using search as api generator

2017-11-29 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394120 )

Change subject: Include highlight snippets when using search as api generator
..

Include highlight snippets when using search as api generator

It turned out that when using ApiQuerySearch in generator mode, important
highlighting information was never returned. This makes it hard to
figure out why a search for intitle:park returned 'Capilano Suspension
Bridge' (because there is a redirect containing the word park). Add all
requested gsrprop to the generator result.
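
For context, the API shape involved looks roughly like the request below:
search runs as a generator for another query module, and the patch copies
the requested gsrprop fields into each generated page. The exact parameter
values here are illustrative:

    import requests

    resp = requests.get('https://en.wikipedia.org/w/api.php', params={
        'action': 'query',
        'generator': 'search',
        'gsrsearch': 'intitle:park',
        'gsrprop': 'snippet|titlesnippet',
        'prop': 'info',
        'format': 'json',
        'formatversion': 2,
    }).json()
    for page in resp['query']['pages']:
        # 'index' comes from the generator data set by ApiQuerySearch.
        print(page['title'], page.get('index'))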

Change-Id: Iea48937662492445783104077666ab1f1b30da2d
---
M includes/api/ApiQuerySearch.php
1 file changed, 10 insertions(+), 3 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/core 
refs/changes/20/394120/1

diff --git a/includes/api/ApiQuerySearch.php b/includes/api/ApiQuerySearch.php
index f0c4180..832d84f 100644
--- a/includes/api/ApiQuerySearch.php
+++ b/includes/api/ApiQuerySearch.php
@@ -145,9 +145,11 @@
// Add the search results to the result
$terms = $wgContLang->convertForSearchResult( 
$matches->termMatches() );
$titles = [];
+   $metadata = [];
$count = 0;
$result = $matches->next();
$limit = $params['limit'];
+   $offset = $params['offset'] + 1;
 
while ( $result ) {
if ( ++$count > $limit ) {
@@ -175,6 +177,12 @@
}
} else {
$titles[] = $result->getTitle();
+   $metadata[] = [
+   'title' => $result->getTitle(),
+   'data' => $this->getSearchResultData( 
$result, $prop, $terms ) + [
+   'index' => $count - 1 + $offset,
+   ],
+   ];
}
 
$result = $matches->next();
@@ -209,9 +217,8 @@
return $current;
} );
$resultPageSet->populateFromTitles( $titles );
-   $offset = $params['offset'] + 1;
-   foreach ( $titles as $index => $title ) {
-   $resultPageSet->setGeneratorData( $title, [ 
'index' => $index + $offset ] );
+   foreach ( $metadata as $data ) {
+   $resultPageSet->setGeneratorData( 
$data['title'], $data['data'] );
}
}
}

-- 
To view, visit https://gerrit.wikimedia.org/r/394120
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iea48937662492445783104077666ab1f1b30da2d
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/core
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Copy spark config into place on deploy

2017-11-29 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394117 )

Change subject: Copy spark config into place on deploy
..

Copy spark config into place on deploy

Putting this configuration into a standard place allows calling spark
commands without having to explicitly point out where mjolnir is
installed, and shortens command lines.

Also bumps the mjolnir submodule to master, which has support for this
location. Before this can be deployed a puppet patch must be shipped
to create the /etc/mjolnir directory and set its ownership to the
deploy-service user so the copy works.

Change-Id: I7c69481156f543a8258a2e9b2c90f8e15984caaa
---
M scap/checks.yaml
M src
2 files changed, 7 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/17/394117/1

diff --git a/scap/checks.yaml b/scap/checks.yaml
index 411e399..63662bc 100644
--- a/scap/checks.yaml
+++ b/scap/checks.yaml
@@ -11,4 +11,9 @@
 timeout: 300
 group: analytics
 command: bash 
/srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh
-
+spark_config:
+type: command
+stage: promote
+timeout: 10
+group: analytics
+command: cp /srv/deployment/search/mjolnir/deploy/spark.yaml 
/etc/mjolnir/spark.yaml
diff --git a/src b/src
index 5799ac9..c2236ad 16
--- a/src
+++ b/src
@@ -1 +1 @@
-Subproject commit 5799ac99ccb095964d6550a9e45ee7abca768b55
+Subproject commit c2236adfd04280feef29b288ffc113355df83fe1

-- 
To view, visit https://gerrit.wikimedia.org/r/394117
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7c69481156f543a8258a2e9b2c90f8e15984caaa
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Better error message if wiki missing in data_pipeline

2017-11-29 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394108 )

Change subject: Better error message if wiki missing in data_pipeline
..

Better error message if wiki missing in data_pipeline

Not sure how this happens, but on one run through, viwiki was
in the input data but didn't make it to the check that we
have the expected sampling. This change will still error, but will
give better messages about what went wrong.

Change-Id: If7af29be5022c0b374c9b8836322ccf074467575
---
M mjolnir/utilities/data_pipeline.py
1 file changed, 6 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/08/394108/1

diff --git a/mjolnir/utilities/data_pipeline.py 
b/mjolnir/utilities/data_pipeline.py
index 20b06f0..ee479c8 100644
--- a/mjolnir/utilities/data_pipeline.py
+++ b/mjolnir/utilities/data_pipeline.py
@@ -145,8 +145,12 @@
 for wiki in wikis:
 # We cant have more samples than we started with
 expected = min(samples_per_wiki, hit_page_id_counts[wiki])
-actual = actual_samples_per_wiki[wiki]
-if expected / float(actual) < samples_size_tolerance:
+try:
+actual = actual_samples_per_wiki[wiki]
+except KeyError:
+# This will probably still error, but give better messages.
+actual = 0
+if actual == 0 or expected / float(actual) < samples_size_tolerance:
 not_enough_samples.append(
 'Collected %d samples from %s which is less than %d%% of the 
requested sample size %d'
 % (actual, wiki, samples_size_tolerance*100, expected))

-- 
To view, visit https://gerrit.wikimedia.org/r/394108
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: If7af29be5022c0b374c9b8836322ccf074467575
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] wikimedia...relevanceForge[master]: Rename counter variables to i to make tox happy

2017-11-29 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394099 )

Change subject: Rename counter variables to i to make tox happy
..

Rename counter variables to i to make tox happy

Tox as run by CI has decided that l is an ambiguous name for a variable.
Switching it to i, which is a fairly standard counter variable, seems
to make it happy, and i didn't look to already be in use.

Change-Id: Iea9dd0ea2e900c9b0452795d8c732c539511da10
---
M other_tools/augmentdump.py
M other_tools/metastats.py
2 files changed, 7 insertions(+), 7 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge 
refs/changes/99/394099/1

diff --git a/other_tools/augmentdump.py b/other_tools/augmentdump.py
index 5acd27c..1b8a8b4 100755
--- a/other_tools/augmentdump.py
+++ b/other_tools/augmentdump.py
@@ -112,12 +112,12 @@
 
 
 def read_dump(inputf, outputf, data, fieldname):
-l = 0
+i = 0
 pageId = -1
 for line in inputf:
-l += 1
+i += 1
 page = {}
-if l % 2 == 1:
+if i % 2 == 1:
 outputf.write(line)
 page = json.loads(line)
 pageId = -1
diff --git a/other_tools/metastats.py b/other_tools/metastats.py
index 5de85fe..1c01c79 100755
--- a/other_tools/metastats.py
+++ b/other_tools/metastats.py
@@ -43,17 +43,17 @@
 FNULL = open(os.devnull, 'w')
 p = subprocess.Popen('curl -L ' + url + ' | gzip -cd', shell=True,
  stdout=subprocess.PIPE, stderr=FNULL)
-l = 0
+i = 0
 for line in p.stdout:
-l += 1
+i += 1
 page = json.loads(line)
-if(l % 2 == 1):
+if(i % 2 == 1):
 pageId = page['index']['_id']
 continue
 try:
 int(pageId)
 except ValueError:
-print("*** line:" + str(l) + " is not a valid id : '" + 
str(pageId) + "'")
+print("*** line:" + str(i) + " is not a valid id : '" + 
str(pageId) + "'")
 continue
 
 callback(pageId, page)

-- 
To view, visit https://gerrit.wikimedia.org/r/394099
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iea9dd0ea2e900c9b0452795d8c732c539511da10
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/relevanceForge
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] wikimedia...relevanceForge[master]: Add basic pre-deployment sanity check for MLR

2017-11-28 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394011 )

Change subject: Add basic pre-deployment sanity check for MLR
..

Add basic pre-deployment sanity check for MLR

Implements a very simple configuration-driven sanity checker that
ensures some set of urls is in the top 3 results of a given query. The
intention of this script is to build up a small list of queries and
results for each wiki we deploy MLR to and use that list as a smoke
check before pushing a mediawiki-config change to move a new model to
full production usage.

Not sure relforge is the best place for this, or where the configuration
should really go, but I couldn't think of a better place.

Change-Id: Ie29ef99d2e404fe97e3b2e42b17df22b836385d8
---
A sanityCheck.py
A sanityCheck/enwiki.json
2 files changed, 75 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikimedia/discovery/relevanceForge 
refs/changes/11/394011/1

diff --git a/sanityCheck.py b/sanityCheck.py
new file mode 100644
index 000..bcf7136
--- /dev/null
+++ b/sanityCheck.py
@@ -0,0 +1,64 @@
+from __future__ import print_function
+import argparse
+import functools
+import json
+import requests
+import sys
+import urlparse
+
+
+def check(model, config):
+    ok = True
+    query_params = {
+        'action': 'query',
+        'list': 'search',
+        'srlimit': 3,
+        'cirrusMLRModel': model,
+        'format': 'json',
+        'formatversion': 2,
+    }
+    if 'query' in config:
+        # Apply overrides from config if requested. This might
+        # apply a specific cirrusUserTesting param or some such.
+        query_params.update(config['query'])
+
+    print('Running sanity check against %s' % (config['api']))
+    for query, expected in config['queries'].items():
+        print("Query: %s" % (query))
+        query_params['srsearch'] = query
+        r = requests.get(config['api'], params=query_params)
+        results = [x['title'] for x in r.json()['query']['search']]
+        diff = set(expected).difference(results)
+        if diff:
+            ok = False
+            print("Results:\n\t" + '\n\t'.join(results))
+            print("Expected:")
+            for title in expected:
+                marker = '+' if title in results else '-'
+                print('\t%s %s' % (marker, title))
+            print('')
+        else:
+            print("PASSED\n")
+    return ok
+
+
+def parse_arguments(argv):
+    parser = argparse.ArgumentParser(description='mlr sanity check')
+    parser.add_argument(
+        'config', type=lambda x: json.load(open(x)),
+        help='json file containing queries to check and results expected in top 3')
+    parser.add_argument(
+        'model', help='MLR model to use for ranking')
+    args = parser.parse_args(argv)
+    return dict(vars(args))
+
+
+def main(argv=None):
+    args = parse_arguments(argv)
+    return check(**args)
+
+
+if __name__ == "__main__":
+    ok = main()
+    sys.exit(0 if ok else 1)
+
diff --git a/sanityCheck/enwiki.json b/sanityCheck/enwiki.json
new file mode 100644
index 000..a1902cb
--- /dev/null
+++ b/sanityCheck/enwiki.json
@@ -0,0 +1,11 @@
+{
+"api": "https://en.wikipedia.org/w/api.php;,
+"queries": {
+"example": [
+"Example"
+],
+"JFK": [
+"John F. Kennedy"
+]
+}
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/394011
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie29ef99d2e404fe97e3b2e42b17df22b836385d8
Gerrit-PatchSet: 1
Gerrit-Project: wikimedia/discovery/relevanceForge
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Add default path for spark utility config file

2017-11-28 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394003 )

Change subject: Add default path for spark utility config file
..

Add default path for spark utility config file

This defaults the config to /etc/mjolnir/spark.yaml. The separate deploy
repo will install appropriate configuration there so spark commands can
be called with little fuss.
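
As a side note, argparse only falls back to the default when the flag can be
left off; a tiny standalone sketch of the intended behaviour (illustrative,
not the patch itself):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-c', '--config', default='/etc/mjolnir/spark.yaml')
    print(parser.parse_args([]).config)  # -> /etc/mjolnir/spark.yaml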

Change-Id: I9ef11aa00f237ee2486fde0049c88ed568b0f51a
---
M mjolnir/utilities/spark.py
1 file changed, 1 insertion(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/03/394003/1

diff --git a/mjolnir/utilities/spark.py b/mjolnir/utilities/spark.py
index 1ac7114..5954579 100644
--- a/mjolnir/utilities/spark.py
+++ b/mjolnir/utilities/spark.py
@@ -448,6 +448,7 @@
 
 parser.add_argument(
 '-c', '--config', dest='config', type=str, required=True,
+default='/etc/mjolnir/spark.yaml',
 help='Path to yaml configuration file.')
 parser.add_argument(
 '-t', '--template-var', dest='template_vars', action=KeyValueAction,

-- 
To view, visit https://gerrit.wikimedia.org/r/394003
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9ef11aa00f237ee2486fde0049c88ed568b0f51a
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] operations/puppet[production]: Revert "Revert "Deploy MjoLniR with new deploy repository""

2017-11-28 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/394002 )

Change subject: Revert "Revert "Deploy MjoLniR with new deploy repository""
..

Revert "Revert "Deploy MjoLniR with new deploy repository""

The problems with mjolnir vs MjoLniR have been resolved; it
required adjustments to scap.cfg in the repo.

This is a standard revert with one addition: we now create an
empty directory /etc/mjolnir owned by deploy-service. This gives
the deploy repo a sane place to install a configuration script
that can be auto-magically found by mjolnir.

This reverts commit 6a7753a14ac3cb66593eabbad30e8ac72e184751.

Change-Id: I599341bd16ecba0a2b8d8132fde6fe3d1443d754
---
M hieradata/role/common/deployment_server.yaml
A modules/mjolnir/manifests/init.pp
M modules/profile/manifests/mjolnir/kafka_daemon.pp
M modules/profile/templates/mjolnir/kafka-daemon.service.erb
M modules/role/manifests/elasticsearch/analytics.pp
5 files changed, 28 insertions(+), 14 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/operations/puppet 
refs/changes/02/394002/1

diff --git a/hieradata/role/common/deployment_server.yaml 
b/hieradata/role/common/deployment_server.yaml
index 7c55e05..7444810 100644
--- a/hieradata/role/common/deployment_server.yaml
+++ b/hieradata/role/common/deployment_server.yaml
@@ -173,8 +173,8 @@
   # Netbox software
   netbox/deploy:
 repository: operations/software/netbox-deploy
-  relforge/mjolnir:
-repository: search/MjoLniR
+  search/mjolnir/deploy:
+repository: search/MjoLniR/deploy
   statsv/statsv:
 repository: analytics/statsv
   "docker-pkg/deploy":
diff --git a/modules/mjolnir/manifests/init.pp 
b/modules/mjolnir/manifests/init.pp
new file mode 100644
index 000..e4bb89f
--- /dev/null
+++ b/modules/mjolnir/manifests/init.pp
@@ -0,0 +1,21 @@
+# = Class: mjolnir
+#
+# This class installs the MjoLniR (Machine Learned Ranking) data
+# processing package.
+#
+class mjolnir {
+require_package('virtualenv', 'zip')
+
+file { '/etc/mjolnir':
+ensure => 'directory',
+user   => 'deploy-service',
+group  => 'deploy-service',
+mode   => 0755
+}
+
+scap::target { 'search/mjolnir/deploy':
+deploy_user => 'deploy-service',
+}
+}
+
+
diff --git a/modules/profile/manifests/mjolnir/kafka_daemon.pp 
b/modules/profile/manifests/mjolnir/kafka_daemon.pp
index c1dcaf9..479187c 100644
--- a/modules/profile/manifests/mjolnir/kafka_daemon.pp
+++ b/modules/profile/manifests/mjolnir/kafka_daemon.pp
@@ -9,18 +9,11 @@
 # it is named just 'eqiad'.
 $kafka_config = kafka_config('eqiad'),
 ) {
-scap::target { 'relforge/mjolnir':
-  deploy_user => 'deploy-service',
-}
-
-# This is a limited subset of what the full mjolnir package requires 
because
-# the daemon is a small part of the overall application. The daemon only 
needs
-# to read/write kafka topics and send requests to localhost.
-require_package('python-kafka', 'python-requests')
+class { 'mjolnir': }
 
 systemd::service { 'mjolnir-kafka-daemon':
 content => template('profile/mjolnir/kafka-daemon.service.erb'),
-require => Scap::Target['relforge/mjolnir'],
+require => Scap::Target['search/mjolnir/deploy'],
 }
 
 }
diff --git a/modules/profile/templates/mjolnir/kafka-daemon.service.erb 
b/modules/profile/templates/mjolnir/kafka-daemon.service.erb
index b6947ba..7e40bd8 100644
--- a/modules/profile/templates/mjolnir/kafka-daemon.service.erb
+++ b/modules/profile/templates/mjolnir/kafka-daemon.service.erb
@@ -5,9 +5,7 @@
 [Service]
 User=nobody
 Group=nogroup
-WorkingDirectory=/srv/deployment/relforge/mjolnir
-Environment=PYTHONPATH=/srv/deployment/relforge/mjolnir
-ExecStart=/usr/bin/python2 
/srv/deployment/relforge/mjolnir/mjolnir/cli/kafka_daemon.py --brokers <%= 
@kafka_config['brokers']['string'] %>
+ExecStart=/srv/deployment/search/mjolnir/venv/bin/mjolnir-utilities.py 
kafka_daemon --brokers <%= @kafka_config['brokers']['string'] %>
 StandardInput=null
 StandardOutput=journal
 StandardError=journal
diff --git a/modules/role/manifests/elasticsearch/analytics.pp 
b/modules/role/manifests/elasticsearch/analytics.pp
index a46391e..66d7789 100644
--- a/modules/role/manifests/elasticsearch/analytics.pp
+++ b/modules/role/manifests/elasticsearch/analytics.pp
@@ -1,5 +1,7 @@
 # Supports CirrusSearch usage on the analytics cluster
 class role::elasticsearch::analytics {
+class { 'mjolnir': }
+
 # wikimedia/discovery/analytics will be deployed to this node
 scap::target { 'wikimedia/discovery/analytics':
 deploy_user => 'deploy-service',

-- 
To view, visit https://gerrit.wikimedia.org/r/394002
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I599341bd16ecba0a2b8d8132fde6fe3d1443d754
Gerrit-PatchSet: 1
Gerrit-Project: operations/puppet

[MediaWiki-commits] [Gerrit] search...deploy[master]: bump mjolnir dependency to master

2017-11-28 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/393997 )

Change subject: bump mjolnir dependency to master
..


bump mjolnir dependency to master

Change-Id: I4890f19f29827d8858248044322a1266871ad8be
---
M src
1 file changed, 1 insertion(+), 1 deletion(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/src b/src
index 0d7fdcf..5799ac9 16
--- a/src
+++ b/src
@@ -1 +1 @@
-Subproject commit 0d7fdcf27b51b848a8c964f3c204f195c376dea5
+Subproject commit 5799ac99ccb095964d6550a9e45ee7abca768b55

-- 
To view, visit https://gerrit.wikimedia.org/r/393997
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I4890f19f29827d8858248044322a1266871ad8be
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 
Gerrit-Reviewer: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: bump mjolnir dependency to master

2017-11-28 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/393997 )

Change subject: bump mjolnir dependency to master
..

bump mjolnir dependency to master

Change-Id: I4890f19f29827d8858248044322a1266871ad8be
---
M src
1 file changed, 1 insertion(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/97/393997/1

diff --git a/src b/src
index 0d7fdcf..5799ac9 16
--- a/src
+++ b/src
@@ -1 +1 @@
-Subproject commit 0d7fdcf27b51b848a8c964f3c204f195c376dea5
+Subproject commit 5799ac99ccb095964d6550a9e45ee7abca768b55

-- 
To view, visit https://gerrit.wikimedia.org/r/393997
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I4890f19f29827d8858248044322a1266871ad8be
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port prefer_recent_api.feature to nodejs

2017-11-27 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/393693 )

Change subject: Port prefer_recent_api.feature to nodejs
..

Port prefer_recent_api.feature to nodejs

* Remove @expect_failure tag from scenario outlines
* Drop test in final scenario with settings `.4,.0001`. It
  doesn't pass, and I'm not really sure why or what it's supposed
  to do.
* Tune down the pause between the first and second stage of the hook
  from 20s to 5s. At least locally this seems to still work.
* Drop the final pause in the hook and replace it with deletes at the top
  of the hook. Best I can tell the pause was to ensure the
  final edit made it into elasticsearch. We can check the edit,
  but since we don't check revision ids we need to pre-delete
  so the check actually waits.

Change-Id: I7fbf7b9945f71b0e46a769ec5b2ebec6f338af14
---
A tests/integration/features/prefer_recent_api.feature
M tests/integration/features/support/hooks.js
2 files changed, 38 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/93/393693/1

diff --git a/tests/integration/features/prefer_recent_api.feature 
b/tests/integration/features/prefer_recent_api.feature
new file mode 100644
index 000..553b026
--- /dev/null
+++ b/tests/integration/features/prefer_recent_api.feature
@@ -0,0 +1,24 @@
+@clean @api @prefer_recent
+Feature: Searches with prefer-recent
+  Scenario Outline: Recently updated articles are prefered if prefer-recent: 
is specified
+When I api search for PreferRecent First OR Second OR Third
+Then PreferRecent Second Second is the first api search result
+When I api search for prefer-recent: PreferRecent First OR Second 
OR Third
+Then PreferRecent Third is the first api search result
+  Examples:
+|   options   |
+| 1,.001  |
+| 1,0.001 |
+| 1,.0001 |
+| .99,.0001   |
+| .99,.001|
+
+  Scenario Outline: You can specify prefer-recent: in such a way that being 
super recent isn't enough
+When I api search for prefer-recent: PreferRecent First OR Second 
OR Third
+Then PreferRecent Second Second is the first api search result
+  Examples:
+|  options  |
+|   |
+| 1 |
+| 1,1   |
+| 1,.1  |
diff --git a/tests/integration/features/support/hooks.js 
b/tests/integration/features/support/hooks.js
index 2c78666..ea9ad93 100644
--- a/tests/integration/features/support/hooks.js
+++ b/tests/integration/features/support/hooks.js
@@ -399,6 +399,17 @@
} ) );
 
BeforeOnce( { tags: "@prefer_recent", timeout: 6 }, 
Promise.coroutine( function* () {
+   // Deleting the pages first ensures we actually wait around for 
the edits to
+   // make it into the DB. Better might be if runBatch() could 
wait for revision id's,
+   // but it doesn't (yet).
+   yield runBatch( this, false, {
+   delete: [
+   'PreferRecent First',
+   'PreferRecent Second Second',
+   'PreferRecent Third',
+   ]
+   } );
+
yield runBatch( this, false, {
edit: {
// Using epochs as content ensures the page is 
edited.
@@ -407,17 +418,15 @@
}
} );
 
-   // We need to wait around to ensure the next page has enough 
time difference
-   // for prefer-recent to reorder things
-   yield this.stepHelpers.waitForMs( 2 );
+   // We need to wait around to ensure the next page has enough 
time
+   // difference for prefer-recent to reorder things.
+   yield this.stepHelpers.waitForMs( 5000 );
 
yield runBatch( this, false, {
edit: {
'PreferRecent Third': "" + ( new Date() / 1 )
}
} );
-   // TODO: Why are we waiting here?
-   yield this.stepHelpers.waitForMs( 1 );
} ) );
 
BeforeOnce( { tags: "@hastemplate" }, runBatchFn( {

-- 
To view, visit https://gerrit.wikimedia.org/r/393693
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7fbf7b9945f71b0e46a769ec5b2ebec6f338af14
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port update_general_api.feature to nodejs

2017-11-27 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/393694 )

Change subject: Port update_general_api.feature to nodejs
..

Port update_general_api.feature to nodejs

* Rework cirrusdoc api query to source page ids from archive and
 page table, whichever is more recent. This allows figuring out when
 deletes have made it into elasticsearch.
* Rewrite all the 'within ...' clauses to direct query/check after
 waiting for the previous operation to go through
* The only operation we can't directly wait for is the template
 update, which for unrelated reasons is broken on my MWV, so it is
 commented out
* The archive search is only exposed via browser, so its test
 uses the browser. As Special:Undelete requires special rights this
 meant rigging up a login method for the browser.
* Changed baseurl from dev.wiki -> cirrustest.wiki. This probably
 needs to be handled more generically though to support browser with
 multiple wikis.
* Adjust waitForOperation to take a revision id, and make the step
 helpers' editPage method pass the new revision id into waitForOperation.
 Without this an edit to a page that already exists is not waited for and
 fails.
* Implement step helpers movePage(). Waiting for the move to make it
 into cirrus required adding an additional check to the cirrusdoc query
 that the requested page matches the elastic page. This probably still
 has issues if a redirect points to the moved page, but we don't test
 that.
* Support %{epoch} transformation in steps. This required normalizing
 all parameters that should support this to (.+), removing uses of (.*)
 as cucumber-js doesn't have a generic transformation step, only one on
 individual capture patterns.

Change-Id: I99c0ef1e3453fedea5f3afbe29e5e8f9dd73d7e4
---
M includes/Api/QueryCirrusDoc.php
M tests/integration/config/wdio.conf.js
M tests/integration/features/step_definitions/page_step_helpers.js
M tests/integration/features/step_definitions/page_steps.js
M tests/integration/features/support/hooks.js
M tests/integration/features/support/pages/page.js
A tests/integration/features/support/pages/special_undelete.js
M tests/integration/features/support/world.js
A tests/integration/features/update_general_api.feature
9 files changed, 373 insertions(+), 66 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/94/393694/1

diff --git a/includes/Api/QueryCirrusDoc.php b/includes/Api/QueryCirrusDoc.php
index 27e2216..e03ee1c 100644
--- a/includes/Api/QueryCirrusDoc.php
+++ b/includes/Api/QueryCirrusDoc.php
@@ -3,7 +3,7 @@
 namespace CirrusSearch\Api;
 
 use CirrusSearch\Searcher;
-use CirrusSearch\Updater;
+use PageArchive;
 use Title;
 
 /**
@@ -31,51 +31,139 @@
 class QueryCirrusDoc extends \ApiQueryBase {
use ApiTrait;
 
+   private $config;
+   private $searcher;
+
public function __construct( \ApiQuery $query, $moduleName ) {
parent::__construct( $query, $moduleName, 'cd' );
}
 
public function execute() {
$conn = $this->getCirrusConnection();
-   $config = $this->getSearchConfig();
-   $updater = new Updater( $conn, $config );
-   $searcher = new Searcher( $conn, 0, 0, $config, [], 
$this->getUser() );
-   $result = [];
+   $this->config = $this->getSearchConfig();
+   $this->searcher = new Searcher( $conn, 0, 0, $this->config, [], 
$this->getUser() );
foreach ( $this->getPageSet()->getGoodTitles() as $origPageId 
=> $title ) {
-   list( $page, $redirects ) = $updater->traceRedirects( 
$title );
-
-   $result = [];
-   if ( $page ) {
-   $docId = $config->makeId( $page->getId() );
-   // could be optimized by implementing multi-get 
but not
-   // expecting much usage except debugging/tests.
-   $esSources = $searcher->get( [ $docId ], true );
-   if ( $esSources->isOK() ) {
-   foreach ( $esSources->getValue() as $i 
=> $esSource ) {
-   // If we have followed 
redirects only report the
-   // article dump if the redirect 
has been indexed. If it
-   // hasn't been indexed this 
document does not represent
-   // the original title.
-   if ( count( $redirects ) &&
-   !$this->hasRedirect( 
$esSource->getData(), $title )
-   ) {
-   continue;
-   

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: port frozen_index_api.feature to nodejs

2017-11-20 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392547 )

Change subject: port frozen_index_api.feature to nodejs
..

port frozen_index_api.feature to nodejs

* Deleted test marked @expect_failure
* Converted `within` to plain search/check steps
* Had to add 3 second pauses for that to work. Not sure why :S
* Implemented missing steps

Change-Id: Ib93a3859334920a0363e1498b124c857c2632d24
---
A tests/integration/features/frozen_index_api.feature
M tests/integration/features/step_definitions/page_step_helpers.js
M tests/integration/features/step_definitions/page_steps.js
3 files changed, 76 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/47/392547/1

diff --git a/tests/integration/features/frozen_index_api.feature 
b/tests/integration/features/frozen_index_api.feature
new file mode 100644
index 000..53657a1
--- /dev/null
+++ b/tests/integration/features/frozen_index_api.feature
@@ -0,0 +1,31 @@
+@frozen
+Feature: Mutations to frozen indexes are properly delayed
+  Scenario: Updates to frozen indexes are delayed
+   Given I delete FrozenTest
+ And a page named FrozenTest exists with contents foobarbaz
+ And I wait 3 seconds
+ And I api search for foobarbaz
+ And FrozenTest is the first api search result
+ And I globally freeze indexing
+ And a page named FrozenTest exists with contents superduperfrozen
+ And I wait 10 seconds
+ And I api search for superduperfrozen
+ And FrozenTest is not in the api search results
+When I globally thaw indexing
+ And I wait 10 seconds
+Then I api search for superduperfrozen yields FrozenTest as the first 
result
+
+  Scenario: Deletes to frozen indexes are delayed
+   Given a page named FrozenDeleteTest exists with contents bazbarfoo
+ And I wait 3 seconds
+ And I api search for bazbarfoo
+ And FrozenDeleteTest is the first api search result
+ And I globally freeze indexing
+ And I delete FrozenDeleteTest
+ And a page named FrozenDeleteTest exists with contents mrfreeze recreated 
this page to work around mediawiki's behavior of not showing deleted pages in 
search results.  mrfreeze is surprisingly helpful.
+ And I wait 10 seconds
+ And I api search for bazbarfoo
+ And FrozenDeleteTest is the first api search result
+When I globally thaw indexing
+ And I wait 10 seconds
+Then I api search for bazbarfoo yields no results
diff --git a/tests/integration/features/step_definitions/page_step_helpers.js 
b/tests/integration/features/step_definitions/page_step_helpers.js
index ca4a92b..07c1c71 100644
--- a/tests/integration/features/step_definitions/page_step_helpers.js
+++ b/tests/integration/features/step_definitions/page_step_helpers.js
@@ -13,7 +13,8 @@
 const expect = require( 'chai' ).expect,
fs = require( 'fs' ),
path = require( 'path' ),
-   Promise = require( 'bluebird' ); // jshint ignore:line
+   Promise = require( 'bluebird' ), // jshint ignore:line
+   articlePath = path.dirname(path.dirname(path.dirname(__dirname))) + 
'/browser/articles/';
 
 class StepHelpers {
constructor( world, wiki ) {
@@ -39,12 +40,23 @@
} );
}
 
+   uploadFile( title, fileName, description ) {
+   return Promise.coroutine( function* () {
+   let client = yield this.apiPromise;
+   let filePath = path.join( articlePath, fileName );
+   yield client.batch( [
+   [ 'upload', fileName, filePath, '', { text: 
description } ]
+   ] );
+   yield this.waitForOperation( 'upload', fileName );
+   } ).call( this );
+   }
+
editPage( title, text, append = false ) {
return Promise.coroutine( function* () {
let client = yield this.apiPromise;
 
if ( text[0] === '@' ) {
-   text = fs.readFileSync( path.join( __dirname, 
'articles', text.substr( 1 ) ) ).toString();
+   text = fs.readFileSync( path.join( articlePath, 
text.substr( 1 ) ) ).toString();
}
let fetchedText = yield this.getWikitext( title );
if ( append ) {
diff --git a/tests/integration/features/step_definitions/page_steps.js 
b/tests/integration/features/step_definitions/page_steps.js
index d7881db..f5349a7 100644
--- a/tests/integration/features/step_definitions/page_steps.js
+++ b/tests/integration/features/step_definitions/page_steps.js
@@ -309,4 +309,35 @@
this.searchVars[varname] = yield 
this.stepHelpers.pageIdOf( title );
} ).call( this );
} );
+
+   Then( /^I wait (\d+) seconds/, function ( seconds ) 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: port relevancy_api.feature to nodejs

2017-11-20 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392546 )

Change subject: port relevancy_api.feature to nodejs
..

port relevancy_api.feature to nodejs

* Removed 'within' steps. These are unnecessary now
 that we wait for pages to exist in cirrus after edits
* Removed one test marked @expect_failure

Change-Id: I1804bd4f13c110f960b8b7b04552beecb21658b8
---
A tests/integration/features/relevancy_api.feature
1 file changed, 104 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/46/392546/1

diff --git a/tests/integration/features/relevancy_api.feature 
b/tests/integration/features/relevancy_api.feature
new file mode 100644
index 000..4f40a5a
--- /dev/null
+++ b/tests/integration/features/relevancy_api.feature
@@ -0,0 +1,104 @@
+@clean @api @relevancy
+Feature: Results are ordered from most relevant to least.
+  Scenario: Words in order are worth more than words out of order
+When I api search for Relevancytwo Wordtest
+Then Relevancytwo Wordtest is the first api search result
+  And Wordtest Relevancytwo is the second api search result
+
+  Scenario: Results are sorted based on namespace: main, talk, file, help, 
file talk, etc
+When I api search for all:Relevancynamespacetest
+Then Relevancynamespacetest is the first api search result
+  And Talk:Relevancynamespacetest is the second api search result
+  And File:Relevancynamespacetest is the third api search result
+  And Help:Relevancynamespacetest is the fourth api search result
+  And File talk:Relevancynamespacetest is the fifth api search result
+  And User talk:Relevancynamespacetest is the sixth api search result
+  And Template:Relevancynamespacetest is the seventh api search result
+
+  Scenario: When the user doesn't set a language, results are sorted with wiki language ahead of other languages
+When I api search for Relevancylanguagetest
+Then Relevancylanguagetest/en is the first api search result
+
+  Scenario: Redirects count as incoming links
+Given a page named Relevancyredirecttest Smaller exists with contents 
Relevancyredirecttest A text text text text text text text text text text text 
text text
+  And a page named Relevancyredirecttest Smaller/A exists with contents 
[[Relevancyredirecttest Smaller]]
+  And a page named Relevancyredirecttest Smaller/B exists with contents 
[[Relevancyredirecttest Smaller]]
+  And a page named Relevancyredirecttest Larger exists with contents 
Relevancyredirecttest B text text text text text text text text text text text 
text text
+  And a page named Relevancyredirecttest Larger/Redirect exists with 
contents #REDIRECT [[Relevancyredirecttest Larger]]
+  And a page named Relevancyredirecttest Larger/A exists with contents 
[[Relevancyredirecttest Larger]]
+  And a page named Relevancyredirecttest Larger/B exists with contents 
[[Relevancyredirecttest Larger/Redirect]]
+  And a page named Relevancyredirecttest Larger/C exists with contents 
[[Relevancyredirecttest Larger/Redirect]]
+  And I api search for Relevancyredirecttest
+ Then Relevancyredirecttest Larger is the first api search result
+  And Relevancyredirecttest Smaller is the second api search result
+# Note that this test can fail spuriously in two ways:
+# 1. If the required pages are created as part of the hook for @relevancy it's quite possible for the large influx
+# of jobs to cause the counting jobs to not pick up all the counts. I'm not super sure why that is but moving the
+# creation into its own section makes it pretty consistent.
+# 2. It's quite possible for the second result to be deeper in the result list for a few seconds after the pages are
+# created. It gets its position updated by the link counting job which has to wait for refreshing and undelaying.
+
+  # Last two tests use "sixth or seventh" because the current implementation 
of the all field
+  # and the copy_to hack will copy the content only one time for both text and 
auxiliary_text
+  # auxiliary_text is set to 0.5 but will be approximated to 1 (similar to 
text)
+  # phrase freq will be identical for both fields making length norms the sole 
discriminating
+  # criteria.
+  Scenario: Results are sorted based on what part of the page matches: title, 
redirect, category, etc
+When I api search with query independent profile classic_noboostlinks for 
"Relevancytestphrase phrase"
+Then Relevancytestphrase phrase is the first api search result
+  And Relevancytestphraseviaredirect is the second api search result
+  And Relevancytestphraseviacategory is the third api search result
+  And Relevancytestphraseviaheading is the fourth api search result
+  And Relevancytestphraseviaopening is the fifth api search result
+  And Relevancytestphraseviatext 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port linksto and more_like tests to nodejs

2017-11-20 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392538 )

Change subject: Port linksto and more_like tests to nodejs
..

Port linksto and more_like tests to nodejs

Change-Id: I9edd45add51bfad56dd87650de520bfeb08b9a20
---
A tests/integration/features/linksto_api.feature
A tests/integration/features/more_like_api.feature
M tests/integration/features/step_definitions/page_steps.js
M tests/integration/features/support/hooks.js
4 files changed, 104 insertions(+), 19 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/38/392538/1

diff --git a/tests/integration/features/linksto_api.feature 
b/tests/integration/features/linksto_api.feature
new file mode 100644
index 000..2503cc0
--- /dev/null
+++ b/tests/integration/features/linksto_api.feature
@@ -0,0 +1,23 @@
+@clean @filters @linksto @api
+Feature: Searches with the linksto filter
+  Scenario: linksto only includes pages with the links
+When I api search for linksto:"LinksToTest Target"
+Then LinksToTest Plain is in the api search results
+  And LinksToTest OtherText is in the api search results
+
+  Scenario: linksto can be combined with other text
+When I api search for linksto:"LinksToTest Target" text
+Then LinksToTest OtherText is the first api search result
+
+  Scenario: -linksto excludes pages with the link
+When I api search for -linksto:"LinksToTest Target" LinksToTest
+Then LinksToTest No Link is in the api search results
+  But LinksToTest Plain is not in the api search results
+
+  Scenario: linksto works on links from templates
+When I api search for linksto:"LinksToTest Target" Using Template
+Then LinksToTest Using Template is the first api search result
+
+  Scenario: linksto finds links in non-main namespace
+When I api search for linksto:"Template:LinksToTest Template"
+Then LinksToTest LinksToTemplate is the first api search result
diff --git a/tests/integration/features/more_like_api.feature 
b/tests/integration/features/more_like_api.feature
new file mode 100644
index 000..c04a321
--- /dev/null
+++ b/tests/integration/features/more_like_api.feature
@@ -0,0 +1,29 @@
+@clean @more_like_this @api
+Feature: More like an article
+  Scenario: Searching for morelike: returns no results
+When I api search for morelike:IDontExist
+Then there are no api search results
+
+  Scenario: Searching for morelike: returns pages that are "like" that 
page
+When I api search for morelike:More Like Me 1
+Then More Like Me is in the first api search result
+  But More Like Me 1 is not in the api search results
+
+  Scenario: Searching for morelike: returns pages that are "like" 
the page that it is a redirect to
+When I api search for morelike:More Like Me Rdir
+Then More Like Me is in the first api search result
+  But More Like Me 1 is not in the api search results
+
+  @redirect_loop
+  Scenario: Searching for morelike: returns no results
+When I api search for morelike:Redirect Loop
+Then there are no api search results
+
+  Scenario: Searching for morelike:|| returns pages that are 
"like" all those pages
+When I api search for morelike:More Like Me 1|More Like Me Set 2 Page 
1|More Like Me Set 3 Page 1
+Then More Like Me is part of the api search result
+  And More Like Me Set 2 is part of the api search result
+  And More Like Me Set 3 is part of the api search result
+  But More Like Me 1 is not in the api search results
+  And More Like Me Set 2 Page 1 is not in the api search results
+  And More Like Me Set 3 Page 1 is not in the api search results
diff --git a/tests/integration/features/step_definitions/page_steps.js 
b/tests/integration/features/step_definitions/page_steps.js
index 6763049..d7881db 100644
--- a/tests/integration/features/step_definitions/page_steps.js
+++ b/tests/integration/features/step_definitions/page_steps.js
@@ -147,9 +147,12 @@
}
} );
if ( in_ok ) {
-   // What exactly does this do?
-   // expect(found).to include(include(title))
-   throw new Error( 'Not Implemented' );
+   // Asserts that title is found within the 
strings that make up found.
+   // ex: found = ['foo bar baz'], title = 'bar' 
should pass.
+   // Chai doesn't (yet) have a native assertion for this:
+   // https://github.com/chaijs/chai/issues/858
+   let ok = found.reduce( ( a, b ) => a || 
b.indexOf( title ) > -1, false );
+   expect( ok, `expected ${JSON.stringify(found)} 
to include "${title}"` ).to.be.true; // jshint ignore:line
   

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Fixup unicode literals in feature files

2017-11-20 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392537 )

Change subject: Fixup unicode literals in feature files
..

Fixup unicode literals in feature files

And add all the tests that weren't passing because of it.
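
A hedged guess at the kind of fix involved (the page_steps.js change itself is
not shown in this message): expanding the %{\uXXXX}% escapes used in the
feature files into real characters before the query is sent to the api.

	// Hypothetical helper, not necessarily the actual implementation.
	function expandUnicodeEscapes( text ) {
		return text.replace( /%\{\\u([0-9a-fA-F]{4})\}%/g, ( match, hex ) =>
			String.fromCharCode( parseInt( hex, 16 ) ) );
	}

	// expandUnicodeEscapes( 'incategory:weaponry%{\\u0020}%incategory:weaponry' )
	// returns 'incategory:weaponry incategory:weaponry'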

Change-Id: Ie371e2fca42cc3298cf06ccb5b29f71af5af108f
---
A tests/integration/features/incategory_api.feature
A tests/integration/features/insource_api.feature
A tests/integration/features/intitle_api.feature
A tests/integration/features/phrase_prefix_api.feature
M tests/integration/features/step_definitions/page_step_helpers.js
M tests/integration/features/step_definitions/page_steps.js
6 files changed, 297 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/37/392537/1

diff --git a/tests/integration/features/incategory_api.feature 
b/tests/integration/features/incategory_api.feature
new file mode 100644
index 000..aa31bfe
--- /dev/null
+++ b/tests/integration/features/incategory_api.feature
@@ -0,0 +1,86 @@
+@clean @filters @incategory @api
+Feature: Searches with the incategory filter
+
+  Scenario: incategory: only includes pages with the category
+When I api search for incategory:weaponry
+Then Catapult is in the api search results
+  And Amazing Catapult is in the api search results
+  But Two Words is not in the api search results
+
+  Scenario: incategory: splits on | to create an OR query
+When I api search for incategory:weaponry|nothing
+Then Catapult is in the api search results
+  And Amazing Catapult is in the api search results
+  But Two Words is not in the api search results
+
+  Scenario Outline: incategory: does not fail when the category is unknown
+When I api search for incategory:<category>
+Then there are no api search results
+  Examples:
+|  category   |
+| doesnotexistatleastihopenot |
+| id:2147483600   |
+
+  Scenario: incategory: finds categories by page id
+When I locate the page id of Category:Weaponry and store it as 
%weaponry_id%
+ And I api search for incategory:id:%weaponry_id%
+Then Catapult is in the api search results
+  And Amazing Catapult is in the api search results
+  But Two Words is not in the api search results
+
+  Scenario: incategory: works on categories from templates
+When I api search for incategory:templatetagged incategory:twowords
+Then Two Words is the first api search result
+
+  Scenario: incategory works with multi word categories
+When I api search for incategory:"Categorywith Twowords"
+Then Two Words is the first api search result
+
+  Scenario: incategory can find categories containing quotes if the quote is 
escaped
+When I api search for incategory:"Categorywith \" Quote"
+Then Two Words is the first api search result
+
+  Scenario: incategory can be repeated
+When I api search for incategory:"Categorywith \" Quote" 
incategory:"Categorywith Twowords"
+Then Two Words is the first api search result
+
+  Scenario: incategory can find two word categories with underscores in place of spaces
+When I api search for incategory:Categorywith_Twowords
+Then Two Words is the first api search result
+
+  Scenario: incategory: when passed a quoted category that doesn't exist finds 
nothing even though there is a category that matches one of the words
+When I api search for incategory:"Dontfindme Weaponry"
+Then there are no api search results
+
+  Scenario: incategory when passed a single word category doesn't find a two 
word category that contains that word
+When I api search for incategory:ASpace
+Then there are no api search results
+
+  Scenario: incategory: finds a multiword category when it is surrounded by 
quotes
+When I api search for incategory:"CategoryWith ASpace"
+Then IHaveATwoWordCategory is the first api search result
+
+  Scenario: incategory: can be combined with other text
+When I api search for incategory:weaponry amazing
+Then Amazing Catapult is the first api search result
+
+  Scenario: -incategory: excludes pages with the category
+When I api search for -incategory:weaponry incategory:twowords
+Then Two Words is the first api search result
+
+  Scenario: incategory: can handle a space after the :
+When I api search for incategory: weaponry
+Then Catapult is in the api search results
+  And Amazing Catapult is in the api search results
+  But Two Words is not in the api search results
+
+  Scenario Outline: incategory: can handle multiple spaces between clauses
+When I api search for incategory:weaponry<spaces>incategory:weaponry
+Then Catapult is in the api search results
+  And Amazing Catapult is in the api search results
+  And Two Words is not in the api search results
+  Examples:
+|   spaces   |
+|%{\u0020}%%{\u0020}%|
+

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: [WIP] Add word count statistic for articles

2017-11-20 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392471 )

Change subject: [WIP] Add word count statistic for articles
..

[WIP] Add word count statistic for articles

The community survey asked for this feature, and it was pretty
straightforward to add to cirrus.

Change-Id: I847f696405b447ab04972ad0215c09d0012c2098
---
M CirrusSearch.php
M autoload.php
M includes/CirrusSearch.php
M includes/Hooks.php
A includes/Query/CountContentWordsBuilder.php
M includes/Search/ResultsType.php
M includes/Search/SearchContext.php
M includes/Search/SearchRequestBuilder.php
M includes/Searcher.php
9 files changed, 131 insertions(+), 4 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/71/392471/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index b7a8682..094d656 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -1302,6 +1302,7 @@
 $wgHooks[ 'SoftwareInfo' ][] = 'CirrusSearch\Hooks::onSoftwareInfo';
 $wgHooks[ 'SpecialSearchResults' ][] = 
'CirrusSearch\Hooks::onSpecialSearchResults';
 $wgHooks[ 'SpecialSearchResultsAppend' ][] = 
'CirrusSearch\Hooks::onSpecialSearchResultsAppend';
+$wgHooks[ 'SpecialStatsAddExtra'][] = 
'CirrusSearch\Hooks::onSpecialStatsAddExtra';
 $wgHooks[ 'TitleMove' ][] = 'CirrusSearch\Hooks::onTitleMove';
 $wgHooks[ 'TitleMoveComplete' ][] = 'CirrusSearch\Hooks::onTitleMoveComplete';
 $wgHooks[ 'UnitTestsList' ][] = 'CirrusSearch\Hooks::onUnitTestsList';
diff --git a/autoload.php b/autoload.php
index 094cff9..de0770f 100644
--- a/autoload.php
+++ b/autoload.php
@@ -115,6 +115,7 @@
'CirrusSearch\\Query\\BoostTemplatesFeature' => __DIR__ . 
'/includes/Query/BoostTemplatesFeature.php',
'CirrusSearch\\Query\\CompSuggestQueryBuilder' => __DIR__ . 
'/includes/Query/CompSuggestQueryBuilder.php',
'CirrusSearch\\Query\\ContentModelFeature' => __DIR__ . 
'/includes/Query/ContentModelFeature.php',
+   'CirrusSearch\\Query\\CountContentWordsBuilder' => __DIR__ . 
'/includes/Query/CountContentWordsBuilder.php',
'CirrusSearch\\Query\\FileNumericFeature' => __DIR__ . 
'/includes/Query/FileNumericFeature.php',
'CirrusSearch\\Query\\FileTypeFeature' => __DIR__ . 
'/includes/Query/FileTypeFeature.php',
'CirrusSearch\\Query\\FullTextQueryBuilder' => __DIR__ . 
'/includes/Query/FullTextQueryBuilder.php',
@@ -195,6 +196,7 @@
'CirrusSearch\\Search\\SearchMetricsProvider' => __DIR__ . 
'/includes/Search/SearchMetricsProvider.php',
'CirrusSearch\\Search\\SearchRequestBuilder' => __DIR__ . 
'/includes/Search/SearchRequestBuilder.php',
'CirrusSearch\\Search\\ShortTextIndexField' => __DIR__ . 
'/includes/Search/ShortTextIndexField.php',
+   'CirrusSearch\\Search\\SingleAggResultsType' => __DIR__ . 
'/includes/Search/ResultsType.php',
'CirrusSearch\\Search\\SourceTextIndexField' => __DIR__ . 
'/includes/Search/SourceTextIndexField.php',
'CirrusSearch\\Search\\StaticCrossProjectBlockScorer' => __DIR__ . 
'/includes/Search/CrossProjectBlockScorer.php',
'CirrusSearch\\Search\\TeamDraftInterleaver' => __DIR__ . 
'/includes/Search/TeamDraftInterleaver.php',
diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index 089589a..52b8e3f 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -798,10 +798,7 @@
return Status::newGood( [] );
}
 
-   $searcher = new Searcher( $this->connection, $this->offset, 
$this->limit, $this->config, $this->namespaces,
-   null, $this->indexBaseName );
-   $searcher->setOptionsFromRequest( $this->request );
-
+   $searcher = $this->makeSearcher();
$status = $searcher->searchArchive( $term );
if ( $status->isOK() && $searcher->isReturnRaw() ) {
$status->setResult( true,
@@ -810,4 +807,22 @@
return $status;
}
 
+   public function countContentWords() {
+   $this->limit = 1;
+   $searcher = $this->makeSearcher();
+   $status = $searcher->countContentWords();
+
+   if ( $status->isOK() && $searcher->isReturnRaw() ) {
+   $status->setResult( true,
+   $searcher->processRawReturn( 
$status->getValue(), $this->request, $this->dumpAndDie ) );
+   }
+   return $status;
+   }
+
+   private function makeSearcher() {
+   $searcher = new Searcher( $this->connection, $this->offset, 
$this->limit, $this->config, $this->namespaces,
+   null, $this->indexBaseName );
+   $searcher->setOptionsFromRequest( $this->request );
+   return $searcher;
+   }
 }
diff --git a/includes/Hooks.php b/includes/Hooks.php
index 44cbc9d..8b783f8 100644
--- 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Handle errors better in the tag tracker

2017-11-20 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392472 )

Change subject: Handle errors better in the tag tracker
..

Handle errors better in the tag tracker

When working on tests, if you broke a tag all future
uses of the tag would just wait until cucumber
timed them out, which is very painful. Rework
the tracking so it remembers failures.

Change-Id: I236780e30cab37884a569f0c6d27d11751fc4ee6
---
M tests/integration/features/support/hooks.js
M tests/integration/features/support/world.js
M tests/integration/lib/tracker.js
3 files changed, 46 insertions(+), 26 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/72/392472/1

diff --git a/tests/integration/features/support/hooks.js 
b/tests/integration/features/support/hooks.js
index 19fa8d0..45f18c1 100644
--- a/tests/integration/features/support/hooks.js
+++ b/tests/integration/features/support/hooks.js
@@ -12,9 +12,20 @@
const BeforeOnce = function ( options, fn ) {
Before( options, Promise.coroutine( function* () {
const status = yield this.tags.check( options.tags );
-   if ( status === 'new' ) {
-   yield fn.call( this );
+   if ( status === 'complete' ) {
+   return;
+   } else if ( status === 'new' ) {
+   try {
+   yield fn.call( this );
+   } catch ( err ) {
+   yield this.tags.reject( options.tags );
+   return;
+   }
yield this.tags.complete( options.tags );
+   } else if ( status === 'reject' ) {
+   throw new Error( 'Tag failed to initialize 
previously' );
+   } else {
+   throw new Error( 'Unknown tag check status: ' + 
status );
}
} ) );
};
diff --git a/tests/integration/features/support/world.js 
b/tests/integration/features/support/world.js
index 7e8c9c2..9df4f45 100644
--- a/tests/integration/features/support/world.js
+++ b/tests/integration/features/support/world.js
@@ -12,6 +12,7 @@
  */
 const {defineSupportCode} = require( 'cucumber' ),
net = require( 'net' ),
+   log = require( 'semlog' ).log,
Bot = require( 'mwbot' ),
StepHelpers = require( '../step_definitions/page_step_helpers' ),
Page = require( './pages/page' ),
@@ -29,7 +30,7 @@
this.pendingResponses = {};
this.connection.on( 'data', ( data ) => {
let parsed = JSON.parse( data );
-   console.log( `received response for request 
${parsed.requestId}: ${data}` );
+   log( `received response for request 
${parsed.requestId}: ${data}` );
if ( parsed && this.pendingResponses[parsed.requestId] 
) {
this.pendingResponses[parsed.requestId]( parsed 
);
delete this.pendingResponses[parsed.requestId];
@@ -41,7 +42,7 @@
req.requestId = this.nextRequestId++;
return new Promise( ( resolve ) => {
let data = JSON.stringify( req );
-   console.log( `Issuing request: ${data}` );
+   log( `Issuing request: ${data}` );
this.pendingResponses[req.requestId] = resolve;
this.connection.write( data );
} );
@@ -50,17 +51,27 @@
check( tag ) {
return Promise.coroutine( function* () {
if ( this.tags[tag] ) {
-   return 'complete';
+   return this.tags[tag];
}
let response = yield this.request( {
check: tag
} );
-   this.tags[tag] = true;
+   if ( response.status === 'complete' || response.status 
=== 'reject' ) {
+   this.tags[tag] = response.status;
+   }
return response.status;
} ).call( this );
}
 
+   reject( tag ) {
+   this.tags[tag] = 'reject';
+   return this.request( {
+   reject: tag
+   } );
+   }
+
complete( tag ) {
+   this.tags[tag] = 'complete';
return this.request( {
complete: tag
} );
@@ -157,10 +168,10 @@
if ( !tmpUrl ) {

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Add API action for dumping cirrus articles

2017-11-20 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392469 )

Change subject: Add API action for dumping cirrus articles
..

Add API action for dumping cirrus articles

This is particularly convenient for the browser tests to use, so they
can ping the api to see if an article that it created/updated is now
in cirrussearch.
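
A minimal sketch of how a test could poll the new module (the wiki URL is a
made-up example; the module name and required title parameter come from the
registration and getAllowedParams() below):

	const http = require( 'http' );

	// Hypothetical wiki under test.
	const url = 'http://localhost:8080/w/api.php?action=cirrus-article-dump&format=json&title=' +
		encodeURIComponent( 'Two Words' );

	http.get( url, ( res ) => {
		let body = '';
		res.on( 'data', ( chunk ) => { body += chunk; } );
		res.on( 'end', () => {
			let dump = JSON.parse( body )['cirrus-article-dump'];
			// An empty list means the page has not made it into elasticsearch yet.
			console.log( dump.length ? 'indexed' : 'not indexed yet' );
		} );
	} );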

Change-Id: I5dbd02592eebb166362c7cb9dabcd2b93bae66c5
---
M CirrusSearch.php
M autoload.php
A includes/Api/ArticleDump.php
M includes/Connection.php
M tests/integration/features/step_definitions/page_step_helpers.js
5 files changed, 133 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/69/392469/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index bf53382..b7a8682 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -1336,6 +1336,7 @@
 $wgAPIModules['cirrus-config-dump'] = 'CirrusSearch\Api\ConfigDump';
 $wgAPIModules['cirrus-mapping-dump'] = 'CirrusSearch\Api\MappingDump';
 $wgAPIModules['cirrus-settings-dump'] = 'CirrusSearch\Api\SettingsDump';
+$wgAPIModules['cirrus-article-dump'] = 'CirrusSearch\Api\ArticleDump';
 
 /**
  * Configs
diff --git a/autoload.php b/autoload.php
index 1cb17de..094cff9 100644
--- a/autoload.php
+++ b/autoload.php
@@ -6,6 +6,7 @@
 $wgAutoloadClasses += [
'CirrusSearch' => __DIR__ . '/includes/CirrusSearch.php',
'CirrusSearch\\Api\\ApiBase' => __DIR__ . '/includes/Api/ApiBase.php',
+   'CirrusSearch\\Api\\ArticleDump' => __DIR__ . 
'/includes/Api/ArticleDump.php',
'CirrusSearch\\Api\\ConfigDump' => __DIR__ . 
'/includes/Api/ConfigDump.php',
'CirrusSearch\\Api\\FreezeWritesToCluster' => __DIR__ . 
'/includes/Api/FreezeWritesToCluster.php',
'CirrusSearch\\Api\\MappingDump' => __DIR__ . 
'/includes/Api/MappingDump.php',
diff --git a/includes/Api/ArticleDump.php b/includes/Api/ArticleDump.php
new file mode 100644
index 000..5840eb9
--- /dev/null
+++ b/includes/Api/ArticleDump.php
@@ -0,0 +1,93 @@
+http://www.gnu.org/copyleft/gpl.html
+ */
+class ArticleDump extends ApiBase {
+   public function execute() {
+   $conn = $this->getCirrusConnection();
+   $config = $conn->getConfig();
+   $searcher = new Searcher( $conn, 0, 0, $config, [], 
$this->getUser() );
+
+   $params = $this->extractRequestParams();
+   $title = Title::newFromText( $params['title'] );
+   if ( !$title->exists() ) {
+   $this->dieWithError( 'apierror-missingtitle' );
+   }
+
+   // Reuse updater to find the final target post-redirect
+   $updater = new Updater( $conn, $config );
+   list( $page, $redirects ) = $updater->traceRedirects( $title );
+
+   if ( !$page ) {
+   // Slight lie .. the title itself exists but not the 
redirect target.
+   // Use custom error message?
+   $this->dieWithError( 'apierror-missingtitle' );
+   }
+
+   $docId = $config->makeId( $page->getId() );
+   $esSources = $searcher->get( [ $docId ], true );
+   $result = [];
+   if ( $esSources->isOK() ) {
+   foreach ( $esSources->getValue() as $i => $esSource ) {
+   $result[] = [
+   'index' => $esSource->getIndex(),
+   'type' => $esSource->getType(),
+   'id' => $esSource->getId(),
+   'version' => $esSource->getVersion(),
+   'source' => $esSource->getData(),
+   ];
+   }
+   }
+   $this->getResult()->addValue( null, 'cirrus-article-dump', 
$result );
+   }
+
+   public function getAllowedParams() {
+   return [
+   'title' => [
+   ApiBase::PARAM_TYPE => 'string',
+   ApiBase::PARAM_REQUIRED => true,
+   ],
+   ];
+   }
+
+   /**
+* @deprecated since MediaWiki core 1.25
+*/
+   public function getDescription() {
+   return 'Dump stored CirrusSearch document for article.';
+   }
+
+   /**
+* @see ApiBase::getExamplesMessages
+* @return array
+*/
+   protected function getExamplesMessages() {
+   return [
+   'action=cirrus-article-dump' =>
+   'apihelp-cirrus-article-dump-example'
+   ];
+   }
+
+}
diff --git a/includes/Connection.php b/includes/Connection.php
index a17ce88..8e2e389 100644
--- a/includes/Connection.php
+++ b/includes/Connection.php

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Get all nodejs tests passing from empty database

2017-11-20 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/392470 )

Change subject: Get all nodejs tests passing from empty database
..

Get all nodejs tests passing from empty database

* Use the new cirrus-article-dump api to wait for edits to make it into
  elastic. Failures from an empty database seem almost entirely tied to
  tests running before the articles have made it into cirrus.
* Convert one-off batch calls in hooks.js to use a single function
  so we don't duplicate checking the batch has made it into elastic
* While at it, reduce some promise spaghetti by converting things
  over to bluebird coroutines (see the sketch after this list). If we
  require nodejs >= 7.6 we could use async/await directly, but
  coroutines allow us to support node 6, which is the default on many
  distributions.
* Swap config over to headless while we are here.
* Put the @suggest hook, that builds the completion suggester, at the
  end of the hooks file. Cucumberjs seems to run these hooks in the
  order they are defined, so this ensures all the other tags from
  prefix_search_api.feature have run already
* Merge suggest_api.feature with prefix_search_api.feature, as they
  both use the @suggest tag.
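
A rough illustration of the coroutine pattern mentioned above, using bluebird's
Promise.coroutine and Promise.delay (the function names here are made up, not
the ones in the suite):

	const Promise = require( 'bluebird' );

	// Promise-chaining style.
	function waitAndGreetThen( ms ) {
		return Promise.delay( ms ).then( () => 'hello' ).then( ( greeting ) => {
			console.log( greeting );
		} );
	}

	// Coroutine style; reads top to bottom and runs on node 6 without async/await.
	const waitAndGreetCo = Promise.coroutine( function* ( ms ) {
		yield Promise.delay( ms );
		let greeting = yield Promise.resolve( 'hello' );
		console.log( greeting );
	} );

	waitAndGreetCo( 100 );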

Change-Id: Ie2f3142d8af9036a6a6e473a2a7d2fd557abeaca
---
M tests/integration/config/wdio.conf.js
M tests/integration/features/prefix_search_api.feature
M tests/integration/features/step_definitions/page_step_helpers.js
M tests/integration/features/step_definitions/page_steps.js
D tests/integration/features/suggest_api.feature
M tests/integration/features/support/hooks.js
M tests/integration/features/support/world.js
7 files changed, 363 insertions(+), 370 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/70/392470/1

diff --git a/tests/integration/config/wdio.conf.js 
b/tests/integration/config/wdio.conf.js
index 179e302..2cd2fb5 100644
--- a/tests/integration/config/wdio.conf.js
+++ b/tests/integration/config/wdio.conf.js
@@ -122,7 +122,7 @@
browserName: 'chrome',
// Since Chrome v57 
https://bugs.chromium.org/p/chromedriver/issues/detail?id=1625
chromeOptions: {
-   args: [ '--enable-automation' ]
+   args: [ '--enable-automation', '--headless' ]
}
} ],
//
diff --git a/tests/integration/features/prefix_search_api.feature 
b/tests/integration/features/prefix_search_api.feature
index e84e830..e524c1b 100644
--- a/tests/integration/features/prefix_search_api.feature
+++ b/tests/integration/features/prefix_search_api.feature
@@ -45,7 +45,6 @@
 
   Scenario: Searching for a bare namespace finds everything in the namespace
 Given a page named Template talk:Foo exists
-  And within 20 seconds api searching for Template talk:Foo yields 
Template talk:Foo as the first result
 When I get api suggestions for template talk:
 Then Template talk:Foo is in the api suggestions
 
@@ -155,3 +154,85 @@
   # And there are 1000 redirects to IHaveTonsOfRedirects of the form 
TonsOfRedirects%s
   #   When I type TonsOfRedirects into the search box
   #   Then suggestions should appear
+
+  Scenario: Search suggestions
+When I ask suggestion API for main
+ Then the API should produce list containing Main Page
+
+  Scenario: Created pages suggestions
+When I ask suggestion API for x-m
+  Then the API should produce list containing X-Men
+
+  Scenario: Nothing to suggest
+When I ask suggestion API for jabberwocky
+  Then the API should produce empty list
+
+  Scenario: Ordering
+When I ask suggestion API for x-m
+  Then the API should produce list starting with X-Men
+
+  Scenario: Fuzzy
+When I ask suggestion API for xmen
+  Then the API should produce list starting with X-Men
+
+  Scenario: Empty tokens
+When I ask suggestion API for はー
+  Then the API should produce list starting with はーい
+  And I ask suggestion API for はい
+  Then the API should produce list starting with はーい
+
+  Scenario Outline: Search redirects shows the best redirect
+When I ask suggestion API for <term>
+  Then the API should produce list containing <suggested>
+  Examples:
+|   term  |suggested  |
+| eise| Eisenhardt, Max   |
+| max | Max Eisenhardt|
+| magnetu | Magneto   |
+
+  Scenario Outline: Search prefers exact match over fuzzy match and ascii 
folded
+When I ask suggestion API for <term>
+  Then the API should produce list starting with <suggested>
+  Examples:
+|   term  |suggested  |
+| max | Max Eisenhardt|
+| mai | Main Page |
+| eis | Eisenhardt, Max   |
+| ele | Elektra   |
+| éle | Électricité   |
+
+  Scenario Outline: Search prefers exact db match over partial prefix match
+When I ask suggestion API at 

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Speed up DBN evaluation.

2017-11-15 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391728 )

Change subject: Speed up DBN evaluation.
..

Speed up DBN evaluation.

The toDF() call in dbn.py causes us to evaluate one partition on its own
for spark to figure out what the field types are. Later spark will
evaluate the other 199 partitions. In a test with a dataframe
containing enwiki and dewiki a single partition can take up to 15
minutes. Avoid this by defining the schema explicitly instead of making
spark figure it out.

15 minutes is also a long time for a single partition to run. Use a
heuristic to increase the number of partitions from 200 up to 2000 when
we have more data. In tests this patch cut the total dbn time from 23
minutes to 8.

Change-Id: I14d663f49a54b7bd130186aebfbeffde1e1a6d82
---
M mjolnir/dbn.py
M mjolnir/utilities/data_pipeline.py
2 files changed, 19 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/28/391728/1

diff --git a/mjolnir/dbn.py b/mjolnir/dbn.py
index 536064e..8c4ab76 100644
--- a/mjolnir/dbn.py
+++ b/mjolnir/dbn.py
@@ -9,6 +9,7 @@
 import json
 import pyspark.sql
 from pyspark.sql import functions as F
+from pyspark.sql import types as T
 import mjolnir.spark
 
 
@@ -179,7 +180,7 @@
 model.train(sessions)
 return _extract_labels_from_dbn(model, reader)
 
-return (
+rdd_rel = (
 df
 # group and collect up the hits for individual (wikiid, norm_query_id,
 # session_id) tuples to match how the dbn expects to receive data.
@@ -192,7 +193,14 @@
 # of grouping into python, but that could just as well end up worse?
 .repartition(num_partitions, 'wikiid', 'norm_query_id')
 # Run each partition through the DBN to generate relevance scores.
-.rdd.mapPartitions(train_partition)
-# Convert the rdd of tuples back into a DataFrame so the fields all
-# have a name.
-.toDF(['wikiid', 'norm_query_id', 'hit_page_id', 'relevance']))
+.rdd.mapPartitions(train_partition))
+
+# Using toDF() is very slow as it has to run some of the partitions to 
check their
+# types, and then run all the partitions later to get the actual data. To 
prevent
+# running twice specify the schema we expect.
+return df.sql_ctx.createDataFrame(rdd_rel, T.StructType([
+T.StructField('wikiid', T.StringType(), False),
+T.StructField('norm_query_id', T.LongType(), False),
+T.StructField('hit_page_id', T.LongType(), False),
+T.StructField('relevance', T.DoubleType(), False)
+]))
diff --git a/mjolnir/utilities/data_pipeline.py 
b/mjolnir/utilities/data_pipeline.py
index a5c37d1..c8e676f 100644
--- a/mjolnir/utilities/data_pipeline.py
+++ b/mjolnir/utilities/data_pipeline.py
@@ -85,9 +85,14 @@
 print 'Fetched a total of %d samples for %d wikis' % (nb_samples, 
len(wikis))
 df_norm.unpersist()
 
+# Target around 125k rows per partition. Note that this isn't
+# how many the dbn will see, because it gets collected up. Just
+# a rough guess.
+dbn_partitions = int(max(200, min(2000, nb_samples / 125000 ) ))
+
 # Learn relevances
 df_rel = (
-mjolnir.dbn.train(df_sampled, {
+mjolnir.dbn.train(df_sampled, num_partitions=dbn_partitions, 
dbn_config={
 'MAX_ITERATIONS': 40,
 'DEBUG': False,
 'PRETTY_LOG': True,

-- 
To view, visit https://gerrit.wikimedia.org/r/391728
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I14d663f49a54b7bd130186aebfbeffde1e1a6d82
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Repair ability to collect data for undersized wikis

2017-11-15 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391729 )

Change subject: Repair ability to collect data for undersized wikis
..

Repair ability to collect data for undersized wikis

When attempting to collect data for small wikis that have much less data
than the provided samples_per_wiki in data_pipeline.py we would fail,
because the collected data was much less than expected. Rework this code
to allow for wikis that start out with much less data than requested.

While digging into this I realized that this check was being done much
too early. It was calculating against data that was not of the same
shape, so not the same counts, as the final data we feed into feature
collection. Everything between sampling and feature collection is
relatively cheap (compared to sending millions of queries to
elasticsearch) so move the check down to just before feature collection
where we know exactly how many observations we have.

Change-Id: Ib9f8d9b6204d7568e02356c1062cf3263d8eedd6
---
M mjolnir/sampling.py
M mjolnir/test/test_sampling.py
M mjolnir/utilities/data_pipeline.py
3 files changed, 48 insertions(+), 24 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/29/391729/1

diff --git a/mjolnir/sampling.py b/mjolnir/sampling.py
index 50f527a..50722b5 100644
--- a/mjolnir/sampling.py
+++ b/mjolnir/sampling.py
@@ -165,15 +165,18 @@
 .agg(F.sum('num_hit_page_ids').alias('num_hit_page_ids'))
 .collect())
 
+hit_page_id_counts = {row.wikiid: row.num_hit_page_ids for row in 
hit_page_id_counts}
+
 wiki_percents = {}
 needs_sampling = False
-for row in hit_page_id_counts:
-wiki_percents[row.wikiid] = min(1., float(samples_per_wiki) / 
row.num_hit_page_ids)
-if wiki_percents[row.wikiid] < 1.:
+
+for wikiid, num_hit_page_ids in hit_page_id_counts.items():
+wiki_percents[wikiid] = min(1., float(samples_per_wiki) / 
num_hit_page_ids)
+if wiki_percents[wikiid] < 1.:
 needs_sampling = True
 
 if not needs_sampling:
-return df
+return hit_page_id_counts, df
 
 # Aggregate down into a unique set of (wikiid, norm_query_id) and add in a
 # count of the number of unique sessions per pair. We will sample 
per-strata
@@ -187,11 +190,11 @@
 # Spark will eventually throw this away in an LRU fashion.
 .cache())
 
-# materialize df_queries_unique so we can unpersist the input df
-df_queries_unique.count()
-df.unpersist()
-
 df_queries_sampled = _sample_queries(df_queries_unique, wiki_percents, 
seed=seed)
 
 # Select the rows chosen by sampling from the input df
-return df.join(df_queries_sampled, how='inner', on=['wikiid', 
'norm_query_id'])
+df_sampled = df.join(df_queries_sampled, how='inner', on=['wikiid', 
'norm_query_id'])
+df_sampled.cache().count()
+df.unpersist()
+
+return hit_page_id_counts, df_sampled
diff --git a/mjolnir/test/test_sampling.py b/mjolnir/test/test_sampling.py
index 2feeb29..66a4605 100644
--- a/mjolnir/test/test_sampling.py
+++ b/mjolnir/test/test_sampling.py
@@ -20,8 +20,9 @@
 ('foo', 'e', 5, 'eee', list(range(3))),
 ]).toDF(['wikiid', 'query', 'norm_query_id', 'session_id', 'hit_page_ids'])
 
-sampled = mjolnir.sampling.sample(df, samples_per_wiki=100,
-  seed=12345).collect()
+hit_page_id_counts, df_sampled = mjolnir.sampling.sample(
+df, samples_per_wiki=100, seed=12345)
+sampled = df_sampled.collect()
 # The sampling rate should have been chosen as 1.0, so we should have all 
data
 # regardless of probabilities.
 assert len(sampled) == 5
@@ -60,8 +61,8 @@
 # Using a constant seed ensures deterministic testing. Because this code
 # actually relies on the law of large numbers, and we do not have large
 # numbers here, many seeds probably fail.
-df_sampled = mjolnir.sampling.sample(df, samples_per_wiki=samples_per_wiki,
- seed=12345)
+hit_page_id_counts, df_sampled = mjolnir.sampling.sample(
+df, samples_per_wiki=samples_per_wiki, seed=12345)
 sampled = (
 df_sampled
 .select('wikiid', 'query', 
F.explode('hit_page_ids').alias('hit_page_id'))
diff --git a/mjolnir/utilities/data_pipeline.py 
b/mjolnir/utilities/data_pipeline.py
index c8e676f..3827e08 100644
--- a/mjolnir/utilities/data_pipeline.py
+++ b/mjolnir/utilities/data_pipeline.py
@@ -64,11 +64,17 @@
 min_sessions_per_query=min_sessions_per_query)
 
 # Sample to some subset of queries per wiki
+hit_page_id_counts, df_sampled_raw = mjolnir.sampling.sample(
+df_norm,
+seed=54321,
+samples_per_wiki=samples_per_wiki)
+
+df_sampled_raw.count()
+df_norm.unpersist()
+
+# Transform our dataframe into the shape expected by the DBN
 df_sampled = (
-  

[MediaWiki-commits] [Gerrit] search/MjoLniR[master]: Replace custom array_contains with Column.isin

2017-11-15 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391622 )

Change subject: Replace custom array_contains with Column.isin
..

Replace custom array_contains with Column.isin

Not sure if this is new or I just wasn't aware of it at the time,
but spark has a native Column.isin that does the same as our
usage of the custom _array_contains method (checking whether a
column's value is one of a provided array of values).

Change-Id: I504492070c7cde4a4d93f2ff9c104b3f127b2757
---
M mjolnir/sampling.py
M mjolnir/utilities/data_pipeline.py
2 files changed, 1 insertion(+), 32 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR 
refs/changes/22/391622/1

diff --git a/mjolnir/sampling.py b/mjolnir/sampling.py
index e65281c..9ba3cf8 100644
--- a/mjolnir/sampling.py
+++ b/mjolnir/sampling.py
@@ -15,37 +15,6 @@
 from pyspark.sql.column import Column, _to_java_column
 
 
-def _array_contains(array, value):
-"""Generic version of pyspark.sql.functions.array_contains
-
-array_contains provided by pyspark only allow checking if a value is inside
-a column, but the value has to be a literal and not a column from the row.
-This generalizes the function to allow the value to be a column, checking
-if a column is within a provided literal array.
-
->>> df = sc.parallelize([['foo'], ['bar']]).toDF(['id'])
->>> df.select(_array_contains(F.array(map(F.lit, ['this', 'is', 'foo'])), 
F.col('id'))).collect()
-[Row(array_contains(array(this,is,foo),id)=True), 
Row(array_contains(array(this,is,foo),id)=False)]
-
-Parameters
---
-array : pyspark.sql.Column
-value : pyspark.sql.Column
-
-Returns
----
-pyspark.sql.Column
-Column representing the array_contains expression
-"""
-j_array_expr = _to_java_column(array).expr()
-j_value_expr = _to_java_column(value).expr()
-
-sql = pyspark.SparkContext._active_spark_context._jvm.org.apache.spark.sql
-j_expr = sql.catalyst.expressions.ArrayContains(j_array_expr, j_value_expr)
-jc = sql.Column(j_expr)
-return Column(jc)
-
-
 def _calc_splits(df, num_buckets=100):
 """Calculate the right edge of num_session buckets
 
diff --git a/mjolnir/utilities/data_pipeline.py 
b/mjolnir/utilities/data_pipeline.py
index 62ec121..a5c37d1 100644
--- a/mjolnir/utilities/data_pipeline.py
+++ b/mjolnir/utilities/data_pipeline.py
@@ -40,7 +40,7 @@
 df_clicks = (
 sqlContext.read.parquet(input_dir)
 # Limit to the wikis we are working against
-.where(mjolnir.sampling._array_contains(F.array(map(F.lit, wikis)), 
F.col('wikiid')))
+.where(F.col('wikiid').isin(wikis))
 # Drop requests from 'too busy' IP's. These are plausibly bots, or 
maybe just proxys.
 .where(F.col('q_by_ip_day') < 50)
 .drop('q_by_ip_day')

-- 
To view, visit https://gerrit.wikimedia.org/r/391622
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I504492070c7cde4a4d93f2ff9c104b3f127b2757
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port exact_quotes_api.feature to nodejs

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391439 )

Change subject: Port exact_quotes_api.feature to nodejs
..

Port exact_quotes_api.feature to nodejs

Change-Id: I537684d737132755d726a6f7dad4e3f84dbe7b7b
---
A tests/integration/features/exact_quotes_api.feature
M tests/integration/features/support/hooks.js
2 files changed, 109 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/39/391439/1

diff --git a/tests/integration/features/exact_quotes_api.feature 
b/tests/integration/features/exact_quotes_api.feature
new file mode 100644
index 000..9af0958
--- /dev/null
+++ b/tests/integration/features/exact_quotes_api.feature
@@ -0,0 +1,94 @@
+@clean @exact_quotes @api
+Feature: Searches that contain quotes
+  Scenario: Searching for a word in quotes disables stemming (can still find plural with exact match)
+When I api search for "pickles"
+Then Two Words is the first api search result
+
+  Scenario: Searching for a phrase in quotes disables stemming (can't find plural with singular)
+When I api search for "catapult pickle"
+Then there are no api search results
+
+  Scenario: Searching for a phrase in quotes disables stemming (can still find plural with exact match)
+When I api search for "catapult pickles"
+Then Two Words is the first api search result
+
+  Scenario: Quoted phrases have a default slop of 0
+When I api search for "ffnonesenseword pickles"
+Then none is the first api search result
+When I api search for "ffnonesenseword pickles"~1
+Then Two Words is the first api search result
+
+  Scenario: Quoted phrases match stop words
+When I api search for "Contains A Stop Word"
+Then Contains A Stop Word is the first api search result
+
+  Scenario: Adding a ~ to a phrase keeps stemming enabled
+When I api search for "catapult pickle"~
+Then Two Words is the first api search result
+
+  Scenario: Adding a ~ to a phrase switches the default slop to 0
+When I api search for "ffnonesenseword pickle"~
+Then none is the first api search result
+When I api search for "ffnonesenseword pickle"~1~
+Then Two Words is the first api search result
+
+  Scenario: Adding a ~ to a phrase stops it from matching stop words so long 
as there is enough slop
+When I api search for "doesn't actually Contain A Stop Words"~1~
+Then Doesn't Actually Contain Stop Words is the first api search result
+
+  Scenario: Adding a ~~ to a phrase keeps stemming enabled
+When I api search for "catapult pickle"~0~
+Then Two Words is the first api search result
+
+  Scenario: Adding a ~ to a phrase turns off because it is a 
proximity search
+When I api search for "catapult pickle"~0
+Then there are no api search results
+
+  Scenario: Searching for a quoted * actually searches for a *
+When I api search with query independent profile empty for "pick*"
+Then Pick* is the first api search result
+
+  Scenario Outline: Searching for " "~ activates a 
proximity search
+When I api search for "ffnonesenseword anotherword"~
+Then  is the first api search result
+  Examples:
+| proximity | result|
+| 0 | none  |
+| 1 | none  |
+| 2 | Two Words |
+| 3 | Two Words |
+| 77| Two Words |
+
+  Scenario Outline: Prefixing a quoted phrase with - or ! or NOT negates it
+When I api search for catapult <negation>"two words"<suffix>
+Then Catapult is in the api search results
+  And Two Words is not in the api search results
+  Examples:
+|negation| suffix |
+| -  ||
+| !  ||
+| NOT||
+| -  | ~  |
+| !  | ~  |
+| NOT| ~  |
+| -  | ~1 |
+| !  | ~1 |
+| NOT| ~1 |
+| -  | ~7~|
+| !  | ~7~|
+| NOT| ~7~|
+
+  Scenario: Can combine positive and negative phrase search
+When I api search for catapult "catapult" -"two words" -"some stuff"
+Then Catapult is in the api search results
+  And Two Words is not in the api search results
+
+  Scenario: Can combine positive and negative phrase search (backwards)
+When I api search for catapult -"asdf" "two words"
+Then Two Words is in the api search results
+  And Catapult is not in the api search results
+
+  @setup_main
+  Scenario: Searching for a word in quotes disables stemming (can't find plural with singular)
+When I api search for "pickle"
+Then there are no api search results
diff --git a/tests/integration/features/support/hooks.js 
b/tests/integration/features/support/hooks.js
index e1e5082..c035dbc 100644
--- a/tests/integration/features/support/hooks.js
+++ 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port all cucumber hooks to nodejs

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391441 )

Change subject: Port all cucumber hooks to nodejs
..

Port all cucumber hooks to nodejs

Port over all the hooks from ruby to nodejs and add the feature
files that now pass. WIP because, based on other feature files
that didn't pass, I suspect there remains a problem with mwbot
sessions.

Change-Id: I01b8192c4e2ef5f0b3b720034aa38c4686ffbeb3
---
A tests/integration/features/combined_filters_api.feature
A tests/integration/features/full_text_api.feature
A tests/integration/features/fuzzy_api.feature
A tests/integration/features/go_api.feature
A tests/integration/features/hastemplate_api.feature
A tests/integration/features/prefix_api.feature
A tests/integration/features/removed_text_api.feature
M tests/integration/features/support/hooks.js
8 files changed, 983 insertions(+), 190 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/41/391441/1

diff --git a/tests/integration/features/combined_filters_api.feature 
b/tests/integration/features/combined_filters_api.feature
new file mode 100644
index 000..665dd58
--- /dev/null
+++ b/tests/integration/features/combined_filters_api.feature
@@ -0,0 +1,10 @@
+@clean @filters @api
+Feature: Searches with combined filters
+  Scenario Outline: Filters can be combined
+When I api search for <term>
+Then <first_result> is the first api search result
+  Examples:
+|  term   | first_result |
+| incategory:twowords intitle:catapult| none |
+| incategory:twowords intitle:"Two Words" | Two Words|
+| incategory:alpha incategory:beta| AlphaBeta|
diff --git a/tests/integration/features/full_text_api.feature 
b/tests/integration/features/full_text_api.feature
new file mode 100644
index 000..9ebcab5
--- /dev/null
+++ b/tests/integration/features/full_text_api.feature
@@ -0,0 +1,145 @@
+@clean @api
+Feature: Full text search
+  @headings
+  Scenario: Pages can be found by their headings
+When I api search for incategory:HeadingsTest "I am a heading"
+Then HasHeadings is the first api search result
+
+  @headings
+  Scenario: Ignored headings aren't searched so text with the same word wins
+When I api search for incategory:HeadingsTest References
+Then HasReferencesInText is the first api search result
+
+ @setup_main
+  Scenario: Searching for a page using its title and another word not in the 
page's text doesn't find the page
+When I api search for DontExistWord Two Words
+Then there are no api search results
+
+  @setup_main
+  Scenario: Searching for a page using its title and another word in the 
page's text does find it
+When I api search for catapult Two Words
+Then Two Words is the first api search result
+
+  @setup_phrase_rescore
+  Scenario: Searching for an unquoted phrase finds the phrase first
+When I api search for Words Test Rescore
+Then Rescore Test Words Chaff is the first api search result
+
+  @setup_phrase_rescore
+  Scenario: Searching for a quoted phrase finds higher scored matches before 
the whole query interpreted as a phrase
+When I api search for Rescore "Test Words"
+Then Test Words Rescore Rescore Test Words is the first api search result
+
+  # Note that other tests will catch this situation as well but this test 
should be pretty specific
+  @setup_phrase_rescore
+  Scenario: Searching for an unquoted phrase still prioritizes titles over text
+When I api search for Rescore Test TextContent
+Then Rescore Test TextContent is the first api search result
+
+  @setup_phrase_rescore
+  Scenario: Searching with a quoted word just treats the word as though it 
didn't have quotes
+When I api search for "Rescore" Words Test
+Then Test Words Rescore Rescore Test Words is the first api search result
+
+  @programmer_friendly
+  Scenario Outline: Programmer friendly searches
+When I api search for <term>
+Then <page> is the first api search result
+  Examples:
+|term |page |
+| namespace aliases   | $wgNamespaceAliases |
+| namespaceAliases| $wgNamespaceAliases |
+| $wgNamespaceAliases | $wgNamespaceAliases |
+| namespace_aliases   | $wgNamespaceAliases |
+| NamespaceAliases| $wgNamespaceAliases |
+| wgnamespacealiases  | $wgNamespaceAliases |
+| snake case  | PFSC|
+| snakeCase   | PFSC|
+| snake_case  | PFSC|
+| SnakeCase   | PFSC|
+| Pascal Case | PascalCase  |
+| pascalCase  | PascalCase  |
+| pascal_case | PascalCase  |
+| PascalCase  | PascalCase  |
+| pascalcase  | PascalCase  |
+| numeric 7   | NumericCase7|
+  

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Add did you mean api feature test to nodejs integ tests

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391438 )

Change subject: Add did you mean api feature test to nodejs integ tests
..

Add did you mean api feature test to nodejs integ tests

Change-Id: I943aedd0bc13111906a2aab7481250215c8dd2c9
---
A tests/integration/features/did_you_mean_api.feature
M tests/integration/features/step_definitions/page_steps.js
2 files changed, 132 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/38/391438/1

diff --git a/tests/integration/features/did_you_mean_api.feature 
b/tests/integration/features/did_you_mean_api.feature
new file mode 100644
index 000..66be262
--- /dev/null
+++ b/tests/integration/features/did_you_mean_api.feature
@@ -0,0 +1,100 @@
+@clean @api @suggestions
+Feature: Did you mean
+  Scenario: Uncommon phrases spelled correctly don't get suggestions even if 
one of the words is very uncommon
+When I api search for nobel prize
+Then there is no api suggestion
+
+  Scenario: No suggestions on pages that are not the first
+When I api search with offset 20 for popular cultur
+Then there is no api suggestion
+
+  @stemming
+  Scenario: Suggestions do not show up when a full title matches but with 
stemming
+When I api search for stemmingsingleword
+Then there is no api suggestion
+
+  @stemming
+  Scenario: Suggestions do not show up when a full multi word title matches 
but with stemming
+When I api search for stemming multiword
+Then there is no api suggestion
+
+  @stemming
+  Scenario: Suggestions do not show up when a full multi word title matches 
but with apostrophe normalization
+When I api search for stemming possessive's
+Then there is no api suggestion
+
+  Scenario: Suggestions don't come from redirect titles when it matches an 
actual title
+When I api search for Noble Gasses
+Then there is no api suggestion
+
+  Scenario: Common phrases spelled incorrectly get suggestions
+When I api search for popular cultur
+Then popular *culture* is suggested by api
+
+  Scenario Outline: Uncommon phrases spelled incorrectly get suggestions even if the typo is in the first 2 characters
+When I api search for <term>
+Then <suggested> is suggested by api
+  Examples:
+|term   |  suggested|
+| nabel prize   | *nobel* prize |
+| onbel prize   | *nobel* prize |
+
+  Scenario: Uncommon phrases spelled incorrectly get suggestions even if they 
contain words that are spelled correctly on their own
+When I api search for noble prize
+Then *nobel* prize is suggested by api
+
+  Scenario: Suggestions can come from redirect titles when redirects are 
included in search
+When I api search for Rrr Worrd
+Then rrr *word* is suggested by api
+
+  Scenario Outline: Special search syntax is preserved in suggestions (though 
sometimes moved around)
+    When I api search for <term>
+    Then <suggested> is suggested by api
+  Examples:
+| term                                       | suggested                                    |
+| prefer-recent:noble prize                  | prefer-recent:*nobel* prize                  |
+| Template:nobel piep                        | Template:*noble pipe*                        |
+| prefer-recent:noble prize                  | prefer-recent:*nobel* prize                  |
+| incategory:prize noble prize               | incategory:prize *nobel* prize               |
+| noble incategory:prize prize               | incategory:prize *nobel* prize               |
+| hastemplate:prize noble prize              | hastemplate:prize *nobel* prize              |
+| -hastemplate:prize noble prize             | -hastemplate:prize *nobel* prize             |
+| boost-templates:"prize\|150%" noble prize  | boost-templates:"prize\|150%" *nobel* prize  |
+| noble prize prefix:n                       | *nobel* prize prefix:n                       |
+
+  Scenario: Customize prefix length of did you mean suggestions
+When I set did you mean suggester option cirrusSuggPrefixLength to 5
+And I api search for noble prize
+Then there is no api suggestion
+
+  Scenario: Did you mean option suggests
+When I api search for grammo awards
+Then there is no api suggestion
+
+  Scenario: Customize max term freq did you mean suggestions
+When I set did you mean suggester option cirrusSuggMaxTermFreq to 0.4
+And I set did you mean suggester option cirrusSuggConfidence to 1
+And I api search for grammo
+Then *grammy* is suggested by api
+
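+
For orientation, the "I set did you mean suggester option ... to ..." steps above presumably translate into extra parameters on the same fulltext search API request the test issues; the option names come straight from the scenarios, but the request shape below is an assumption for illustration, not the actual step definition:

    import requests

    API_URL = 'http://localhost/w/api.php'  # placeholder test wiki endpoint

    params = {
        'action': 'query',
        'list': 'search',
        'srsearch': 'grammo',
        'format': 'json',
        # Suggester overrides from the scenario; assumed to be passed as plain request params.
        'cirrusSuggMaxTermFreq': 0.4,
        'cirrusSuggConfidence': 1,
    }
    info = requests.get(API_URL, params=params).json()['query'].get('searchinfo', {})
    print(info.get('suggestion'))  # expected to be 'grammy' per the scenario above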
+  Scenario: Customize prefix length of did you mean suggestions below the hard 
limit
+When I reset did you mean suggester options
+And I 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port highlighting feature to nodejs

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391442 )

Change subject: Port highlighting feature to nodejs
..

Port highlighting feature to nodejs

Change-Id: I15dffe85e6366b21d0fd5742cb013589a12e24f6
---
A tests/integration/features/highlighting_api.feature
M tests/integration/features/step_definitions/page_steps.js
2 files changed, 204 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/42/391442/1

diff --git a/tests/integration/features/highlighting_api.feature 
b/tests/integration/features/highlighting_api.feature
new file mode 100644
index 0000000..87b07ee
--- /dev/null
+++ b/tests/integration/features/highlighting_api.feature
@@ -0,0 +1,178 @@
+@clean @highlighting @api
+Feature: Highlighting
+  @setup_main
+  Scenario Outline: Found words are highlighted
+    When I api search for <term>
+    Then <highlighted_title> is the highlighted title of the first api search result
+      And <highlighted_text> is the highlighted snippet of the first api search result
+  Examples:
+| term                       | highlighted_title        | highlighted_text                                  |
+| two words                  | *Two* *Words*            | ffnonesenseword catapult pickles anotherword      |
+| pickles                    | Two Words                | ffnonesenseword catapult *pickles* anotherword    |
+| ffnonesenseword pickles    | Two Words                | *ffnonesenseword* catapult *pickles* anotherword  |
+| two words catapult pickles | *Two* *Words*            | ffnonesenseword *catapult* *pickles* anotherword  |
+| template:test pickle       | Template:Template *Test* | *pickles*                                         |
+# Verify highlighting the presence of accent squashing
+| Africa test                | *África*                 | for *testing*                                     |
+# Verify highlighting on large pages.
+| "discuss problems of social and cultural importance" | Rashidun Caliphate | community centers as well where the faithful gathered to *discuss* *problems* *of* *social* *and* *cultural* *importance*. During the caliphate of Umar as many as four thousand |
+| "discuss problems of social and cultural importance"~ | Rashidun Caliphate | community centers as well where the faithful gathered to *discuss* *problems* *of* *social* *and* *cultural* *importance*. During the caliphate of Umar as many as four thousand |
+# Auxiliary text
+| tallest alborz             | Rashidun Caliphate       | Mount Damavand, Iran's *tallest* mountain is located in *Alborz* mountain range. |
+
+  Scenario: Even stopwords are highlighted
+When I api search for the once and future king
+Then *The* *Once* *and* *Future* *King* is the highlighted title of the 
first api search result
+
+  Scenario: Found words are highlighted even if found by different analyzers
+When I api search for "threatening the unity" community
+Then Troubles emerged soon after Abu Bakr's succession, *threatening* 
*the* *unity* and stability of the new *community* and state. Apostasy had 
actually begun in the lifetime is the highlighted snippet of the first api 
search result
+
+  @headings
+  Scenario: Found words are highlighted in headings
+When I api search for "i am a heading"
+Then *I* *am* *a* *heading* is the highlighted sectionsnippet of the first 
api search result
+
+  @headings
+  Scenario: References are not included in headings
+When I api search for "Reference in heading"
+Then *Reference* *in* *heading* is the highlighted sectionsnippet of the 
first api search result
+
+  Scenario: Found words are highlighted in headings even in large documents
+When I api search for "Succession of Umar"
+Then *Succession* *of* *Umar* is the highlighted sectionsnippet of the 
first api search result
+
+  Scenario: Found words are highlighted in text even in large documents
+When I api search for Allowance to non-Muslims
+Then *Allowance* *to* *non*-*Muslims* is in the highlighted snippet of the 
first api search result
+
+  Scenario: Found words are highlighted in text even in large documents
+When I api search for "Allowance to non-Muslims"
+Then *Allowance* *to* *non*-*Muslims* is in the highlighted snippet of the 
first api search result
+
+  Scenario: Words are not found in image captions unless there are no matches 
in the page
+When I api search for The Rose Trellis Egg
+Then *The* *Rose* *Trellis* Faberge *Egg* is a jewelled enameled imperial 
Easter *egg* made in St. Petersburg, Russia under *the* supervision of *the* 
jeweler Peter Carl is the highlighted snippet of the first api search result
+
+  @headings
+  Scenario: Found words are highlighted in headings even if they contain both 
a phrase and a non-phrase
+When I api search for i "am a heading"
+

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Port filesearch_api.feature to nodejs

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391440 )

Change subject: Port filesearch_api.feature to nodejs
..

Port filesearch_api.feature to nodejs

Change-Id: I057e51ae4755244f29d2d7f789b5737695cc7e63
---
A tests/integration/features/filesearch_api.feature
M tests/integration/features/step_definitions/page_steps.js
M tests/integration/features/support/hooks.js
3 files changed, 150 insertions(+), 65 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/40/391440/1

diff --git a/tests/integration/features/filesearch_api.feature 
b/tests/integration/features/filesearch_api.feature
new file mode 100644
index 0000000..2ea145e
--- /dev/null
+++ b/tests/integration/features/filesearch_api.feature
@@ -0,0 +1,70 @@
+@clean @api @filesearch
+Feature: Searches with the file size filters
+
+  Scenario Outline: filesize finds files with given size
+    When I api search in namespace 6 for <search> -intitle:frozen
+    Then there are <count> api search results
+    And <musthave> is in the api search results
+    And <mustnot> is not in the api search results
+  Examples:
+| search        | count | musthave                                           | mustnot                                            |
+| filesize:>10  | 2     | File:Linux Distribution Timeline text version.pdf | File:OnCommons.svg                                 |
+| filesize:<10  | 4     | File:DuplicatedLocally.svg                         | File:Linux Distribution Timeline text version.pdf |
+| filesize:10   | 2     | File:Linux Distribution Timeline text version.pdf | File:OnCommons.svg                                 |
+| filesize:5,20 | 1     | File:Savepage-greyed.png                           | File:Linux Distribution Timeline text version.pdf |
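
For reference, a minimal sketch of the kind of request the "I api search in namespace 6 for ..." step is expected to make (namespace 6 is the File namespace); the endpoint here is a placeholder and the helper is illustrative, not the real step definition:

    import requests

    API_URL = 'http://localhost/w/api.php'  # placeholder test wiki endpoint

    def api_search(query, namespace=None, offset=None):
        # Illustrative helper: fulltext search via action=query&list=search.
        params = {'action': 'query', 'list': 'search', 'srsearch': query, 'format': 'json'}
        if namespace is not None:
            params['srnamespace'] = namespace
        if offset is not None:
            params['sroffset'] = offset
        return requests.get(API_URL, params=params).json()['query']['search']

    # e.g. the first row of the outline above
    hits = api_search('filesize:>10 -intitle:frozen', namespace=6)
    titles = [hit['title'] for hit in hits]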
+
+  Scenario Outline: filetype finds files with given internal type
+    When I api search in namespace 6 for <search> -intitle:frozen
+    Then there are <count> api search results
+    And <musthave> is in the api search results
+    And <mustnot> is not in the api search results
+Examples:
+  | search           | count | musthave                                           | mustnot                    |
+  | filetype:bitmap  | 1     | File:Savepage-greyed.png                           | File:DuplicatedLocally.svg |
+  | filetype:office  | 1     | File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png   |
+  | filetype:Drawing | 4     | File:DuplicatedLocally.svg                         | File:Savepage-greyed.png   |
+
+  Scenario Outline: filemime finds files with given MIME type
+    When I api search in namespace 6 for <search> -intitle:frozen
+    Then there are <count> api search results
+    And <musthave> is in the api search results
+    And <mustnot> is not in the api search results
+Examples:
+  | search                   | count | musthave                                           | mustnot                    |
+  | filemime:image/PNG       | 1     | File:Savepage-greyed.png                           | File:DuplicatedLocally.svg |
+  | filemime:image/svg+xml   | 4     | File:DuplicatedLocally.svg                         | File:Savepage-greyed.png   |
+  | filemime:application/pdf | 1     | File:Linux Distribution Timeline text version.pdf | File:OnCommons.svg         |
+
+  Scenario Outline: Resolution filters find files with given dimensions
+    When I api search in namespace 6 for <search> -intitle:frozen
+    Then there are <count> api search results
+    And <musthave> is in the api search results
+    And <mustnot> is not in the api search results
+  Examples:
+| search                      | count | musthave                                           | mustnot                                            |
+| fileres:>1000               | 1     | File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png                           |
+| filew:>1000                 | 1     | File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png                           |
+| fileh:>1000                 | 1     | File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png                           |
+| filewidth:>1000             | 1     | File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png                           |
+| fileheight:>1000            | 1     | File:Linux Distribution Timeline text version.pdf | File:Savepage-greyed.png                           |
+| fileres:300,600             | 1     | File:Savepage-greyed.png                           | DuplicatedLocally.svg                              |
+| fileres:<500                | 1     | File:Savepage-greyed.png                           | File:Linux Distribution Timeline text version.pdf |
+| filew:300,900               | 5     | File:DuplicatedLocally.svg                         | File:Linux Distribution Timeline text version.pdf |
+| filew:<500                  | 1     | File:Savepage-greyed.png                           | File:Linux Distribution Timeline text version.pdf |
+| fileh:>200                  | 6     | File:Linux Distribution Timeline text version.pdf | anything                                           |
+| filew:300,600 fileh:200,300 | 1     | File:Savepage-greyed.png 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[master]: Add integration feature files that already pass

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391437 )

Change subject: Add integration feature files that already pass
..

Add integration feature files that already pass

These feature files were 95% of the way to being implemented; they
needed only a single new definition each to pass.

Change-Id: I546fe7a8a2b32b8e705cd27278cb9195105e1e49
---
M tests/integration/features/step_definitions/page_steps.js
M tests/integration/features/suggest_api.feature
A tests/integration/features/wildcard_api.feature
3 files changed, 95 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/37/391437/1

diff --git a/tests/integration/features/step_definitions/page_steps.js 
b/tests/integration/features/step_definitions/page_steps.js
index 2e370d8..8e4a46c 100644
--- a/tests/integration/features/step_definitions/page_steps.js
+++ b/tests/integration/features/step_definitions/page_steps.js
@@ -216,6 +216,12 @@
} );
} );
 
+   Then( /there are no api search results/, function () {
+   withApi( this, () => {
+   expect( this.apiResponse.query.search 
).to.have.lengthOf( 0 );
+   } );
+   } );
+
Then( /^(.+) is( not)? in the api search results$/, function( title, 
not ) {
withApi( this, () => {
let titles = this.apiResponse.query.search.map( res => 
res.title );
@@ -226,4 +232,10 @@
}
} );
} );
+
+   Then( /^this error is reported by api: (.+)$/, function ( 
expected_error ) {
+   withApi( this, () => {
+   expect( this.apiError.info ).to.equal( 
expected_error.trim() )
+   } );
+   } );
 });
diff --git a/tests/integration/features/suggest_api.feature 
b/tests/integration/features/suggest_api.feature
index 79beb0c..0d70c7f 100644
--- a/tests/integration/features/suggest_api.feature
+++ b/tests/integration/features/suggest_api.feature
@@ -65,4 +65,31 @@
   Examples:
 |   term  |   first  | other  |
 | Ic  |  Iceman  |  Ice   |
-| Ice |   Ice| Iceman |
\ No newline at end of file
+| Ice |   Ice| Iceman |
+
+  Scenario: Ordering & limit
+When I ask suggestion API at most 1 item for x-m
+  Then the API should produce list starting with X-Men
+  And the API should produce list of length 1
+
+  Scenario Outline: Search fallback to prefix search if namespace is provided
+When I ask suggestion API for <term>
+  Then the API should produce list starting with <suggested>
+  Examples:
+|   term  |suggested|
+| Special:| Special:ActiveUsers |
+| Special:Act | Special:ActiveUsers |
+
+  Scenario Outline: Search prefers main namespace over crossns redirects
+When I ask suggestion API for <term>
+  Then the API should produce list starting with <suggested>
+  Examples:
+|   term  |suggested  |
+| V   | Venom |
+| V:  | V:N   |
+| Z   | Zam Wilson|
+| Z:  | Z:Navigation  |
+
+  Scenario: Default sort can be used as search input
+When I ask suggestion API for Wilson
+  Then the API should produce list starting with Sam Wilson
diff --git a/tests/integration/features/wildcard_api.feature 
b/tests/integration/features/wildcard_api.feature
new file mode 100644
index 0000000..fc86115
--- /dev/null
+++ b/tests/integration/features/wildcard_api.feature
@@ -0,0 +1,55 @@
+@clean @api @wildcard
+Feature: Searches that contain wildcard matches
+  Scenario Outline: Wildcards match plain matches
+When I api search for pi<wildcard>les
+Then Two Words is the first api search result
+  Examples:
+| wildcard |
+| *|
+| \\?k |
+| c\\? |
+
+  Scenario Outline: Wildcards don't match stemmed matches
+When I api search for pi<wildcard>kle
+Then there are no api search results
+  Examples:
+| wildcard |
+| *|
+| \\?k |
+
+  Scenario Outline: Wildcards in leading intitle: terms match
+When I api search for intitle:functiona<wildcard> intitle:programming
+Then Functional programming is the first api search result
+  Examples:
+| wildcard |
+| *|
+| \\?  |
+
+  Scenario Outline: Wildcard suffixes in trailing intitle: terms match stemmed 
matches
+When I api search for intitle:functional intitle:programmin<wildcard>
+Then Functional programming is the first api search result
+  Examples:
+| wildcard |
+| *|
+| \\?  |
+
+  Scenario Outline: Wildcards within trailing intitle: terms match stemmed 
matches
+When I api search for intitle:functional intitle:prog<wildcard>amming
+Then Functional programming is the first api search result
+  Examples:
+| wildcard |
+| *|
+| \\?  |
+
+  

[MediaWiki-commits] [Gerrit] search...deploy[master]: Dont recreate virtualenv unless necessary

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/391296 )

Change subject: Dont recreate virtualenv unless necessary
..


Dont recreate virtualenv unless necessary

It appears that while the first run of the virtualenv.sh script will
succeed on a debian jessie based system, future runs will attempt
to overwrite the upgraded pip with the older system version of pip. This
causes pip to completely break with a mismatch between versions.

Change-Id: I7c6fd8ea1e1ee35a3e7386fd9b628c0605a11fda
---
M scap/checks/virtualenv.sh
1 file changed, 7 insertions(+), 3 deletions(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh
index 2f8ebba..56da498 100644
--- a/scap/checks/virtualenv.sh
+++ b/scap/checks/virtualenv.sh
@@ -13,9 +13,13 @@
 
 PIP="${VENV}/bin/pip"
 
-# Ensure that the virtual environment exists
-mkdir -p "$VENV"
-virtualenv --never-download --python python2.7 $VENV || /bin/true
+# Ensure that the virtual environment exists. Don't recreate if already
+# existing, as this will try and downgrade pip on debian jessie from the one
+# installed later which then breaks pip.
+if [ ! -x "$PIP" ]; then
+mkdir -p "$VENV"
+virtualenv --never-download --python python2.7 $VENV || /bin/true
+fi
 
 # Debian jessie based hosts need updated versions of pip and wheel or they will
 # fail to install some binary packages (numpy, scipy, maybe others)

-- 
To view, visit https://gerrit.wikimedia.org/r/391296
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I7c6fd8ea1e1ee35a3e7386fd9b628c0605a11fda
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 
Gerrit-Reviewer: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Dont recreate virtualenv unless necessary

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391296 )

Change subject: Dont recreate virtualenv unless necessary
..

Dont recreate virtualenv unless necessary

It appears that while the first run of the virtualenv.sh script will
succeed on a debian jessie based system, future runs will attempt
to overwrite the upgraded pip with the older system version of pip. This
causes pip to completely break with a mismatch between versions.

Change-Id: I7c6fd8ea1e1ee35a3e7386fd9b628c0605a11fda
---
M scap/checks/virtualenv.sh
1 file changed, 7 insertions(+), 3 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/96/391296/1

diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh
index 2f8ebba..56da498 100644
--- a/scap/checks/virtualenv.sh
+++ b/scap/checks/virtualenv.sh
@@ -13,9 +13,13 @@
 
 PIP="${VENV}/bin/pip"
 
-# Ensure that the virtual environment exists
-mkdir -p "$VENV"
-virtualenv --never-download --python python2.7 $VENV || /bin/true
+# Ensure that the virtual environment exists. Don't recreate if already
+# existing, as this will try and downgrade pip on debian jessie from the one
+# installed later which then breaks pip.
+if [ ! -x "$PIP" ]; then
+mkdir -p "$VENV"
+virtualenv --never-download --python python2.7 $VENV || /bin/true
+fi
 
 # Debian jessie based hosts need updated versions of pip and wheel or they will
 # fail to install some binary packages (numpy, scipy, maybe others)

-- 
To view, visit https://gerrit.wikimedia.org/r/391296
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7c6fd8ea1e1ee35a3e7386fd9b628c0605a11fda
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Bump mjolnir submodule

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/391294 )

Change subject: Bump mjolnir submodule
..


Bump mjolnir submodule

This brings in the patch that made working_dir configurable. We
need that so the deployment to stat1005 is able to use the
new configuration file.

Change-Id: Ia73ba83edd1166412878008a32a09f36dd9a7572
---
M src
1 file changed, 1 insertion(+), 1 deletion(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/src b/src
index 96337a0..0d7fdcf 160000
--- a/src
+++ b/src
@@ -1 +1 @@
-Subproject commit 96337a0ab1931278f93b752ca3be5f30e8124762
+Subproject commit 0d7fdcf27b51b848a8c964f3c204f195c376dea5

-- 
To view, visit https://gerrit.wikimedia.org/r/391294
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ia73ba83edd1166412878008a32a09f36dd9a7572
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 
Gerrit-Reviewer: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Bump mjolnir submodule

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391294 )

Change subject: Bump mjolnir submodule
..

Bump mjolnir submodule

This brings in the patch that made working_dir configurable. We
need that so the deployment to stat1005 is able to use the
new configuration file.

Change-Id: Ia73ba83edd1166412878008a32a09f36dd9a7572
---
M src
1 file changed, 1 insertion(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/94/391294/1

diff --git a/src b/src
index 96337a0..0d7fdcf 160000
--- a/src
+++ b/src
@@ -1 +1 @@
-Subproject commit 96337a0ab1931278f93b752ca3be5f30e8124762
+Subproject commit 0d7fdcf27b51b848a8c964f3c204f195c376dea5

-- 
To view, visit https://gerrit.wikimedia.org/r/391294
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ia73ba83edd1166412878008a32a09f36dd9a7572
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Deploy pip and wheel packages for jessie

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/391286 )

Change subject: Deploy pip and wheel packages for jessie
..


Deploy pip and wheel packages for jessie

Debian jessie based hosts (relforge100*) fail to install the
provided numpy and scipy packages. Include pip and wheel in 
the artifacts and explicitly install them into the deployed
virtualenv.

Also turn on verbose output for pip when installing, to give
more information in `scap deploy-log` output for failures.

Change-Id: Ie7a6788aec81b6b8dad4f0df0c17c26c8f2a3275
---
A artifacts/pip-9.0.1-py2.py3-none-any.whl
A artifacts/wheel-0.30.0-py2.py3-none-any.whl
M make_wheels.sh
M scap/checks/virtualenv.sh
M upload_wheels.py
5 files changed, 25 insertions(+), 3 deletions(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/artifacts/pip-9.0.1-py2.py3-none-any.whl 
b/artifacts/pip-9.0.1-py2.py3-none-any.whl
new file mode 100644
index 0000000..2b56f48
--- /dev/null
+++ b/artifacts/pip-9.0.1-py2.py3-none-any.whl
@@ -0,0 +1 @@
+#$# git-fat c70393185d27ae8b49a117e6dcc18bc5f8f3a1c3  1254803
diff --git a/artifacts/wheel-0.30.0-py2.py3-none-any.whl 
b/artifacts/wheel-0.30.0-py2.py3-none-any.whl
new file mode 100644
index 0000000..4869eee
--- /dev/null
+++ b/artifacts/wheel-0.30.0-py2.py3-none-any.whl
@@ -0,0 +1 @@
+#$# git-fat 11694b2cfb611fd4accb1135c7d0fef9db4cd926  49751
diff --git a/make_wheels.sh b/make_wheels.sh
index 7ad71c5..5c29233 100755
--- a/make_wheels.sh
+++ b/make_wheels.sh
@@ -23,7 +23,12 @@
 virtualenv --python python2.7 $VENV || /bin/true
 $PIP install "${MJOLNIR}"
 $PIP freeze --local | grep -v mjolnir | grep -v pkg-resources > $REQUIREMENTS
-$PIP install wheel
+$PIP install pip wheel
+# Debian jessie based hosts require updated pip and wheel packages or they will
+# refuse to install some packages (numpy, scipy, maybe others)
+$PIP wheel --find-links "${WHEEL_DIR}" \
+--wheel-dir "${WHEEL_DIR}" \
+pip wheel
 $PIP wheel --find-links "${WHEEL_DIR}" \
 --wheel-dir "${WHEEL_DIR}" \
 --requirement "${REQUIREMENTS}"
diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh
index 55c0a33..2f8ebba 100644
--- a/scap/checks/virtualenv.sh
+++ b/scap/checks/virtualenv.sh
@@ -17,15 +17,30 @@
 mkdir -p "$VENV"
 virtualenv --never-download --python python2.7 $VENV || /bin/true
 
+# Debian jessie based hosts need updated versions of pip and wheel or they will
+# fail to install some binary packages (numpy, scipy, maybe others)
+$PIP install \
+-vv \
+--no-index \
+--find-links "${WHEEL_DIR}" \
+--upgrade \
+--force-reinstall \
+pip wheel
 # Install or upgrade our packages
 $PIP install \
+-vv \
 --no-index \
 --find-links "${WHEEL_DIR}" \
 --upgrade \
 --force-reinstall \
 -r "${REQUIREMENTS}"
 
-$PIP install --upgrade --no-deps "${MJOLNIR_DIR}"
+$PIP install \
+-vv \
+--no-index \
+--upgrade \
+--no-deps \
+"${MJOLNIR_DIR}"
 
 # Build a .zip of the virtualenv that can be shipped to spark workers
 cd "${VENV}"
diff --git a/upload_wheels.py b/upload_wheels.py
index e16453b..5d93792 100755
--- a/upload_wheels.py
+++ b/upload_wheels.py
@@ -3,7 +3,7 @@
 Uploads python wheels to archiva
 
 Usage:
-upload-wheels.py wheels/*.whl
+upload-wheels.py artifacts/*.whl
 """
 
 from __future__ import print_function

-- 
To view, visit https://gerrit.wikimedia.org/r/391286
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie7a6788aec81b6b8dad4f0df0c17c26c8f2a3275
Gerrit-PatchSet: 3
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 
Gerrit-Reviewer: DCausse 
Gerrit-Reviewer: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Deploy pip and wheel packages for jessie

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391286 )

Change subject: Deploy pip and wheel packages for jessie
..

Deploy pip and wheel packages for jessie

Debian jessie based hosts (relforge100*) fail to install the
provided numpy and scipy packages. Include pip and wheel in
the artifacts and explicitly install them into the deployed
virtualenv.

Change-Id: Ie7a6788aec81b6b8dad4f0df0c17c26c8f2a3275
---
A artifacts/pip-9.0.1-py2.py3-none-any.whl
A artifacts/wheel-0.30.0-py2.py3-none-any.whl
M make_wheels.sh
M scap/checks/virtualenv.sh
M upload_wheels.py
5 files changed, 17 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/86/391286/1

diff --git a/artifacts/pip-9.0.1-py2.py3-none-any.whl 
b/artifacts/pip-9.0.1-py2.py3-none-any.whl
new file mode 100644
index 0000000..2b56f48
--- /dev/null
+++ b/artifacts/pip-9.0.1-py2.py3-none-any.whl
@@ -0,0 +1 @@
+#$# git-fat c70393185d27ae8b49a117e6dcc18bc5f8f3a1c3  1254803
diff --git a/artifacts/wheel-0.30.0-py2.py3-none-any.whl 
b/artifacts/wheel-0.30.0-py2.py3-none-any.whl
new file mode 100644
index 0000000..4869eee
--- /dev/null
+++ b/artifacts/wheel-0.30.0-py2.py3-none-any.whl
@@ -0,0 +1 @@
+#$# git-fat 11694b2cfb611fd4accb1135c7d0fef9db4cd926  49751
diff --git a/make_wheels.sh b/make_wheels.sh
index 7ad71c5..5c29233 100755
--- a/make_wheels.sh
+++ b/make_wheels.sh
@@ -23,7 +23,12 @@
 virtualenv --python python2.7 $VENV || /bin/true
 $PIP install "${MJOLNIR}"
 $PIP freeze --local | grep -v mjolnir | grep -v pkg-resources > $REQUIREMENTS
-$PIP install wheel
+$PIP install pip wheel
+# Debian jessie based hosts require updated pip and wheel packages or they will
+# refuse to install some packages (numpy, scipy, maybe others)
+$PIP wheel --find-links "${WHEEL_DIR}" \
+--wheel-dir "${WHEEL_DIR}" \
+pip wheel
 $PIP wheel --find-links "${WHEEL_DIR}" \
 --wheel-dir "${WHEEL_DIR}" \
 --requirement "${REQUIREMENTS}"
diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh
index 55c0a33..000c22c 100644
--- a/scap/checks/virtualenv.sh
+++ b/scap/checks/virtualenv.sh
@@ -17,6 +17,14 @@
 mkdir -p "$VENV"
 virtualenv --never-download --python python2.7 $VENV || /bin/true
 
+# Debian jessie based hosts need updated versions of pip and wheel or they will
+# fail to install some binary packages (numpy, scipy, maybe others)
+$PIP install \
+--no-index \
+--find-links "${WHEEL_DIR}" \
+--upgrade \
+--force-reinstall \
+pip wheel
 # Install or upgrade our packages
 $PIP install \
 --no-index \
diff --git a/upload_wheels.py b/upload_wheels.py
index e16453b..5d93792 100755
--- a/upload_wheels.py
+++ b/upload_wheels.py
@@ -3,7 +3,7 @@
 Uploads python wheels to archiva
 
 Usage:
-upload-wheels.py wheels/*.whl
+upload-wheels.py artifacts/*.whl
 """
 
 from __future__ import print_function

-- 
To view, visit https://gerrit.wikimedia.org/r/391286
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie7a6788aec81b6b8dad4f0df0c17c26c8f2a3275
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Remove --no-cache-dir from pip command

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391268 )

Change subject: Remove --no-cache-dir from pip command
..

Remove --no-cache-dir from pip command

--no-cache-dir is not available for the pip version available in debian
jessie. We could perhaps ship an updated pip version with this deploy
repo, but that doesn't seem necessary yet.

Skipping --no-cache-dir should hopefully have no effect on building the
virtualenv. It's mostly just a stricter guarantee that we really are
installing only wheels from the artifacts directory.
Change-Id: I00b96b84821a6bedf0d93400ca31842a0cd4e658
---
M scap/checks/virtualenv.sh
1 file changed, 0 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/68/391268/1

diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh
index eb05944..55c0a33 100644
--- a/scap/checks/virtualenv.sh
+++ b/scap/checks/virtualenv.sh
@@ -19,7 +19,6 @@
 
 # Install or upgrade our packages
 $PIP install \
---no-cache-dir \
 --no-index \
 --find-links "${WHEEL_DIR}" \
 --upgrade \

-- 
To view, visit https://gerrit.wikimedia.org/r/391268
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I00b96b84821a6bedf0d93400ca31842a0cd4e658
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Remove --no-cache-dir from pip command

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/391268 )

Change subject: Remove --no-cache-dir from pip command
..


Remove --no-cache-dir from pip command

--no-cache-dir is not available for the pip version available in debian
jessie. We could perhaps ship an updated pip version with this deploy
repo, but that doesn't seem necessary yet.

Skipping --no-cache-dir should hopefully have no effect on building the
virtualenv. It's mostly just a stricter guarantee that we really are
installing only wheels from the artifacts directory.
Change-Id: I00b96b84821a6bedf0d93400ca31842a0cd4e658
---
M scap/checks/virtualenv.sh
1 file changed, 0 insertions(+), 1 deletion(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/scap/checks/virtualenv.sh b/scap/checks/virtualenv.sh
index eb05944..55c0a33 100644
--- a/scap/checks/virtualenv.sh
+++ b/scap/checks/virtualenv.sh
@@ -19,7 +19,6 @@
 
 # Install or upgrade our packages
 $PIP install \
---no-cache-dir \
 --no-index \
 --find-links "${WHEEL_DIR}" \
 --upgrade \

-- 
To view, visit https://gerrit.wikimedia.org/r/391268
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I00b96b84821a6bedf0d93400ca31842a0cd4e658
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 
Gerrit-Reviewer: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Fix typo in checks.yaml

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391266 )

Change subject: Fix typo in checks.yaml
..

Fix typo in checks.yaml

Change-Id: Ic261295a7ac7ff5bc155ea01928a08d4e49e7b2e
---
M scap/checks.yaml
1 file changed, 1 insertion(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/66/391266/1

diff --git a/scap/checks.yaml b/scap/checks.yaml
index 3f0ed03..411e399 100644
--- a/scap/checks.yaml
+++ b/scap/checks.yaml
@@ -5,7 +5,7 @@
 timeout: 300
 group: relforge
 command: bash 
/srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh
-virtualenv_analytics
+virtualenv_analytics:
 type: command
 stage: promote
 timeout: 300

-- 
To view, visit https://gerrit.wikimedia.org/r/391266
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ic261295a7ac7ff5bc155ea01928a08d4e49e7b2e
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Fix typo in checks.yaml

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/391266 )

Change subject: Fix typo in checks.yaml
..


Fix typo in checks.yaml

Change-Id: Ic261295a7ac7ff5bc155ea01928a08d4e49e7b2e
---
M scap/checks.yaml
1 file changed, 1 insertion(+), 1 deletion(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/scap/checks.yaml b/scap/checks.yaml
index 3f0ed03..411e399 100644
--- a/scap/checks.yaml
+++ b/scap/checks.yaml
@@ -5,7 +5,7 @@
 timeout: 300
 group: relforge
 command: bash 
/srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh
-virtualenv_analytics
+virtualenv_analytics:
 type: command
 stage: promote
 timeout: 300

-- 
To view, visit https://gerrit.wikimedia.org/r/391266
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ic261295a7ac7ff5bc155ea01928a08d4e49e7b2e
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 
Gerrit-Reviewer: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Setup scap checks for separate server groups

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/391265 )

Change subject: Setup scap checks for separate server groups
..


Setup scap checks for separate server groups

Doesn't look like the 'default' checks run for the other
groups, so define a check per group.

Change-Id: I139d27d7eea21c21592f09266b5d20d909f905ea
---
M scap/checks.yaml
1 file changed, 8 insertions(+), 2 deletions(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/scap/checks.yaml b/scap/checks.yaml
index 825..3f0ed03 100644
--- a/scap/checks.yaml
+++ b/scap/checks.yaml
@@ -1,8 +1,14 @@
 checks:
-virtualenv:
+virtualenv_relforge:
 type: command
 stage: promote
 timeout: 300
-group: default
+group: relforge
+command: bash 
/srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh
+virtualenv_analytics
+type: command
+stage: promote
+timeout: 300
+group: analytics
 command: bash 
/srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh
 

-- 
To view, visit https://gerrit.wikimedia.org/r/391265
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I139d27d7eea21c21592f09266b5d20d909f905ea
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 
Gerrit-Reviewer: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: Setup scap checks for separate server groups

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391265 )

Change subject: Setup scap checks for separate server groups
..

Setup scap checks for separate server groups

Doesn't look like the 'default' checks run for the other
groups, so define a check per group.

Change-Id: I139d27d7eea21c21592f09266b5d20d909f905ea
---
M scap/checks.yaml
1 file changed, 8 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/65/391265/1

diff --git a/scap/checks.yaml b/scap/checks.yaml
index 825..3f0ed03 100644
--- a/scap/checks.yaml
+++ b/scap/checks.yaml
@@ -1,8 +1,14 @@
 checks:
-virtualenv:
+virtualenv_relforge:
 type: command
 stage: promote
 timeout: 300
-group: default
+group: relforge
+command: bash 
/srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh
+virtualenv_analytics
+type: command
+stage: promote
+timeout: 300
+group: analytics
 command: bash 
/srv/deployment/search/mjolnir/deploy/scap/checks/virtualenv.sh
 

-- 
To view, visit https://gerrit.wikimedia.org/r/391265
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I139d27d7eea21c21592f09266b5d20d909f905ea
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: scap.cfg: MjoLniR -> mjolnir

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/391262 )

Change subject: scap.cfg: MjoLniR -> mjolnir
..


scap.cfg: MjoLniR -> mjolnir

Change-Id: I6b17fd077f7794fb3354496207cc9ec19c142d1e
---
M scap/scap.cfg
1 file changed, 1 insertion(+), 1 deletion(-)

Approvals:
  EBernhardson: Verified; Looks good to me, approved



diff --git a/scap/scap.cfg b/scap/scap.cfg
index 23f370e..8913d7a 100644
--- a/scap/scap.cfg
+++ b/scap/scap.cfg
@@ -1,5 +1,5 @@
 [global]
-git_repo: search/MjoLniR/deploy
+git_repo: search/mjolnir/deploy
 ssh_user: deploy-service
 server_groups: analytics, relforge
 analytics_dsh_targets: discovery-analytics

-- 
To view, visit https://gerrit.wikimedia.org/r/391262
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I6b17fd077f7794fb3354496207cc9ec19c142d1e
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 
Gerrit-Reviewer: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] search...deploy[master]: scap.cfg: MjoLniR -> mjolnir

2017-11-14 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/391262 )

Change subject: scap.cfg: MjoLniR -> mjolnir
..

scap.cfg: MjoLniR -> mjolnir

Change-Id: I6b17fd077f7794fb3354496207cc9ec19c142d1e
---
M scap/scap.cfg
1 file changed, 1 insertion(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/search/MjoLniR/deploy 
refs/changes/62/391262/1

diff --git a/scap/scap.cfg b/scap/scap.cfg
index 23f370e..8913d7a 100644
--- a/scap/scap.cfg
+++ b/scap/scap.cfg
@@ -1,5 +1,5 @@
 [global]
-git_repo: search/MjoLniR/deploy
+git_repo: search/mjolnir/deploy
 ssh_user: deploy-service
 server_groups: analytics, relforge
 analytics_dsh_targets: discovery-analytics

-- 
To view, visit https://gerrit.wikimedia.org/r/391262
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I6b17fd077f7794fb3354496207cc9ec19c142d1e
Gerrit-PatchSet: 1
Gerrit-Project: search/MjoLniR/deploy
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[wmf/1.31.0-wmf.7]: Turn off AB test for DBN sizing on enwiki

2017-11-09 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/390289 )

Change subject: Turn off AB test for DBN sizing on enwiki
..

Turn off AB test for DBN sizing on enwiki

Change-Id: I8cb7b54ddd4f7f39c482a183950081006f0262ab
(cherry picked from commit 72028be976dc42a75b8934228070292a3f4dee7a)
---
M modules/all/ext.wikimediaEvents.searchSatisfaction.js
1 file changed, 3 insertions(+), 5 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/89/390289/1

diff --git a/modules/all/ext.wikimediaEvents.searchSatisfaction.js 
b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
index a48355d..7517df7 100644
--- a/modules/all/ext.wikimediaEvents.searchSatisfaction.js
+++ b/modules/all/ext.wikimediaEvents.searchSatisfaction.js
@@ -114,9 +114,7 @@
function initialize( session ) {
 
var sessionId = session.get( 'sessionId' ),
-   validBuckets = mw.config.get( 'wgDBname' ) === 
'enwiki' ?
-   [ 'control', 'dbn20', 'dbn20-i', 
'dbn35', 'dbn35-i' ] :
-   [],
+   validBuckets = [],
sampleSize = ( function () {
var dbName = mw.config.get( 'wgDBname' 
),
// Provides a place to handle 
wiki-specific sampling,
@@ -141,8 +139,8 @@
// .15 increases that 
to 810k per week. Giving
// 160k sessions per 
bucket per week.
enwiki: {
-   test: 0.15,
-   subTest: 0.996
+   test: 2000,
+   subTest: null
},
enwiktionary: {
test: 40,
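
As a sanity check on the sampling comment retained above (a sketch using only the numbers stated in the diff; the 810k figure and the five bucket names are quoted from it):

    # Rough arithmetic behind "810k per week ... 160k sessions per bucket per week".
    weekly_sampled_sessions = 810000
    buckets = ['control', 'dbn20', 'dbn20-i', 'dbn35', 'dbn35-i']
    per_bucket = weekly_sampled_sessions / len(buckets)
    print(int(per_bucket))  # 162000, i.e. roughly the quoted 160k per bucket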

-- 
To view, visit https://gerrit.wikimedia.org/r/390289
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8cb7b54ddd4f7f39c482a183950081006f0262ab
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikimediaEvents
Gerrit-Branch: wmf/1.31.0-wmf.7
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] mediawiki...WikimediaEvents[master]: Turn off AB test for DBN sizing on enwiki

2017-11-09 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/390286 )

Change subject: Turn off AB test for DBN sizing on enwiki
..

Turn off AB test for DBN sizing on enwiki

Change-Id: I8cb7b54ddd4f7f39c482a183950081006f0262ab
---
M modules/ext.wikimediaEvents.searchSatisfaction.js
1 file changed, 3 insertions(+), 5 deletions(-)


  git pull 
ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/WikimediaEvents 
refs/changes/86/390286/1

diff --git a/modules/ext.wikimediaEvents.searchSatisfaction.js 
b/modules/ext.wikimediaEvents.searchSatisfaction.js
index 40b1cc1..d0697cf 100644
--- a/modules/ext.wikimediaEvents.searchSatisfaction.js
+++ b/modules/ext.wikimediaEvents.searchSatisfaction.js
@@ -113,9 +113,7 @@
function initialize( session ) {
 
var sessionId = session.get( 'sessionId' ),
-   validBuckets = mw.config.get( 'wgDBname' ) === 
'enwiki' ?
-   [ 'control', 'dbn20', 'dbn20-i', 
'dbn35', 'dbn35-i' ] :
-   [],
+   validBuckets = [],
sampleSize = ( function () {
var dbName = mw.config.get( 'wgDBname' 
),
// Provides a place to handle 
wiki-specific sampling,
@@ -140,8 +138,8 @@
// .15 increases that 
to 810k per week. Giving
// 160k sessions per 
bucket per week.
enwiki: {
-   test: 0.15,
-   subTest: 0.996
+   test: 2000,
+   subTest: null
},
enwiktionary: {
test: 40,

-- 
To view, visit https://gerrit.wikimedia.org/r/390286
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I8cb7b54ddd4f7f39c482a183950081006f0262ab
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/WikimediaEvents
Gerrit-Branch: master
Gerrit-Owner: EBernhardson 

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] wikimedia...analytics[master]: Fetch inner hits and only the first page

2017-11-08 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/384989 )

Change subject: Fetch inner hits and only the first page
..


Fetch inner hits and only the first page

Change-Id: Ifc2dcb24111bfececa5c448f886f2db3a2b39aff
---
M oozie/query_clicks/hourly/query_clicks_hourly.hql
1 file changed, 4 insertions(+), 2 deletions(-)

Approvals:
  EBernhardson: Looks good to me, approved
  Bearloga: Looks good to me, but someone else must approve
  jenkins-bot: Verified
  Chelsyx: Checked; Looks good to me, but someone else must approve



diff --git a/oozie/query_clicks/hourly/query_clicks_hourly.hql 
b/oozie/query_clicks/hourly/query_clicks_hourly.hql
index 8b825b5..7cb82eb 100644
--- a/oozie/query_clicks/hourly/query_clicks_hourly.hql
+++ b/oozie/query_clicks/hourly/query_clicks_hourly.hql
@@ -138,7 +138,7 @@
 csrs.identity,
 csrs.id AS request_set_token,
 csrs.ts AS timestamp,
-csrs.hits
+get_main_search_request(csrs.wikiid, csrs.requests).hits AS hits
 FROM
 ${source_cirrus_table} csrs
 JOIN
@@ -156,9 +156,11 @@
 -- Make sure we only extract from content index
 AND SIZE(get_main_search_request(csrs.wikiid, csrs.requests).indices) 
== 1
 AND get_main_search_request(csrs.wikiid, csrs.requests).indices[0] 
LIKE '%_content'
+-- Only fetch first page for simplicity
+AND get_main_search_request(csrs.wikiid, csrs.requests).hitsoffset = 0
 -- We only want 'normal' requests here. if the user requested more than
 -- the default 20 results filter them out
-AND SIZE(csrs.hits) <= 20
+AND SIZE(get_main_search_request(csrs.wikiid, csrs.requests).hits) <= 
20
 )
 
 INSERT OVERWRITE TABLE

-- 
To view, visit https://gerrit.wikimedia.org/r/384989
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ifc2dcb24111bfececa5c448f886f2db3a2b39aff
Gerrit-PatchSet: 2
Gerrit-Project: wikimedia/discovery/analytics
Gerrit-Branch: master
Gerrit-Owner: DCausse 
Gerrit-Reviewer: Bearloga 
Gerrit-Reviewer: Chelsyx 
Gerrit-Reviewer: EBernhardson 
Gerrit-Reviewer: jenkins-bot <>

___
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits


[MediaWiki-commits] [Gerrit] wikimedia...analytics[master]: Calculate click data for top queries

2017-11-08 Thread EBernhardson (Code Review)
EBernhardson has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/317019 )

Change subject: Calculate click data for top queries
..


Calculate click data for top queries

Joins search request logs against the web requests table to locate
click throughs on search requests on a daily basis.  This is the first
step in figuring out a way to judge engine relevance based on user click
throughs.

* Applies some of dcausse's approaches to limit the search logs to
  full text searches performed via web on Special:Search.
* Takes +1 hour of webrequest data as compared to search log data, to
  ensure we collect the clicks against searches near the end of the last
  hour
* Applies naive sessionization of queries, defining a new session as
  being more than 30 minutes without a search query from a user
  identity.
* Identities performing more than 1k queries per day are filtered out of
  the daily table for performance reasons. It may be desirable to limit
  even further in sources that consume this data.
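
The 30-minute sessionization rule described in the bullets above is the easiest part to misread, so here is a minimal sketch of the idea (illustrative Python only; the real logic lives in the HQL added by this change, and the field names here are assumptions):

    from itertools import groupby

    SESSION_GAP = 30 * 60  # seconds without a query before a new session starts

    def sessionize(events):
        # events: dicts with 'identity' and 'timestamp' (unix seconds); illustrative only.
        out = []
        events = sorted(events, key=lambda e: (e['identity'], e['timestamp']))
        for identity, group in groupby(events, key=lambda e: e['identity']):
            session, last_ts = 0, None
            for event in group:
                if last_ts is not None and event['timestamp'] - last_ts > SESSION_GAP:
                    session += 1
                last_ts = event['timestamp']
                out.append(dict(event, session_id='%s_%d' % (identity, session)))
        return out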

Bug: T162054
Depends-On: I67d5f0e7674f970b353ab5992fec1431f4592256
Depends-On: I458e7ac724fefe813732b48fcfcef4728359fca9
Change-Id: I09f253849d8a1d28a3c26dc6b0f60233074d6a90
---
A hive/query_clicks/create_query_clicks_daily.hql
A hive/query_clicks/create_query_clicks_hourly.hql
M oozie/datasets.xml
A oozie/query_clicks/daily/coordinator.properties
A oozie/query_clicks/daily/coordinator.xml
A oozie/query_clicks/daily/drop_query_clicks_hourly_partitions.hql
A oozie/query_clicks/daily/query_clicks_daily.hql
A oozie/query_clicks/daily/workflow.xml
A oozie/query_clicks/datasets.xml
A oozie/query_clicks/hourly/coordinator.properties
A oozie/query_clicks/hourly/coordinator.xml
A oozie/query_clicks/hourly/query_clicks_hourly.hql
A oozie/query_clicks/hourly/workflow.xml
13 files changed, 1,178 insertions(+), 0 deletions(-)

Approvals:
  jenkins-bot: Verified
  DCausse: Looks good to me, approved



diff --git a/hive/query_clicks/create_query_clicks_daily.hql 
b/hive/query_clicks/create_query_clicks_daily.hql
new file mode 100644
index 000..1c8ff68
--- /dev/null
+++ b/hive/query_clicks/create_query_clicks_daily.hql
@@ -0,0 +1,19 @@
+CREATE TABLE `discovery.query_clicks_daily`(
+  `query` string,
+  `q_by_ip_day` int,
+  `timestamp` bigint,
+  `wikiid` string,
+  `project` string,
+  `hits` 
array>,
+  `clicks` array>,
+  `session_id` string
+)
+PARTITIONED BY (
+  `year` int,
+  `month` int,
+  `day` int
+)
+STORED AS PARQUET
+LOCATION 'hdfs://analytics-hadoop/wmf/data/discovery/query_clicks/daily'
+;
+
diff --git a/hive/query_clicks/create_query_clicks_hourly.hql 
b/hive/query_clicks/create_query_clicks_hourly.hql
new file mode 100644
index 000..77f5971
--- /dev/null
+++ b/hive/query_clicks/create_query_clicks_hourly.hql
@@ -0,0 +1,20 @@
+CREATE TABLE `discovery.query_clicks_hourly` (
+  `query` string,
+  `ip` string,
+  `identity` string,
+  `timestamp` bigint,
+  `wikiid` string,
+  `project` string,
+  `hits` 
array>,
+  `clicks` array>
+)
+PARTITIONED BY (
+  `year` int,
+  `month` int,
+  `day` int,
+  `hour` int
+)
+STORED AS PARQUET
+LOCATION 'hdfs://analytics-hadoop/wmf/data/discovery/query_clicks/hourly'
+;
+
diff --git a/oozie/datasets.xml b/oozie/datasets.xml
index 68c7ec2..e33f73d 100644
--- a/oozie/datasets.xml
+++ b/oozie/datasets.xml
@@ -31,4 +31,5 @@
 
${popularity_score_data_directory}/agg_days=${days_aggregated}/year=${YEAR}/month=${"$"}{MONTH
 + 0}/day=${"$"}{DAY + 0}
 _SUCCESS
 
+
 
diff --git a/oozie/query_clicks/daily/coordinator.properties 
b/oozie/query_clicks/daily/coordinator.properties
new file mode 100644
index 000..3874b0b
--- /dev/null
+++ b/oozie/query_clicks/daily/coordinator.properties
@@ -0,0 +1,63 @@
+# Configures a coordinator to manage automatically merging
+# query_clicks_hourly into a daily table.
+#
+# Any of the following properties are overidable with -D.
+# Usage:
+#   oozie job -Duser=$USER -Dstart_time=2016-12-01T00:00Z -submit \
+#   -config oozie/query_clicks/daily/coordinator.properties
+#
+# NOTE:  Both *_oozie_directory must be synced to HDFS so that all relevant
+#.xml files exist there when this job is submitted.
+
+# Base path in HDFS to this repository oozie files.
+# Other files will be used relative to this path.
+discovery_oozie_directory = 
${name_node}/wmf/discovery/current/oozie
+
+# Base path in HDFS to the analytics team oozie files.
+# Other files will be used relative to this path
+refinery_directory= ${name_node}/wmf/refinery/current
+analytics_oozie_directory = ${refinery_directory}/oozie
+
+name_node = 

[MediaWiki-commits] [Gerrit] mediawiki...CirrusSearch[wmf/1.31.0-wmf.6]: Try to unify phrase rescore with RescoreBuilder

2017-11-08 Thread EBernhardson (Code Review)
EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/390162 )

Change subject: Try to unify phrase rescore with RescoreBuilder
..

Try to unify phrase rescore with RescoreBuilder

- kill Searcher::installBoosts
- single place to build rescore
- all sort options handled in the same switch/case

benefits: allow to customize positioning of the phrase rescore
drawbacks: rescore profiles need to add a placeholder for the phrase rescore

Bug: T178906
Change-Id: I438153c9fe52d8275868ddf3f0a0bd7a0cc5627f
(cherry picked from commit bc5a8a63929c1e4cbec65ef16b5221c4c1264285)
---
M includes/Query/FullTextQueryStringQueryBuilder.php
M includes/Search/RescoreBuilders.php
M includes/Search/SearchContext.php
M includes/Searcher.php
M profiles/RescoreProfiles.config.php
M tests/unit/fixtures/searchText/ltr_001.query
6 files changed, 97 insertions(+), 80 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/62/390162/1

diff --git a/includes/Query/FullTextQueryStringQueryBuilder.php 
b/includes/Query/FullTextQueryStringQueryBuilder.php
index d1883a5..0c727b3 100644
--- a/includes/Query/FullTextQueryStringQueryBuilder.php
+++ b/includes/Query/FullTextQueryStringQueryBuilder.php
@@ -223,19 +223,12 @@
$rescoreFields = $nonAllFields;
}
 
-   $searchContext->addRescore( [
-   'window_size' => $this->config->get( 
'CirrusSearchPhraseRescoreWindowSize' ),
-   'query' => [
-   'rescore_query' => 
$this->buildPhraseRescoreQuery(
+   $searchContext->setPhraseRescoreQuery( 
$this->buildPhraseRescoreQuery(
$searchContext,
$rescoreFields,
$this->queryStringQueryString,
$this->config->getElement( 
'CirrusSearchPhraseSlop', 'boost' )
-   ),
-   'query_weight' => 1.0,
-   'rescore_query_weight' => 
$this->config->get( 'CirrusSearchPhraseRescoreBoost' ),
-   ]
-   ] );
+   ) );
}
 
if ( $showSuggestion ) {
@@ -269,7 +262,6 @@
'query' => $this->queryStringQueryString,
'default_operator' => 'AND',
] ] ) );
-   $searchContext->clearRescore();
 
return true;
}
@@ -639,9 +631,7 @@
// Queries with the quote already contain a phrase query and we
// can't build phrase queries out of phrase queries at this
// point.
-   if ( $this->config->get( 'CirrusSearchPhraseRescoreBoost' ) > 
0.0 &&
-   $this->config->get( 
'CirrusSearchPhraseRescoreWindowSize' ) &&
-   !$searchContext->isSpecialKeywordUsed() &&
+   if ( !$searchContext->isSpecialKeywordUsed() &&
strpos( $this->queryStringQueryString, '"' ) === false 
&&
( $this->useTokenCountRouter || strpos( 
$this->queryStringQueryString, ' ' ) !== false )
) {
diff --git a/includes/Search/RescoreBuilders.php 
b/includes/Search/RescoreBuilders.php
index f63826d..12d4054 100644
--- a/includes/Search/RescoreBuilders.php
+++ b/includes/Search/RescoreBuilders.php
@@ -52,6 +52,7 @@
 
const FUNCTION_SCORE_TYPE = "function_score";
const LTR_TYPE = "ltr";
+   const PHRASE = "phrase";
 
/**
 * @var SearchContext
@@ -82,11 +83,14 @@
$rescores = [];
foreach ( $this->profile['rescore'] as $rescoreDef ) {
$windowSize = $this->windowSize( $rescoreDef );
+   if ( $windowSize <= 0 ) {
+   continue;
+   }
$rescore = [
'window_size' => $windowSize,
];
 
-   $rescore['query'] = array_intersect_key( $rescoreDef, 
array_flip( self::$rescoreMainParams ) );
+   $rescore['query'] = $this->prepareQueryParams( 
$rescoreDef );
$rescoreQuery = $this->buildRescoreQuery( $rescoreDef );
if ( $rescoreQuery === null ) {
continue;
@@ -111,6 +115,8 @@
return $funcChain->buildRescoreQuery();
case self::LTR_TYPE:
return $this->buildLtrQuery( $rescoreDef['model'] );
+   case self::PHRASE:
+   
