EBernhardson has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/366194 )

Change subject: Team Draft interleaved AB testing
......................................................................

Team Draft interleaved AB testing

Adjusts primary full text search handing to support team
draft interleaving via wgCirrusSearchUserTesting triggers.
This method of AB testing search changes is significantly
more robust to noise, as all users are in both the control
and test buckets. See linked ticket/papers for more information.

Implementation is perhaps not perfect, but for AB testing should
be sufficient. Specifically a way to decorate a ResultSet, rather
than copying all the data over into an extended class would be
useful.

Bug: T150032
Change-Id: I936d2432cc94a67d373f77779ca6550a0f127bf1
---
M CirrusSearch.php
M autoload.php
M docs/settings.txt
M includes/CirrusSearch.php
A includes/Search/InterleavedResultSet.php
M includes/Search/ResultSet.php
A includes/Search/SearchMetricsProvider.php
A includes/Search/TeamDraftInterleaver.php
M includes/Searcher.php
A tests/unit/Search/TeamDraftInterleaverTest.php
10 files changed, 320 insertions(+), 9 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/94/366194/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 19b03a2..6e499e6 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -1477,6 +1477,15 @@
  */
 $wgCirrusSearchEnableArchive = false;
 
+/**
+ * Map of configuration variable name to value used to override cirrus config
+ * during interleaved full text search. Generally this should *not* be set
+ * directly, and instead set via $wgCirrusSearchUserTesting triggers. It is
+ * useful to perform Team-Draft interleaved search experiments to compare the
+ * performance of two different search configurations.
+ */
+$wgCirrusSearchInterleaveConfig = [];
+
 /*
  * Please update docs/settings.txt if you add new values!
  */
diff --git a/autoload.php b/autoload.php
index 85cdb60..1cbae77 100644
--- a/autoload.php
+++ b/autoload.php
@@ -163,6 +163,7 @@
        'CirrusSearch\\Search\\IdResultsType' => __DIR__ . 
'/includes/Search/ResultsType.php',
        'CirrusSearch\\Search\\IncomingLinksFunctionScoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
        'CirrusSearch\\Search\\IntegerIndexField' => __DIR__ . 
'/includes/Search/IntegerIndexField.php',
+       'CirrusSearch\\Search\\InterleavedResultSet' => __DIR__ . 
'/includes/Search/InterleavedResultSet.php',
        'CirrusSearch\\Search\\InvalidRescoreProfileException' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
        'CirrusSearch\\Search\\KeywordIndexField' => __DIR__ . 
'/includes/Search/KeywordIndexField.php',
        'CirrusSearch\\Search\\LangWeightFunctionScoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
@@ -180,8 +181,10 @@
        'CirrusSearch\\Search\\SatuFunctionScoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
        'CirrusSearch\\Search\\ScriptScoreFunctionScoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
        'CirrusSearch\\Search\\SearchContext' => __DIR__ . 
'/includes/Search/SearchContext.php',
+       'CirrusSearch\\Search\\SearchMetricsProvider' => __DIR__ . 
'/includes/Search/SearchMetricsProvider.php',
        'CirrusSearch\\Search\\ShortTextIndexField' => __DIR__ . 
'/includes/Search/ShortTextIndexField.php',
        'CirrusSearch\\Search\\SourceTextIndexField' => __DIR__ . 
'/includes/Search/SourceTextIndexField.php',
+       'CirrusSearch\\Search\\TeamDraftInterleaver' => __DIR__ . 
'/includes/Search/TeamDraftInterleaver.php',
        'CirrusSearch\\Search\\TextIndexField' => __DIR__ . 
'/includes/Search/TextIndexField.php',
        'CirrusSearch\\Search\\TitleHelper' => __DIR__ . 
'/includes/Search/TitleHelper.php',
        'CirrusSearch\\Search\\TitleResultsType' => __DIR__ . 
'/includes/Search/ResultsType.php',
diff --git a/docs/settings.txt b/docs/settings.txt
index 215e48e..55631ad 100644
--- a/docs/settings.txt
+++ b/docs/settings.txt
@@ -1565,3 +1565,14 @@
 
 Whether deletes are indexed for archive search when page is deleted. Note that 
searching
 for archived pages can be done by manually indexing them too.
+
+; $wgCirrusSearchInterleaveConfig
+
+Default:
+       $wgCirrusSearchInterleaveConfig = [];
+
+Map of configuration variable name to value used to override cirrus config
+during interleaved full text search. Generally tis should *not* be set
+directly, and instead set via $wgCirrusSearchUserTesting triggers. It is
+usefull to perform Team-Draft interleaved search experiments to compare the
+performance of two different search configurations.
diff --git a/includes/CirrusSearch.php b/includes/CirrusSearch.php
index 866a8dc..cdfc8d4 100644
--- a/includes/CirrusSearch.php
+++ b/includes/CirrusSearch.php
@@ -5,6 +5,7 @@
 use CirrusSearch\InterwikiSearcher;
 use CirrusSearch\InterwikiResolver;
 use CirrusSearch\Search\FullTextResultsType;
+use CirrusSearch\Search\SearchMetricsProvider;
 use CirrusSearch\Searcher;
 use CirrusSearch\CompletionSuggester;
 use CirrusSearch\Search\ResultSet;
@@ -175,6 +176,9 @@
                        $status = $this->searchTextSecondTry( $term, $status );
                }
                ElasticsearchIntermediary::setResultPages( [ 
$status->getValue() ] );
+               if ( $status->getValue() instanceof SearchMetricsProvider ) {
+                       $this->extraSearchMetrics += 
$status->getValue()->getMetrics();
+               }
 
                return $status;
        }
diff --git a/includes/Search/InterleavedResultSet.php 
b/includes/Search/InterleavedResultSet.php
new file mode 100644
index 0000000..132f311
--- /dev/null
+++ b/includes/Search/InterleavedResultSet.php
@@ -0,0 +1,62 @@
+<?php
+
+namespace CirrusSearch\Search;
+
+use ArrayIterator;
+
+class InterleavedResultSet extends ResultSet implements SearchMetricsProvider {
+       /** @var ArrayIterator */
+       private $interleaved;
+       /** @var string[] Doc ID's belonging to team A */
+       private $teamA;
+       /** @var string[] Doc ID's belonging to team B */
+       private $teamB;
+
+       /**
+        * @param ResultSet $nested Original result set for team A (control)
+        * @param Result[] $interleaved Interleaved results
+        * @param string[] $teamA Document id's belonging to team A
+        * @param string[] $teamB Document id's belonging to team B
+        * @param int $offset Offset of next unused result in team A
+        */
+       public function __construct(
+               ResultSet $nested,
+               array $interleaved,
+               array $teamA,
+               array $teamB,
+               $offset
+       ) {
+               $this->interleaved = new ArrayIterator( $interleaved );
+               $this->teamA = $teamA;
+               $this->teamB = $teamB;
+               $this->offset = $offset;
+
+               $nested->copyTo( $this );
+       }
+
+       public function next() {
+               $current = $this->interleaved->current();
+               if ( $current ) {
+                       $this->interleaved->next();
+                       return $current;
+               }
+               return false;
+       }
+
+       public function rewind() {
+               $this->interleaved->rewind();
+       }
+
+       public function getMetrics() {
+               return [
+                       'wgCirrusSearchTeamDraft' => [
+                               'a' => $this->teamA,
+                               'b' => $this->teamB,
+                       ],
+               ];
+       }
+
+       public function getOffset() {
+               return $this->offset;
+       }
+}
diff --git a/includes/Search/ResultSet.php b/includes/Search/ResultSet.php
index 34a0e49..901378b 100644
--- a/includes/Search/ResultSet.php
+++ b/includes/Search/ResultSet.php
@@ -29,47 +29,47 @@
        /**
         * @var \Elastica\ResultSet
         */
-       private $result;
+       protected $result;
 
        /**
         * @var int
         */
-       private $hits;
+       protected $hits;
 
        /**
         * @var int
         */
-       private $totalHits;
+       protected $totalHits;
 
        /**
         * @var string|null
         */
-       private $suggestionQuery;
+       protected $suggestionQuery;
 
        /**
         * @var string|null
         */
-       private $suggestionSnippet;
+       protected $suggestionSnippet;
 
        /**
         * @var bool
         */
-       private $searchContainedSyntax;
+       protected $searchContainedSyntax;
 
        /**
         * @var array
         */
-       private $interwikiResults = [];
+       protected $interwikiResults = [];
 
        /**
         * @var string|null
         */
-       private $rewrittenQuery;
+       protected $rewrittenQuery;
 
        /**
         * @var string|null
         */
-       private $rewrittenQuerySnippet;
+       protected $rewrittenQuerySnippet;
 
        /**
         * @param string[] $suggestPrefixes
@@ -103,6 +103,28 @@
        }
 
        /**
+        * Copy object state into another object
+        *
+        * Copies the state of this object into another class
+        * (likely extendde from this class). Used in place of a decorator
+        * because core does not expose an interface for this, and we cannot
+        * otherwise satisfy type constraints matching this class.
+        *
+        * @param ResultSet $other
+        */
+       protected function copyTo( ResultSet $other ) {
+               $other->result = $this->result;
+               $other->hits = $this->hits;
+               $other->totalHits = $this->totalHits;
+               $other->suggestionQuery = $this->suggestionQuery;
+               $other->suggestionSnippet = $this->suggestionSnippet;
+               $other->searchContainedSyntax = $this->searchContainedSyntax;
+               $other->interwikiResults = $this->interwikiResults;
+               $other->rewrittenQuery = $this->rewrittenQuery;
+               $other->rewrittenQuerySnippet = $this->rewrittenQuerySnippet;
+       }
+
+       /**
         * Is rewriting this query OK?
         *
         * @param int $threshold Minimum number of results to reach before 
rewriting is not allowed.
diff --git a/includes/Search/SearchMetricsProvider.php 
b/includes/Search/SearchMetricsProvider.php
new file mode 100644
index 0000000..b0cd6ab
--- /dev/null
+++ b/includes/Search/SearchMetricsProvider.php
@@ -0,0 +1,7 @@
+<?php
+
+namespace CirrusSearch\Search;
+
+interface SearchMetricsProvider {
+       public function getMetrics();
+}
diff --git a/includes/Search/TeamDraftInterleaver.php 
b/includes/Search/TeamDraftInterleaver.php
new file mode 100644
index 0000000..114598c
--- /dev/null
+++ b/includes/Search/TeamDraftInterleaver.php
@@ -0,0 +1,101 @@
+<?php
+
+namespace CirrusSearch\Search;
+
+/**
+ * Implementation of algorithm 2, Team-Draft Interleaving, from F. Radlinski, 
M.
+ * Kurup, and T. Joachims. How does clickthrough data reflect retrieval
+ * quality? In Proc 17th Intl. Conf. on Information and Knowledge Management.
+ * ACM New York, NY, USA, 2008.
+ */
+class TeamDraftInterleaver {
+       /**
+        * @param ResultSet $a Results for team A
+        * @param ResultSet $b Results for team B
+        * @param int $limit Maximum number of results to return
+        * @return ResultSet
+        */
+       public function interleave( ResultSet $a, ResultSet $b, $limit ) {
+               $aResults = $this->extractResults( $a );
+               $bResults = $this->extractResults( $b );
+               list( $interleaved, $teamA, $teamB, $aOffset ) = 
$this->interleaveResults(
+                       $aResults,
+                       $bResults,
+                       $limit
+               );
+
+               return new InterleavedResultSet( $a, $interleaved, $teamA, 
$teamB, $aOffset );
+       }
+
+       private function extractResults( ResultSet $results ) {
+               $results->rewind();
+               $res = [];
+               while ( true ) {
+                       $next = $results->next();
+                       if ( $next === false ) {
+                               break;
+                       }
+                       $res[$next->getDocId()] = $next;
+               }
+               return $res;
+       }
+
+       /**
+        * Interleave two arrays using team draft interleaving. Only public
+        * for unit tests. The id's used as keys in $a and $b must represent
+        * the same thing, to prevent duplicates in the returned results.
+        *
+        * @param mixed[] $a Map from id to anything in preferred order for 
team A
+        * @param mixed[] $b Map from id to anything in preferred order for 
team B
+        * @param int $limit Maximum number of results to interleave
+        * @return int[][] Three item array, first item being the values from $a
+        *  and $b in the interleaved ordering, second the ids belonging to team
+        *  A, and third the ids belonging to team B.
+        */
+       public static function interleaveResults($a, $b, $limit) {
+               $interleaved = [];
+               $teamA = [];
+               $teamB = [];
+               $aIds = array_combine( array_keys( $a ), array_keys( $a ) );
+               $bIds = array_combine( array_keys( $b ), array_keys( $b ) );
+               while ( count( $aIds ) && count( $bIds ) ) {
+                       // If team b has already chosen, then a needs to go
+                       if ( count( $teamA ) < count( $teamB )
+                               || (
+                               // If the counts are equal choose randomly 
which team goes next
+                                       count( $teamA ) == count( $teamB )
+                                       && mt_rand( 0, 1 ) === 1
+                       ) ) {
+                               $id = reset( $aIds );
+                               $teamA[] = $id;
+                               $interleaved[] = $a[$id];
+                       } else {
+                               $id = reset( $bIds );
+                               $teamB[] = $id;
+                               $interleaved[] = $b[$id];
+                       }
+                       unset( $aIds[$id], $bIds[$id] );
+                       if ( count( $interleaved ) >= $limit ) {
+                               break;
+                       }
+               }
+
+               if ( $aIds ) {
+                       // $offset needs to be set such that results starting 
at $offset +
+                       // $limit are new. As such if we have 20 items and 10 
of a are
+                       // used, then offset should be -10. If 15 are used we 
want -5.
+                       //
+                       // This doesn't guarantee we don't show duplicates. 
Items from B
+                       // could be in second page of A. The goal here is to 
ensure
+                       // everything from A is seen when paginating even if 
there are
+                       // dupes.
+                       $nextA = reset( $aIds );
+                       $nextAIdx = array_search( $nextA, array_keys( $a ) );
+                       $offset = -( $limit - $nextAIdx );
+               } else {
+                       $offset = 0;
+               }
+
+               return [$interleaved, $teamA, $teamB, $offset];
+       }
+}
diff --git a/includes/Searcher.php b/includes/Searcher.php
index f0bcc1b..03dd3fc 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -8,6 +8,8 @@
 use CirrusSearch\Search\ResultsType;
 use CirrusSearch\Search\RescoreBuilder;
 use CirrusSearch\Search\SearchContext;
+use CirrusSearch\Search\ResultSet;
+use CirrusSearch\Search\TeamDraftInterleaver;
 use CirrusSearch\Query\FullTextQueryBuilder;
 use CirrusSearch\Elastica\MultiSearch as MultiSearch;
 use Elastica\Exception\RuntimeException;
@@ -155,6 +157,10 @@
                $this->indexBaseName = $index ?: $config->get( 
SearchConfig::INDEX_BASE_NAME );
                $this->language = $config->get( 'ContLang' );
                $this->searchContext = new SearchContext( $this->config, 
$namespaces );
+       }
+
+       public function __clone() {
+               $this->searchContext = clone $this->searchContext;
        }
 
        /**
@@ -394,7 +400,47 @@
                        return $checkLengthStatus;
                }
 
+               $searches = [];
                $qb = $this->buildFullTextSearch( $term, $showSuggestion );
+               $searches[] = $this->buildSearch();
+
+               if ( !$this->searchContext->areResultsPossible() ) {
+                       return Status::newGood( new SearchResultSet( true ) );
+               }
+
+               $interleave = $this->config->get( 
'CirrusSearchInterleaveConfig' );
+               // Only interleave the first page of results.
+               if ( !$this->offset && $interleave ) {
+                       $otherSearcher = clone $this;
+                       $otherSearcher->config = $this->buildInterleaveConfig( 
$interleave );
+                       $otherSearcher->buildFullTextSearch( $term, 
$showSuggestion );
+                       $searches[] = $otherSearcher->buildSearch();
+               }
+
+               $result = $this->searchMulti( $searches );
+               if ( !$result->isOK() ) {
+                       return $result;
+               }
+
+               if ( $interleave ) {
+                       // Evil hax to support cirrusDumpResult. This is 
probably
+                       // very fragile.
+                       $value = $result->getValue();
+                       if ( $value[0] instanceof ResultSet ) {
+                               $interleaver = new TeamDraftInterleaver();
+                               $response = $interleaver->interleave( 
$value[0], $value[1], $this->limit );
+                       } else {
+                               $response = $value;
+                       }
+               } else {
+                       // Convert array of responses to single value
+                       $value = $result->getValue();
+                       $response = reset( $value );
+               }
+               $result->setResult( true, $response );
+
+               return $result;
+
 
                $status = $this->searchOne();
                if ( !$status->isOK() && ElasticaErrorHandler::isParseError( 
$status ) ) {
@@ -1135,4 +1181,14 @@
                return $this->searchOne();
        }
 
+       private function buildInterleaveConfig( array $interleave ) {
+               $overrides = [];
+               if ( isset( $interleave['global'] ) ) {
+                       $overrides += $interleave['global'];
+               }
+               if ( isset( $interleave['rescore'] ) ) {
+                       $overrides['CirrusSearchRescoreProfiles'] = 
$interleave['rescore'] + $this->config->get( 'CirrusSearchRescoreProfiles' );
+               }
+               return new HashSearchConfig( $overrides, ['inherit'] );
+       }
 }
diff --git a/tests/unit/Search/TeamDraftInterleaverTest.php 
b/tests/unit/Search/TeamDraftInterleaverTest.php
new file mode 100644
index 0000000..47ad990
--- /dev/null
+++ b/tests/unit/Search/TeamDraftInterleaverTest.php
@@ -0,0 +1,36 @@
+<?php
+
+namespace CirrusSearch\Search;
+
+class TeamDraftInterleaverTest extends \PHPUnit_Framework_TestCase {
+       public function testInterleaveResults() {
+               // Construct some pointless array with all overlapping values
+               $limit = 20;
+               $a = range( 5, 5 + $limit - 1 );
+               $b = range( 1, 1 + $limit - 1 );
+               $a = array_combine( $a, $a );
+               $b = array_combine( $b, $b );
+
+               for ( $i = 0; $i < 10; ++$i ) {
+                       // Use a constant seed to allow determinism
+                       mt_srand( 12345 * $i );
+                       list( $interleave, $teamA, $teamB, $aOffset ) = 
TeamDraftInterleaver::interleaveResults( $a, $b, $limit );
+                       // Very basic assertions about the shape of results
+                       $this->assertCount( 10, $teamA );
+                       $this->assertCount( 10, $teamB );
+                       $this->assertCount( 0, array_intersect( $teamA, $teamB 
) );
+                       $this->assertCount( $limit, $interleave );
+
+                       // Verify offset is last used iteam in a. Note that 
this isn't
+                       // perfect, there could be items in a after this that 
were
+                       // used by B, but it's good enough.
+                       // Remember that offset is < 0, because its a number 
where
+                       // the next page starts at $offset + $limit (0 indexed).
+                       $nextIdx = $aOffset + $limit;
+                       $last = array_slice( $a, $nextIdx - 1, 1);
+                       $this->assertCount( 1, array_intersect( $interleave, 
$last ) );
+                       $next = array_slice( $a, $nextIdx, 1);
+                       $this->assertCount( 0, array_intersect( $interleave, 
$next ) );
+               }
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/366194
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I936d2432cc94a67d373f77779ca6550a0f127bf1
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to