DCausse has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/368771 )

Change subject: Refactor ordering of crossproject blocks
......................................................................

Refactor ordering of crossproject blocks

to allow fine-grained configuration.

Introduce yet another profile system for ordering cross project
blocks. It allows fine-tuning the ordering of the blocks based on
different scoring implementations:
- recall
- static
- random

and also permits combining them using a composite implementation
(weighted sum).
It should then be easier to add other implementations such as a
scoring method based on the density of the highlighted snippets.

Bug: T171803
Change-Id: I9618b9e9bee51d91a9ec931725abcd39eb730b39
---
M CirrusSearch.php
M autoload.php
M includes/InterwikiSearcher.php
A includes/Search/CrossProjectBlockScorer.php
A profiles/CrossProjectBlockScorerProfiles.config.php
A profiles/CrossProjectBlockScorerProfiles.php
A tests/unit/Search/CrossProjectBlockScorerTest.php
7 files changed, 372 insertions(+), 32 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/71/368771/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 4e471d1..ce94339 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -27,6 +27,7 @@
 require_once __DIR__ . "/profiles/SimilarityProfiles.php";
 require_once __DIR__ . "/profiles/SaneitizeProfiles.php";
 require_once __DIR__ . "/profiles/FullTextQueryBuilderProfiles.config.php";
+require_once __DIR__ . "/profiles/CrossProjectBlockScorerProfiles.config.php";
 
 $wgExtensionCredits['other'][] = [
        'path'           => __FILE__,
diff --git a/autoload.php b/autoload.php
index 3e5f32d..6b8cdda 100644
--- a/autoload.php
+++ b/autoload.php
@@ -34,6 +34,7 @@
        'CirrusSearch\\CompletionRequestLog' => __DIR__ . 
'/includes/CompletionRequestLog.php',
        'CirrusSearch\\CompletionSuggester' => __DIR__ . 
'/includes/CompletionSuggester.php',
        'CirrusSearch\\Connection' => __DIR__ . '/includes/Connection.php',
+       'CirrusSearch\\CrossProjectBlockScorerProfiles' => __DIR__ . 
'/profiles/CrossProjectBlockScorerProfiles.php',
        'CirrusSearch\\DataSender' => __DIR__ . '/includes/DataSender.php',
        'CirrusSearch\\Dump' => __DIR__ . '/includes/Dump.php',
        'CirrusSearch\\ElasticaErrorHandler' => __DIR__ . 
'/includes/ElasticaErrorHandler.php',
@@ -150,6 +151,9 @@
        'CirrusSearch\\Search\\BoostTemplatesFunctionScoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
        'CirrusSearch\\Search\\CirrusIndexField' => __DIR__ . 
'/includes/Search/CirrusIndexField.php',
        'CirrusSearch\\Search\\CirrusSearchIndexFieldFactory' => __DIR__ . 
'/includes/Search/CirrusSearchIndexFieldFactory.php',
+       'CirrusSearch\\Search\\CompositeCrossProjectBlockScorer' => __DIR__ . 
'/includes/Search/CrossProjectBlockScorer.php',
+       'CirrusSearch\\Search\\CrossProjectBlockScorer' => __DIR__ . 
'/includes/Search/CrossProjectBlockScorer.php',
+       'CirrusSearch\\Search\\CrossProjectBlockScorerFactory' => __DIR__ . 
'/includes/Search/CrossProjectBlockScorer.php',
        'CirrusSearch\\Search\\CustomFieldFunctionScoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
        'CirrusSearch\\Search\\DatetimeIndexField' => __DIR__ . 
'/includes/Search/DatetimeIndexField.php',
        'CirrusSearch\\Search\\EmptyResultSet' => __DIR__ . 
'/includes/Search/EmptyResultSet.php',
@@ -175,6 +179,8 @@
        'CirrusSearch\\Search\\NumberIndexField' => __DIR__ . 
'/includes/Search/NumberIndexField.php',
        'CirrusSearch\\Search\\OpeningTextIndexField' => __DIR__ . 
'/includes/Search/OpeningTextIndexField.php',
        'CirrusSearch\\Search\\PreferRecentFunctionScoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
+       'CirrusSearch\\Search\\RandomCrossProjectBlockScorer' => __DIR__ . 
'/includes/Search/CrossProjectBlockScorer.php',
+       'CirrusSearch\\Search\\RecallCrossProjectBlockScorer' => __DIR__ . 
'/includes/Search/CrossProjectBlockScorer.php',
        'CirrusSearch\\Search\\RescoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
        'CirrusSearch\\Search\\Result' => __DIR__ . 
'/includes/Search/Result.php',
        'CirrusSearch\\Search\\ResultSet' => __DIR__ . 
'/includes/Search/ResultSet.php',
@@ -185,6 +191,7 @@
        'CirrusSearch\\Search\\SearchMetricsProvider' => __DIR__ . 
'/includes/Search/SearchMetricsProvider.php',
        'CirrusSearch\\Search\\ShortTextIndexField' => __DIR__ . 
'/includes/Search/ShortTextIndexField.php',
        'CirrusSearch\\Search\\SourceTextIndexField' => __DIR__ . 
'/includes/Search/SourceTextIndexField.php',
+       'CirrusSearch\\Search\\StaticCrossProjectBlockScorer' => __DIR__ . 
'/includes/Search/CrossProjectBlockScorer.php',
        'CirrusSearch\\Search\\TeamDraftInterleaver' => __DIR__ . 
'/includes/Search/TeamDraftInterleaver.php',
        'CirrusSearch\\Search\\TextIndexField' => __DIR__ . 
'/includes/Search/TextIndexField.php',
        'CirrusSearch\\Search\\TitleHelper' => __DIR__ . 
'/includes/Search/TitleHelper.php',
diff --git a/includes/InterwikiSearcher.php b/includes/InterwikiSearcher.php
index 09527d1..d9b241d 100644
--- a/includes/InterwikiSearcher.php
+++ b/includes/InterwikiSearcher.php
@@ -2,6 +2,7 @@
 
 namespace CirrusSearch;
 
+use CirrusSearch\Search\CrossProjectBlockScorerFactory;
 use CirrusSearch\Search\FullTextResultsType;
 use CirrusSearch\Search\ResultSet;
 use CirrusSearch\Search\SearchContext;
@@ -153,38 +154,8 @@
                        return $retval;
                }
 
-               switch ( $this->config->get( 'CirrusSearchCrossProjectOrder' ) 
) {
-               case 'recall':
-                       uasort( $retval, function ( $a, $b ) {
-                               return $b->getTotalHits() - $a->getTotalHits();
-                       } );
-                       return $retval;
-               case 'random':
-                       // reset the random number generator
-                       // take the first 8 chars from the md5 to build a uint32
-                       // and to prevent hexdec from returning floats
-                       mt_srand( hexdec( substr( Util::generateIdentToken(), 
0, 8 ) ) );
-                       $sortKeys = array_map(
-                               function () {
-                                       return mt_rand();
-                               },
-                               $retval
-                       );
-                       // "Randomly" sort crossproject results
-                       // Should give the same order for the same identity
-                       array_multisort( $sortKeys, SORT_ASC, $retval );
-                       return $retval;
-               case 'static':
-                       return $retval;
-               default:
-                       LoggerFactory::getInstance( 'CirrusSearch' )->warning(
-                               'wgCirrusSearchCrossProjectOrder is set to ' .
-                               'unkown value {invalid_order} using static ' .
-                               'instead.',
-                               [ 'invalid_order' => $this->config->get( 
'CirrusSearchCrossProjectOrder' ) ]
-                       );
-                       return $retval;
-               }
+               return CrossProjectBlockScorerFactory::load( $this->config )
+                       ->reorder( $retval );
        }
 
        /**
diff --git a/includes/Search/CrossProjectBlockScorer.php 
b/includes/Search/CrossProjectBlockScorer.php
new file mode 100644
index 0000000..64c9765
--- /dev/null
+++ b/includes/Search/CrossProjectBlockScorer.php
@@ -0,0 +1,151 @@
+<?php
+
+namespace CirrusSearch\Search;
+
+use CirrusSearch\Util;
+use CirrusSearch\SearchConfig;
+
+/**
+ * Score an interwiki block
+ *
+ * Base class of the scorers used to order the blocks of crossproject
+ * search results: subclasses implement score() and reorder() sorts the
+ * blocks by descending score.
+ */
+abstract class CrossProjectBlockScorer {
+       /**
+        * Compute a score for a given block of crossproject searchresults
+        * @param string $prefix key of the block in the array given to reorder()
+        * @param ResultSet $results results of this block
+        * @return int|float the score, blocks with a higher score are sorted first
+        */
+       abstract public function score( $prefix, ResultSet $results );
+
+       /**
+        * Reorder crossproject blocks by descending score.
+        *
+        * Entries that are not a ResultSet are given a score of -1.0.
+        * Blocks with equal scores keep their relative input order.
+        *
+        * @param array $resultsets blocks keyed by prefix, each one a ResultSet
+        *  or an empty array if the search was disabled
+        * @return array the reordered blocks; string keys are kept, numeric
+        *  keys are re-indexed by array_multisort
+        */
+       public function reorder( array $resultsets ) {
+               $sortKeys = [];
+               $positions = [];
+               foreach ( $resultsets as $pref => $results ) {
+                       if ( $results instanceof ResultSet ) {
+                               $sortKeys[] = $this->score( $pref, $results );
+                       } else {
+                               $sortKeys[] = -1.0;
+                       }
+                       $positions[] = count( $positions );
+               }
+               // array_multisort breaks ties on one array by comparing the next
+               // one. Without $positions equal scores would be resolved by
+               // comparing the ResultSet objects themselves, so the order of tied
+               // blocks would depend on object contents instead of input order.
+               array_multisort( $sortKeys, SORT_DESC, $positions, SORT_ASC, $resultsets );
+               return $resultsets;
+       }
+}
+
+/**
+ * Factory that reads cirrus config and builds a CrossProjectBlockScorer
+ *
+ * load() reads the profile name from CirrusSearchCrossProjectOrder, fetches
+ * that entry of CirrusSearchCrossProjectBlockScorerProfiles and hands its
+ * 'type' and optional 'settings' (default: empty array) to loadScorer().
+ * loadScorer() maps a type name (composite, random, recall, static) to the
+ * matching scorer class and passes it the settings array.
+ * Both throw a \RuntimeException on invalid configuration: profile not
+ * found or empty, profile without 'type', or unknown type.
+ */
+class CrossProjectBlockScorerFactory {
+       public static function load( SearchConfig $config ) {
+               $profileName = $config->get( 'CirrusSearchCrossProjectOrder' );
+               $profile = $config->getElement( 
'CirrusSearchCrossProjectBlockScorerProfiles', $profileName );
+               if ( !$profile ) {
+                       throw new \RuntimeException( 'Unknown 
CrossProjectBlockScorer profile : ' . $profileName );
+               }
+               if ( !isset( $profile['type'] ) ) {
+                       throw new \RuntimeException( "Invalid 
CrossProjectBlockScorer profile $profileName, 'type' must be set" );
+               }
+               return static::loadScorer( $profile['type'], isset ( 
$profile['settings'] ) ? $profile['settings'] : [] );
+       }
+
+       public static function loadScorer( $type, array $config ) {
+               switch ( $type ) {
+               case 'composite':
+                       return new CompositeCrossProjectBlockScorer( $config );
+               case 'random':
+                       return new RandomCrossProjectBlockScorer( $config );
+               case 'recall':
+                       return new RecallCrossProjectBlockScorer( $config );
+               case 'static':
+                       return new StaticCrossProjectBlockScorer( $config );
+               default:
+                       throw new \RuntimeException( 'Unknown 
CrossProjectBlockScorer type : ' . $type );
+               }
+       }
+}
+
+/**
+ * Randomly ordered but consistent for a single user
+ *
+ * The constructor seeds the mt_rand() generator with the first 8 hex chars
+ * of Util::generateIdentToken() (8 chars build a uint32 and prevent hexdec()
+ * from returning a float); score() then returns mt_rand() and ignores its
+ * arguments. $settings is unused.
+ * NOTE: mt_srand() reseeds the generator for the whole process, not only
+ * for this scorer.
+ */
+class RandomCrossProjectBlockScorer extends CrossProjectBlockScorer {
+       public function __construct( array $settings ) {
+               mt_srand( hexdec( substr( Util::generateIdentToken(), 0, 8 ) ) 
);
+       }
+
+       public function score( $prefix, ResultSet $results ) {
+               return mt_rand();
+       }
+}
+
+/**
+ * Score based on total hits : log(total_hits + 2)
+ *
+ * log() is the natural logarithm, so a block without any hit still gets a
+ * positive score (log(2)). $prefix and $settings are unused.
+ */
+class RecallCrossProjectBlockScorer extends CrossProjectBlockScorer {
+       public function __construct( array $settings ) {
+       }
+
+       public function score( $prefix, ResultSet $results ) {
+               return log( $results->getTotalHits() + 2 );
+       }
+}
+
+/**
+ * Based on a static config, allows to give a fixed score to a particular
+ * wiki
+ *
+ * $settings maps a wiki prefix to its score. The special key '__default__'
+ * holds the score of the prefixes that are not listed; it is set to 1 when
+ * $settings does not provide it.
+ */
+class StaticCrossProjectBlockScorer extends CrossProjectBlockScorer {
+       /**
+        * static weights: prefix => score, always contains '__default__'
+        */
+       private $staticScores;
+
+       public function __construct( array $settings ) {
+               $this->staticScores = $settings + [ '__default__' => 1 ];
+       }
+
+       /**
+        * @inheritDoc
+        * Returns the score configured for $prefix, or the '__default__'
+        * score when $prefix has none. $results is ignored.
+        */
+       public function score( $prefix, ResultSet $results ) {
+               $staticScoreKey = '__default__';
+               if ( isset( $this->staticScores[$prefix] ) ) {
+                       $staticScoreKey = $prefix;
+               }
+               return $this->staticScores[$staticScoreKey];
+       }
+}
+
+/**
+ * Composite, weighted sum of a list of subscorers
+ *
+ * $settings is keyed by scorer type (as accepted by
+ * CrossProjectBlockScorerFactory::loadScorer()), each value may define:
+ * - 'weight': multiplier applied to the score of the subscorer (default 1)
+ * - 'settings': settings passed to the subscorer (default empty array)
+ * Since the type is the array key a given type can only be used once.
+ */
+class CompositeCrossProjectBlockScorer extends CrossProjectBlockScorer {
+       private $scorers = [];
+
+       public function __construct( array $settings ) {
+               foreach ( $settings as $type => $subSettings ) {
+                       $weight = isset ( $subSettings['weight'] ) ? 
$subSettings['weight'] : 1;
+                       $scorerSettings = isset( $subSettings['settings'] ) ? 
$subSettings['settings'] : [];
+                       $scorer = CrossProjectBlockScorerFactory::loadScorer( 
$type, $scorerSettings );
+                       $this->scorers[] = [
+                               'weight' => $weight,
+                               'scorer' => $scorer,
+                       ];
+               }
+       }
+
+       /**
+        * @inheritDoc
+        * Returns the sum of weight * score over all the subscorers, 0 when
+        * no subscorer is configured.
+        */
+       public function score( $prefix, ResultSet $results ) {
+               $score = 0;
+               foreach ( $this->scorers as $scorer ) {
+                       $score += $scorer['weight'] * $scorer['scorer']->score( 
$prefix, $results );
+               }
+               return $score;
+       }
+}
diff --git a/profiles/CrossProjectBlockScorerProfiles.config.php 
b/profiles/CrossProjectBlockScorerProfiles.config.php
new file mode 100644
index 0000000..7d1eb2f
--- /dev/null
+++ b/profiles/CrossProjectBlockScorerProfiles.config.php
@@ -0,0 +1,73 @@
+<?php
+
+namespace CirrusSearch;
+
+/**
+ * CirrusSearch - List of CrossProjectBlockScorer profiles used to order the
+ * blocks of crossproject search results.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+/**
+ * Profiles to control ordering of blocks of CrossProject searchresults.
+ *
+ * key is the profile name used in wgCirrusSearchCrossProjectOrder
+ * value is array where
+ * - 'type' the scorer to use (static, recall, random, composite)
+ * - settings is scorer specific config
+ */
+$wgCirrusSearchCrossProjectBlockScorerProfiles = [
+       // static ordering, scores are provided in the 'settings' key
+       // with a score (value) per wiki prefix (key)
+       'static' => [
+               'type' => 'static',
+       ],
+
+       // ordered by recall (total hits)
+       'recall' => [
+               'type' => 'recall',
+       ],
+
+       // randomly ordered
+       'random' => [
+               'type' => 'random',
+       ],
+
+       // Example profile for WMF english wikipedia
+       // - wiktionary always first
+       // - wikibooks last, unless it has ~8000x (e^9) more hits than another block
+       // - others are ordered by recall
+       // wikt will be : (1 * 1) + (0.01 * log(total_hits + 2))
+       // wikibooks : (1 * 0.01) + (0.01 * log(total_hits + 2))
+       // others : (1 * 0.1) + (0.01 * log(total_hits + 2))
+       'wmf_enwiki' => [
+               'type' => 'composite',
+               'settings' => [
+                       'recall' => [
+                               'weight' => 0.01,
+                       ],
+                       'static' => [
+                               'weight' => 1,
+                               'settings' => [
+                                       '__default__' => 0.1,
+                                       'wikt' => 1,
+                                       'b' => 0.01,
+                               ],
+                       ],
+               ],
+       ],
+];
diff --git a/profiles/CrossProjectBlockScorerProfiles.php 
b/profiles/CrossProjectBlockScorerProfiles.php
new file mode 100644
index 0000000..d433d1a
--- /dev/null
+++ b/profiles/CrossProjectBlockScorerProfiles.php
@@ -0,0 +1,33 @@
+<?php
+
+namespace CirrusSearch;
+
+use WebRequest;
+
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+/**
+ * Allows the profile used to order crossproject blocks to be overridden
+ * from the request.
+ */
+class CrossProjectBlockScorerProfiles {
+       /**
+        * Set $wgCirrusSearchCrossProjectOrder to the value of the
+        * cirrusCrossProjectOrderProfile request param when it names a
+        * profile defined in $wgCirrusSearchCrossProjectBlockScorerProfiles.
+        * Missing or unknown profile names leave the config untouched.
+        *
+        * @param WebRequest $request
+        */
+       public static function overrideOptions( WebRequest $request ) {
+               // The global must be spelled exactly like the one declared in
+               // CrossProjectBlockScorerProfiles.config.php ("Scorer", not
+               // "Score"), otherwise isset() below is always false.
+               global $wgCirrusSearchCrossProjectBlockScorerProfiles,
+                       $wgCirrusSearchCrossProjectOrder;
+               $profile = $request->getVal( 'cirrusCrossProjectOrderProfile' );
+               if ( $profile !== null && isset( $wgCirrusSearchCrossProjectBlockScorerProfiles[$profile] ) ) {
+                       $wgCirrusSearchCrossProjectOrder = $profile;
+               }
+       }
+}
diff --git a/tests/unit/Search/CrossProjectBlockScorerTest.php 
b/tests/unit/Search/CrossProjectBlockScorerTest.php
new file mode 100644
index 0000000..8a4845c
--- /dev/null
+++ b/tests/unit/Search/CrossProjectBlockScorerTest.php
@@ -0,0 +1,104 @@
+<?php
+
+namespace CirrusSearch\Search;
+
+use CirrusSearch\HashSearchConfig;
+
+/**
+ * Tests the ordering produced by the CrossProjectBlockScorer implementations.
+ * Every fixture contains a 'broken' entry that is not a ResultSet and is
+ * expected to be sorted last by the deterministic scorers.
+ */
+class CrossProjectBlockScorerTest extends \PHPUnit_Framework_TestCase {
+       public function testRecallScorer() {
+               $retval = [
+                       'b' => $this->mockRS( 5 ),
+                       'wikt' => $this->mockRS( 10 ),
+                       'broken' => [],
+                       'voy' => $this->mockRS( 15 ),
+               ];
+               $scorer = new RecallCrossProjectBlockScorer( [] );
+               $reordered = $scorer->reorder( $retval );
+               // assertEquals takes the expected value first
+               $this->assertEquals( [ 'voy', 'wikt', 'b', 'broken' ], array_keys( $reordered ) );
+       }
+
+       public function testStatic() {
+               $retval = [
+                       'b' => $this->mockRS( 5 ),
+                       'wikt' => $this->mockRS( 1 ),
+                       'broken' => [],
+                       'voy' => $this->mockRS( 2 ),
+               ];
+               $scorer = new StaticCrossProjectBlockScorer( [
+                       'b' => 0.1,
+                       'wikt' => 0.2,
+                       'voy' => 0.3,
+                       '__default__' => 0.01,
+               ] );
+               $reordered = $scorer->reorder( $retval );
+               $this->assertEquals( [ 'voy', 'wikt', 'b', 'broken' ], array_keys( $reordered ) );
+       }
+
+       public function testRandom() {
+               $retval = [
+                       'b' => $this->mockRS( 5 ),
+                       'wikt' => $this->mockRS( 1 ),
+                       'broken' => [],
+                       'voy' => $this->mockRS( 2 ),
+               ];
+               $scorer = new RandomCrossProjectBlockScorer( [] );
+               $reordered = $scorer->reorder( $retval );
+               // not sure how to test randomness...
+               // let's just make sure that all keys are here
+               foreach ( $retval as $k => $v ) {
+                       $this->assertArrayHasKey( $k, $reordered );
+               }
+       }
+
+       public function testComposite() {
+               $retval = [
+                       'b' => $this->mockRS( 5000000 ),
+                       'wikt' => $this->mockRS( 1 ),
+                       'broken' => [],
+                       'voy' => $this->mockRS( 1000500 ),
+                       's' => $this->mockRS( 1020450 ),
+               ];
+               $scorer = new CompositeCrossProjectBlockScorer( [
+                       'static' => [
+                               'weight' => 1,
+                               'settings' => [
+                                       'b' => 0.01,
+                                       'wikt' => 1,
+                                       '__default__' => 0.1,
+                               ]
+                       ],
+                       'recall' => [
+                               'weight' => 0.01,
+                       ],
+               ] );
+               $reordered = $scorer->reorder( $retval );
+               $this->assertEquals( [ 'wikt', 's', 'voy', 'b', 'broken' ], array_keys( $reordered ) );
+       }
+
+       public function testEnWikiExample() {
+               $retval = [
+                       'b' => $this->mockRS( 5000000 ),
+                       'wikt' => $this->mockRS( 1 ),
+                       'broken' => [],
+                       'voy' => $this->mockRS( 1000500 ),
+                       's' => $this->mockRS( 1020450 ),
+               ];
+               $hashConfig = new HashSearchConfig(
+                       [ 'CirrusSearchCrossProjectOrder' => 'wmf_enwiki' ],
+                       [ 'inherit' ]
+               );
+               $scorer = CrossProjectBlockScorerFactory::load( $hashConfig );
+               $reordered = $scorer->reorder( $retval );
+               $this->assertEquals( [ 'wikt', 's', 'voy', 'b', 'broken' ], array_keys( $reordered ) );
+       }
+
+       /**
+        * Build a ResultSet mock whose getTotalHits() returns $totalHits
+        * @param int $totalHits
+        * @return ResultSet
+        */
+       private function mockRS( $totalHits ) {
+               $rs = $this->getMockBuilder( 'CirrusSearch\Search\ResultSet' )
+                       ->disableOriginalConstructor()
+                       ->getMock();
+               $rs->expects( $this->any() )
+                       ->method( 'getTotalHits' )
+                       ->will( $this->returnValue( $totalHits ) );
+               return $rs;
+       }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/368771
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I9618b9e9bee51d91a9ec931725abcd39eb730b39
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to