jenkins-bot has submitted this change and it was merged. (
https://gerrit.wikimedia.org/r/368771 )
Change subject: Refactor ordering of crossproject blocks
......................................................................
Refactor ordering of crossproject blocks
to allow fine-grained configuration.
Introduce yet another profile system for ordering cross project
blocks. It allows fine-tuning the ordering of the blocks based on
different scoring implementations:
- recall
- static
- random
and also permits combining them using a composite implementation
(weighted sum).
It should then be easier to add other implementations such as a
scoring method based on the density of the highlighted snippets.
Bug: T171803
Change-Id: I9618b9e9bee51d91a9ec931725abcd39eb730b39
---
M CirrusSearch.php
M autoload.php
M includes/InterwikiSearcher.php
A includes/Search/CrossProjectBlockScorer.php
A profiles/CrossProjectBlockScorerProfiles.config.php
A profiles/CrossProjectBlockScorerProfiles.php
A tests/unit/Search/CrossProjectBlockScorerTest.php
7 files changed, 389 insertions(+), 32 deletions(-)
Approvals:
Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
EBernhardson: Looks good to me, approved
jenkins-bot: Verified
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 4e471d1..ce94339 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -27,6 +27,7 @@
require_once __DIR__ . "/profiles/SimilarityProfiles.php";
require_once __DIR__ . "/profiles/SaneitizeProfiles.php";
require_once __DIR__ . "/profiles/FullTextQueryBuilderProfiles.config.php";
+require_once __DIR__ . "/profiles/CrossProjectBlockScorerProfiles.config.php";
$wgExtensionCredits['other'][] = [
'path' => __FILE__,
diff --git a/autoload.php b/autoload.php
index 3e5f32d..6b8cdda 100644
--- a/autoload.php
+++ b/autoload.php
@@ -34,6 +34,7 @@
'CirrusSearch\\CompletionRequestLog' => __DIR__ .
'/includes/CompletionRequestLog.php',
'CirrusSearch\\CompletionSuggester' => __DIR__ .
'/includes/CompletionSuggester.php',
'CirrusSearch\\Connection' => __DIR__ . '/includes/Connection.php',
+ 'CirrusSearch\\CrossProjectBlockScorerProfiles' => __DIR__ .
'/profiles/CrossProjectBlockScorerProfiles.php',
'CirrusSearch\\DataSender' => __DIR__ . '/includes/DataSender.php',
'CirrusSearch\\Dump' => __DIR__ . '/includes/Dump.php',
'CirrusSearch\\ElasticaErrorHandler' => __DIR__ .
'/includes/ElasticaErrorHandler.php',
@@ -150,6 +151,9 @@
'CirrusSearch\\Search\\BoostTemplatesFunctionScoreBuilder' => __DIR__ .
'/includes/Search/RescoreBuilders.php',
'CirrusSearch\\Search\\CirrusIndexField' => __DIR__ .
'/includes/Search/CirrusIndexField.php',
'CirrusSearch\\Search\\CirrusSearchIndexFieldFactory' => __DIR__ .
'/includes/Search/CirrusSearchIndexFieldFactory.php',
+ 'CirrusSearch\\Search\\CompositeCrossProjectBlockScorer' => __DIR__ .
'/includes/Search/CrossProjectBlockScorer.php',
+ 'CirrusSearch\\Search\\CrossProjectBlockScorer' => __DIR__ .
'/includes/Search/CrossProjectBlockScorer.php',
+ 'CirrusSearch\\Search\\CrossProjectBlockScorerFactory' => __DIR__ .
'/includes/Search/CrossProjectBlockScorer.php',
'CirrusSearch\\Search\\CustomFieldFunctionScoreBuilder' => __DIR__ .
'/includes/Search/RescoreBuilders.php',
'CirrusSearch\\Search\\DatetimeIndexField' => __DIR__ .
'/includes/Search/DatetimeIndexField.php',
'CirrusSearch\\Search\\EmptyResultSet' => __DIR__ .
'/includes/Search/EmptyResultSet.php',
@@ -175,6 +179,8 @@
'CirrusSearch\\Search\\NumberIndexField' => __DIR__ .
'/includes/Search/NumberIndexField.php',
'CirrusSearch\\Search\\OpeningTextIndexField' => __DIR__ .
'/includes/Search/OpeningTextIndexField.php',
'CirrusSearch\\Search\\PreferRecentFunctionScoreBuilder' => __DIR__ .
'/includes/Search/RescoreBuilders.php',
+ 'CirrusSearch\\Search\\RandomCrossProjectBlockScorer' => __DIR__ .
'/includes/Search/CrossProjectBlockScorer.php',
+ 'CirrusSearch\\Search\\RecallCrossProjectBlockScorer' => __DIR__ .
'/includes/Search/CrossProjectBlockScorer.php',
'CirrusSearch\\Search\\RescoreBuilder' => __DIR__ .
'/includes/Search/RescoreBuilders.php',
'CirrusSearch\\Search\\Result' => __DIR__ .
'/includes/Search/Result.php',
'CirrusSearch\\Search\\ResultSet' => __DIR__ .
'/includes/Search/ResultSet.php',
@@ -185,6 +191,7 @@
'CirrusSearch\\Search\\SearchMetricsProvider' => __DIR__ .
'/includes/Search/SearchMetricsProvider.php',
'CirrusSearch\\Search\\ShortTextIndexField' => __DIR__ .
'/includes/Search/ShortTextIndexField.php',
'CirrusSearch\\Search\\SourceTextIndexField' => __DIR__ .
'/includes/Search/SourceTextIndexField.php',
+ 'CirrusSearch\\Search\\StaticCrossProjectBlockScorer' => __DIR__ .
'/includes/Search/CrossProjectBlockScorer.php',
'CirrusSearch\\Search\\TeamDraftInterleaver' => __DIR__ .
'/includes/Search/TeamDraftInterleaver.php',
'CirrusSearch\\Search\\TextIndexField' => __DIR__ .
'/includes/Search/TextIndexField.php',
'CirrusSearch\\Search\\TitleHelper' => __DIR__ .
'/includes/Search/TitleHelper.php',
diff --git a/includes/InterwikiSearcher.php b/includes/InterwikiSearcher.php
index 09527d1..d9b241d 100644
--- a/includes/InterwikiSearcher.php
+++ b/includes/InterwikiSearcher.php
@@ -2,6 +2,7 @@
namespace CirrusSearch;
+use CirrusSearch\Search\CrossProjectBlockScorerFactory;
use CirrusSearch\Search\FullTextResultsType;
use CirrusSearch\Search\ResultSet;
use CirrusSearch\Search\SearchContext;
@@ -153,38 +154,8 @@
return $retval;
}
- switch ( $this->config->get( 'CirrusSearchCrossProjectOrder' )
) {
- case 'recall':
- uasort( $retval, function ( $a, $b ) {
- return $b->getTotalHits() - $a->getTotalHits();
- } );
- return $retval;
- case 'random':
- // reset the random number generator
- // take the first 8 chars from the md5 to build a uint32
- // and to prevent hexdec from returning floats
- mt_srand( hexdec( substr( Util::generateIdentToken(),
0, 8 ) ) );
- $sortKeys = array_map(
- function () {
- return mt_rand();
- },
- $retval
- );
- // "Randomly" sort crossproject results
- // Should give the same order for the same identity
- array_multisort( $sortKeys, SORT_ASC, $retval );
- return $retval;
- case 'static':
- return $retval;
- default:
- LoggerFactory::getInstance( 'CirrusSearch' )->warning(
- 'wgCirrusSearchCrossProjectOrder is set to ' .
- 'unkown value {invalid_order} using static ' .
- 'instead.',
- [ 'invalid_order' => $this->config->get(
'CirrusSearchCrossProjectOrder' ) ]
- );
- return $retval;
- }
+ return CrossProjectBlockScorerFactory::load( $this->config )
+ ->reorder( $retval );
}
/**
diff --git a/includes/Search/CrossProjectBlockScorer.php
b/includes/Search/CrossProjectBlockScorer.php
new file mode 100644
index 0000000..f8ba4b8
--- /dev/null
+++ b/includes/Search/CrossProjectBlockScorer.php
@@ -0,0 +1,168 @@
+<?php
+
+namespace CirrusSearch\Search;
+
+use CirrusSearch\Util;
+use CirrusSearch\SearchConfig;
+
+/**
+ * Base class for scorers that rank blocks of crossproject (interwiki)
+ * search results.
+ *
+ * Subclasses implement score(); reorder() uses those scores to sort the
+ * blocks, highest score first.
+ */
+abstract class CrossProjectBlockScorer {
+	/**
+	 * @param array $settings scorer specific settings, unused by the base class
+	 */
+	public function __construct( array $settings ) {
+	}
+
+	/**
+	 * Compute a score for a given block of crossproject search results
+	 * @param string $prefix key of the block in the array given to reorder()
+	 * @param ResultSet $results
+	 * @return float the score for this block
+	 */
+	abstract public function score( $prefix, ResultSet $results );
+
+	/**
+	 * Reorder crossproject blocks by descending score.
+	 *
+	 * Entries that are not a ResultSet are given a score of -1.0.
+	 * Blocks with the same score keep their relative input order.
+	 *
+	 * @param array $resultsets array of ResultSet or empty array if the
+	 *  search was disabled, keyed by prefix
+	 * @return array $resultsets reordered
+	 */
+	public function reorder( array $resultsets ) {
+		$sortKeys = [];
+		foreach ( $resultsets as $pref => $results ) {
+			if ( $results instanceof ResultSet ) {
+				$sortKeys[] = $this->score( $pref, $results );
+			} else {
+				$sortKeys[] = -1.0;
+			}
+		}
+		// array_multisort breaks ties in one array by comparing the next
+		// one. Without an explicit tie-breaker equal scores would be
+		// resolved by comparing the result sets themselves; the unique
+		// input positions make the order of equal scores deterministic
+		// and keep the result sets out of the comparison.
+		$positions = array_keys( $sortKeys );
+		array_multisort( $sortKeys, SORT_DESC, $positions, SORT_ASC, $resultsets );
+		return $resultsets;
+	}
+}
+
+/**
+ * Factory that reads cirrus config and builds a CrossProjectBlockScorer
+ */
+class CrossProjectBlockScorerFactory {
+ /**
+ * Build the scorer for the profile named by CirrusSearchCrossProjectOrder.
+ *
+ * The profile is looked up by that name in
+ * CirrusSearchCrossProjectBlockScorerProfiles; its 'type' selects the
+ * scorer and its optional 'settings' (default: empty array) are passed
+ * to the scorer.
+ *
+ * @param SearchConfig $config
+ * @return CrossProjectBlockScorer
+ * @throws \RuntimeException if the profile is missing or empty, has no
+ *  'type', or names a type loadScorer() does not know
+ */
+ public static function load( SearchConfig $config ) {
+ $profileName = $config->get( 'CirrusSearchCrossProjectOrder' );
+ $profile = $config->getElement(
'CirrusSearchCrossProjectBlockScorerProfiles', $profileName );
+ if ( !$profile ) {
+ throw new \RuntimeException( 'Unknown
CrossProjectBlockScorer profile : ' . $profileName );
+ }
+ if ( !isset( $profile['type'] ) ) {
+ throw new \RuntimeException( "Invalid
CrossProjectBlockScorer profile $profileName, 'type' must be set" );
+ }
+ return static::loadScorer( $profile['type'], isset (
$profile['settings'] ) ? $profile['settings'] : [] );
+ }
+
+ /**
+ * Instantiate the scorer implementation registered for $type.
+ *
+ * @param string $type one of 'composite', 'random', 'recall', 'static'
+ * @param array $config settings handed to the scorer constructor
+ * @return CrossProjectBlockScorer
+ * @throws \RuntimeException if $type is not one of the known types
+ */
+ public static function loadScorer( $type, array $config ) {
+ switch ( $type ) {
+ case 'composite':
+ return new CompositeCrossProjectBlockScorer( $config );
+ case 'random':
+ return new RandomCrossProjectBlockScorer( $config );
+ case 'recall':
+ return new RecallCrossProjectBlockScorer( $config );
+ case 'static':
+ return new StaticCrossProjectBlockScorer( $config );
+ default:
+ throw new \RuntimeException( 'Unknown
CrossProjectBlockScorer type : ' . $type );
+ }
+ }
+}
+
+/**
+ * Scores blocks with pseudo-random numbers drawn from a generator that is
+ * seeded from the ident token, so the order is random but consistent for
+ * a single user.
+ */
+class RandomCrossProjectBlockScorer extends CrossProjectBlockScorer {
+	/**
+	 * Seeds the mt_rand() generator as a side effect.
+	 *
+	 * @param array $settings unused by this scorer
+	 */
+	public function __construct( array $settings ) {
+		parent::__construct( $settings );
+		// The first 8 hex chars of the token are used as the seed.
+		$tokenPrefix = substr( Util::generateIdentToken(), 0, 8 );
+		$seed = hexdec( $tokenPrefix );
+		mt_srand( $seed );
+	}
+
+	/**
+	 * @param string $prefix ignored
+	 * @param ResultSet $results ignored
+	 * @return float next value of the seeded generator
+	 */
+	public function score( $prefix, ResultSet $results ) {
+		$draw = mt_rand();
+		return (float)$draw;
+	}
+}
+
+/**
+ * Scores a block by its recall: log(total_hits + 2)
+ */
+class RecallCrossProjectBlockScorer extends CrossProjectBlockScorer {
+	/**
+	 * @param string $prefix ignored
+	 * @param ResultSet $results block whose total hits drive the score
+	 * @return float natural logarithm of (total hits + 2)
+	 */
+	public function score( $prefix, ResultSet $results ) {
+		$totalHits = $results->getTotalHits();
+		return log( $totalHits + 2 );
+	}
+}
+
+/**
+ * Gives each wiki a fixed score taken from a static configuration.
+ * Prefixes without an entry use the '__default__' score.
+ */
+class StaticCrossProjectBlockScorer extends CrossProjectBlockScorer {
+	/**
+	 * Score per prefix, always containing a '__default__' entry
+	 */
+	private $staticScores;
+
+	/**
+	 * @param array $settings map of prefix => score; '__default__' is
+	 *  set to 1 when the settings do not provide it
+	 */
+	public function __construct( array $settings ) {
+		parent::__construct( $settings );
+		$fallback = [ '__default__' => 1 ];
+		// Array union: entries from $settings win over the fallback.
+		$this->staticScores = $settings + $fallback;
+	}
+
+	/**
+	 * @param string $prefix
+	 * @param ResultSet $results ignored
+	 * @return float configured score for $prefix, or the default one
+	 */
+	public function score( $prefix, ResultSet $results ) {
+		if ( isset( $this->staticScores[$prefix] ) ) {
+			return $this->staticScores[$prefix];
+		}
+		return $this->staticScores['__default__'];
+	}
+}
+
+/**
+ * Combines several sub-scorers into one score using a weighted sum.
+ */
+class CompositeCrossProjectBlockScorer extends CrossProjectBlockScorer {
+	/**
+	 * List of [ 'weight' => ..., 'scorer' => ... ] entries
+	 */
+	private $scorers = [];
+
+	/**
+	 * @param array $settings map of scorer type => sub-scorer config,
+	 *  where the config may hold 'weight' (default 1) and 'settings'
+	 *  (default empty array, passed to the sub-scorer)
+	 */
+	public function __construct( array $settings ) {
+		parent::__construct( $settings );
+		foreach ( $settings as $scorerType => $scorerConfig ) {
+			$weight = 1;
+			if ( isset( $scorerConfig['weight'] ) ) {
+				$weight = $scorerConfig['weight'];
+			}
+			$nestedSettings = [];
+			if ( isset( $scorerConfig['settings'] ) ) {
+				$nestedSettings = $scorerConfig['settings'];
+			}
+			$this->scorers[] = [
+				'weight' => $weight,
+				'scorer' => CrossProjectBlockScorerFactory::loadScorer( $scorerType, $nestedSettings ),
+			];
+		}
+	}
+
+	/**
+	 * @param string $prefix
+	 * @param ResultSet $results
+	 * @return float sum of weight * sub-score over all sub-scorers
+	 */
+	public function score( $prefix, ResultSet $results ) {
+		$total = 0;
+		foreach ( $this->scorers as $entry ) {
+			$subScore = $entry['scorer']->score( $prefix, $results );
+			$total += $entry['weight'] * $subScore;
+		}
+		return $total;
+	}
+}
diff --git a/profiles/CrossProjectBlockScorerProfiles.config.php
b/profiles/CrossProjectBlockScorerProfiles.config.php
new file mode 100644
index 0000000..7d1eb2f
--- /dev/null
+++ b/profiles/CrossProjectBlockScorerProfiles.config.php
@@ -0,0 +1,73 @@
+<?php
+
+namespace CirrusSearch;
+
+/**
+ * CirrusSearch - List of CrossProjectBlockScorer profiles used to order
+ * the blocks of crossproject search results.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+/**
+ * Profiles to control ordering of blocks of CrossProject searchresults.
+ *
+ * key is the profile name used in wgCirrusSearchCrossProjectOrder
+ * value is array where
+ * - 'type' the scorer to use (static, recall, random, composite)
+ * - settings is scorer specific config
+ */
+$wgCirrusSearchCrossProjectBlockScorerProfiles = [
+ // static ordering: scores are read from the 'settings' key,
+ // with one score (value) per wiki prefix (key)
+ 'static' => [
+ 'type' => 'static',
+ ],
+
+ // ordered by recall (total hits)
+ 'recall' => [
+ 'type' => 'recall',
+ ],
+
+ // randomly ordered
+ 'random' => [
+ 'type' => 'random',
+ ],
+
+ // Example profile for WMF english wikipedia, a weighted sum of
+ // a static score (weight 1) and a recall score (weight 0.01):
+ // - wiktionary always first
+ // - wikibooks always last
+ // - others are ordered by recall
+ // wikt will be : (1 * 1) + (0.01 * log(total_hits + 2))
+ // wikibooks : (1 * 0.01) + (0.01 * log(total_hits + 2))
+ // others : (1 * 0.1) + (0.01 * log(total_hits + 2))
+ 'wmf_enwiki' => [
+ 'type' => 'composite',
+ 'settings' => [
+ 'recall' => [
+ 'weight' => 0.01,
+ ],
+ 'static' => [
+ 'weight' => 1,
+ 'settings' => [
+ '__default__' => 0.1,
+ 'wikt' => 1,
+ 'b' => 0.01,
+ ],
+ ],
+ ],
+ ],
+];
diff --git a/profiles/CrossProjectBlockScorerProfiles.php
b/profiles/CrossProjectBlockScorerProfiles.php
new file mode 100644
index 0000000..d433d1a
--- /dev/null
+++ b/profiles/CrossProjectBlockScorerProfiles.php
@@ -0,0 +1,33 @@
+<?php
+
+namespace CirrusSearch;
+
+use WebRequest;
+
+/**
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+ * http://www.gnu.org/copyleft/gpl.html
+ */
+
+/**
+ * Lets a web request select the crossproject block ordering profile.
+ */
+class CrossProjectBlockScorerProfiles {
+	/**
+	 * Override wgCirrusSearchCrossProjectOrder with the value of the
+	 * 'cirrusCrossProjectOrderProfile' request parameter, but only when
+	 * it names a profile defined in
+	 * wgCirrusSearchCrossProjectBlockScorerProfiles. A missing or unknown
+	 * value leaves the configuration untouched.
+	 *
+	 * @param WebRequest $request
+	 */
+	public static function overrideOptions( WebRequest $request ) {
+		// The profiles global is spelled "...BlockScorerProfiles"; it must
+		// match the name assigned in CrossProjectBlockScorerProfiles.config.php,
+		// otherwise the isset() below can never succeed.
+		global $wgCirrusSearchCrossProjectBlockScorerProfiles,
+			$wgCirrusSearchCrossProjectOrder;
+		$profile = $request->getVal( 'cirrusCrossProjectOrderProfile' );
+		if ( $profile !== null
+			&& isset( $wgCirrusSearchCrossProjectBlockScorerProfiles[$profile] )
+		) {
+			$wgCirrusSearchCrossProjectOrder = $profile;
+		}
+	}
+}
diff --git a/tests/unit/Search/CrossProjectBlockScorerTest.php
b/tests/unit/Search/CrossProjectBlockScorerTest.php
new file mode 100644
index 0000000..8a4845c
--- /dev/null
+++ b/tests/unit/Search/CrossProjectBlockScorerTest.php
@@ -0,0 +1,104 @@
+<?php
+
+namespace CirrusSearch\Search;
+
+use CirrusSearch\HashSearchConfig;
+
+/**
+ * Tests for the CrossProjectBlockScorer implementations.
+ *
+ * Each test reorders a set of blocks keyed by prefix; the 'broken' entry
+ * is an empty array instead of a ResultSet.
+ */
+class CrossProjectBlockScorerTest extends \PHPUnit_Framework_TestCase {
+	/**
+	 * Blocks are ordered by total hits, the non-ResultSet entry last.
+	 */
+	public function testRecallScorer() {
+		$retval = [
+			'b' => $this->mockRS( 5 ),
+			'wikt' => $this->mockRS( 10 ),
+			'broken' => [],
+			'voy' => $this->mockRS( 15 ),
+		];
+		$scorer = new RecallCrossProjectBlockScorer( [] );
+		$reordered = $scorer->reorder( $retval );
+		// assertEquals takes the expected value first
+		$this->assertEquals( [ 'voy', 'wikt', 'b', 'broken' ], array_keys( $reordered ) );
+	}
+
+	/**
+	 * Blocks are ordered by their configured static score, whatever
+	 * their total hits.
+	 */
+	public function testStatic() {
+		$retval = [
+			'b' => $this->mockRS( 5 ),
+			'wikt' => $this->mockRS( 1 ),
+			'broken' => [],
+			'voy' => $this->mockRS( 2 ),
+		];
+		$scorer = new StaticCrossProjectBlockScorer( [
+			'b' => 0.1,
+			'wikt' => 0.2,
+			'voy' => 0.3,
+			'__default__' => 0.01,
+		] );
+		$reordered = $scorer->reorder( $retval );
+		$this->assertEquals( [ 'voy', 'wikt', 'b', 'broken' ], array_keys( $reordered ) );
+	}
+
+	/**
+	 * The random order itself is not asserted, only that reordering
+	 * neither drops nor adds blocks.
+	 */
+	public function testRandom() {
+		$retval = [
+			'b' => $this->mockRS( 5 ),
+			'wikt' => $this->mockRS( 1 ),
+			'broken' => [],
+			'voy' => $this->mockRS( 2 ),
+		];
+		$scorer = new RandomCrossProjectBlockScorer( [] );
+		$reordered = $scorer->reorder( $retval );
+		// not sure how to test randomness...
+		// let's just make sure that all keys are here
+		$this->assertCount( count( $retval ), $reordered );
+		foreach ( $retval as $k => $v ) {
+			$this->assertArrayHasKey( $k, $reordered );
+		}
+	}
+
+	/**
+	 * Static scores dominate, recall only orders blocks that share the
+	 * same static score.
+	 */
+	public function testComposite() {
+		$retval = [
+			'b' => $this->mockRS( 5000000 ),
+			'wikt' => $this->mockRS( 1 ),
+			'broken' => [],
+			'voy' => $this->mockRS( 1000500 ),
+			's' => $this->mockRS( 1020450 ),
+		];
+		$scorer = new CompositeCrossProjectBlockScorer( [
+			'static' => [
+				'weight' => 1,
+				'settings' => [
+					'b' => 0.01,
+					'wikt' => 1,
+					'__default__' => 0.1,
+				]
+			],
+			'recall' => [
+				'weight' => 0.01,
+			],
+		] );
+		$reordered = $scorer->reorder( $retval );
+		$this->assertEquals( [ 'wikt', 's', 'voy', 'b', 'broken' ], array_keys( $reordered ) );
+	}
+
+	/**
+	 * Same expectation as testComposite, but with the scorer built by
+	 * the factory from the 'wmf_enwiki' profile.
+	 */
+	public function testEnWikiExample() {
+		$retval = [
+			'b' => $this->mockRS( 5000000 ),
+			'wikt' => $this->mockRS( 1 ),
+			'broken' => [],
+			'voy' => $this->mockRS( 1000500 ),
+			's' => $this->mockRS( 1020450 ),
+		];
+		$hashConfig = new HashSearchConfig(
+			[ 'CirrusSearchCrossProjectOrder' => 'wmf_enwiki' ],
+			[ 'inherit' ]
+		);
+		$scorer = CrossProjectBlockScorerFactory::load( $hashConfig );
+		$reordered = $scorer->reorder( $retval );
+		$this->assertEquals( [ 'wikt', 's', 'voy', 'b', 'broken' ], array_keys( $reordered ) );
+	}
+
+	/**
+	 * Build a ResultSet mock whose getTotalHits() returns $totalHits.
+	 *
+	 * @param int $totalHits
+	 * @return ResultSet mocked result set
+	 */
+	private function mockRS( $totalHits ) {
+		$rs = $this->getMockBuilder( 'CirrusSearch\Search\ResultSet' )
+			->disableOriginalConstructor()
+			->getMock();
+		$rs->expects( $this->any() )
+			->method( 'getTotalHits' )
+			->will( $this->returnValue( $totalHits ) );
+		return $rs;
+	}
+}
--
To view, visit https://gerrit.wikimedia.org/r/368771
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I9618b9e9bee51d91a9ec931725abcd39eb730b39
Gerrit-PatchSet: 3
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits