DCausse has uploaded a new change for review.
https://gerrit.wikimedia.org/r/292574
Change subject: Fix popqual score for very small wikis
......................................................................
Fix popqual score for very small wikis
The log base is 1 + POPULARITY_MAX * maxDocs, which equals 1 when maxDocs is 0, and a logarithm with base 1 is undefined.
Bug: T136940
Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251
---
M includes/BuildDocument/SuggestScoring.php
M maintenance/updateSuggesterIndex.php
M tests/unit/SuggestScoringTest.php
3 files changed, 49 insertions(+), 17 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/74/292574/1
diff --git a/includes/BuildDocument/SuggestScoring.php
b/includes/BuildDocument/SuggestScoring.php
index 54d7cd6..ac62d7d 100644
--- a/includes/BuildDocument/SuggestScoring.php
+++ b/includes/BuildDocument/SuggestScoring.php
@@ -305,8 +305,14 @@
if ( $pop > self::POPULARITY_MAX ) {
$pop = 1;
} else {
- // @fixme: rough log scale by using maxDocs...
- $pop = log ( 1 + ( $pop * $this->maxDocs ), 1 + (
self::POPULARITY_MAX * $this->maxDocs ) );
+ $logBase = 1 + self::POPULARITY_MAX * $this->maxDocs;
+ // log₁(x) is undefined
+ if ( $logBase > 1 ) {
+ // @fixme: rough log scale by using maxDocs...
+ $pop = log ( 1 + ( $pop * $this->maxDocs ),
$logBase );
+ } else {
+ $pop = 0;
+ }
}
$score += $pop * self::POPULARITY_WEIGHT;
diff --git a/maintenance/updateSuggesterIndex.php
b/maintenance/updateSuggesterIndex.php
index 949e8ab..a199d89 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -76,6 +76,11 @@
private $indexIdentifier;
/**
+ * @var string the score method name to use.
+ */
+ private $scoreMethodName;
+
+ /**
* @var SuggestScoringMethod the score function to use.
*/
private $scoreMethod;
@@ -176,7 +181,8 @@
$wgCirrusSearchBannedPlugins,
$wgPoolCounterConf,
$wgCirrusSearchMasterTimeout,
- $wgCirrusSearchMaxShardsPerNode;
+ $wgCirrusSearchMaxShardsPerNode,
+ $wgCirrusSearchCompletionDefaultScore;
$this->masterTimeout = $this->getOption( 'masterTimeout',
$wgCirrusSearchMasterTimeout );
$this->indexTypeName = Connection::TITLE_SUGGEST_TYPE;
@@ -213,6 +219,10 @@
$this->utils->checkElasticsearchVersion();
$this->maxShardsPerNode = isset(
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] ) ?
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] : 'unlimited';
+
+ $this->scoreMethodName = $this->getOption( 'scoringMethod',
$wgCirrusSearchCompletionDefaultScore );
+ $this->scoreMethod =
SuggestScoringMethodFactory::getScoringMethod( $this->scoreMethodName );
+ $this->builder = new SuggestBuilder( $this->scoreMethod,
$this->withGeo );
try {
// If the version does not exist it's certainly because
nothing has been indexed.
@@ -503,20 +513,6 @@
}
private function indexData() {
- global $wgCirrusSearchCompletionDefaultScore;
- $scoreMethodName = $this->getOption( 'scoringMethod',
$wgCirrusSearchCompletionDefaultScore );
- if ( $this->scoreMethod == null ) {
- $this->scoreMethod =
SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName );
- }
- if ( $this->builder == null ) {
- // NOTE: the builder stores a batchId value to flag
- // documents indexed by this builder. Make sure to
- // reuse the same instance when building docs otherwise
- // the batchId might be regenerated and can cause data
- // loss when recycling the index.
- $this->builder = new SuggestBuilder(
$this->scoreMethod, $this->withGeo );
- }
-
// We build the suggestions by reading CONTENT and GENERAL
indices.
// This does not support extra indices like FILES on commons.
$sourceIndexTypes = array( Connection::CONTENT_INDEX_TYPE,
Connection::GENERAL_INDEX_TYPE );
diff --git a/tests/unit/SuggestScoringTest.php
b/tests/unit/SuggestScoringTest.php
index c0d6b71..eba25f9 100644
--- a/tests/unit/SuggestScoringTest.php
+++ b/tests/unit/SuggestScoringTest.php
@@ -3,6 +3,7 @@
namespace CirrusSearch;
use CirrusSearch\BuildDocument\QualityScore;
+use CirrusSearch\BuildDocument\PQScore;
/**
* test suggest scoring functions.
@@ -279,4 +280,33 @@
);
$this->assertEquals( QualityScore::SCORE_RANGE, $qs->score(
$page ), "With a zero page wiki the highest score is also " .
QualityScore::SCORE_RANGE );
}
+
+ public function testPopQualScoreRanking() {
+ $templates = array( 'Good' => 2, 'Bad' => 0.5 );
+ // array_merge, not +=: array union would drop 'Foo'/'Bar' (keys 0 and 1 already exist).
+ $all_templates = array_merge( array_keys( $templates ), array( 'Foo', 'Bar' ) );
+ $qs = new PQScore( $templates );
+ for ( $i = 0; $i < 1000; $i++ ) {
+ $maxDocs = rand( 0, 100 );
+ $qs->setMaxDocs( $maxDocs );
+ $tmpl = array();
+ for ( $j = rand( 0, count( $all_templates ) - 1 ); $j
>= 0; $j-- ) {
+ $tmpl[] = $all_templates[$j];
+ }
+ $page = array(
+ 'incoming_links' => rand( 0, 200 ),
+ 'external_link' => array_fill( 0, rand( 0, 200
), null ),
+ 'text_bytes' => (string) rand( 0, 230000 ),
+ 'heading' => array_fill( 0, rand( 0, 30 ), null
),
+ 'redirect' => array_fill( 0, rand( 0, 100 ),
null ),
+ 'template' => $tmpl
+ );
+
+ $score = $qs->score( $page );
+
+ $this->assertTrue( is_int( $score ), "Score is always
an integer" );
+ $this->assertTrue( $score >= 0, "Score is always
positive" );
+ $this->assertTrue( $score <= QualityScore::SCORE_RANGE,
"Score is always lower than QualityScore::SCORE_RANGE" );
+ }
+ }
}
--
To view, visit https://gerrit.wikimedia.org/r/292574
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits