jenkins-bot has submitted this change and it was merged.
Change subject: Fix popqual score for very small wikis
......................................................................
Fix popqual score for very small wikis
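A logarithm with base 1 is undefined. When the computed log base is not
greater than 1 (e.g. maxDocs is 0), skip the log scaling and use a
popularity of 0 instead.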
Bug: T136940
Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251
---
M includes/BuildDocument/SuggestScoring.php
M maintenance/updateSuggesterIndex.php
M tests/unit/SuggestScoringTest.php
3 files changed, 82 insertions(+), 28 deletions(-)
Approvals:
Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
EBernhardson: Looks good to me, approved
jenkins-bot: Verified
diff --git a/includes/BuildDocument/SuggestScoring.php
b/includes/BuildDocument/SuggestScoring.php
index 54d7cd6..ac62d7d 100644
--- a/includes/BuildDocument/SuggestScoring.php
+++ b/includes/BuildDocument/SuggestScoring.php
@@ -305,8 +305,14 @@
if ( $pop > self::POPULARITY_MAX ) {
$pop = 1;
} else {
- // @fixme: rough log scale by using maxDocs...
- $pop = log ( 1 + ( $pop * $this->maxDocs ), 1 + (
self::POPULARITY_MAX * $this->maxDocs ) );
+ $logBase = 1 + self::POPULARITY_MAX * $this->maxDocs;
+ // log₁(x) is undefined
+ if ( $logBase > 1 ) {
+ // @fixme: rough log scale by using maxDocs...
+ $pop = log ( 1 + ( $pop * $this->maxDocs ),
$logBase );
+ } else {
+ $pop = 0;
+ }
}
$score += $pop * self::POPULARITY_WEIGHT;
diff --git a/maintenance/updateSuggesterIndex.php
b/maintenance/updateSuggesterIndex.php
index 949e8ab..1f15196 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -76,6 +76,11 @@
private $indexIdentifier;
/**
+ * @var string the score method name to use.
+ */
+ private $scoreMethodName;
+
+ /**
* @var SuggestScoringMethod the score function to use.
*/
private $scoreMethod;
@@ -176,7 +181,8 @@
$wgCirrusSearchBannedPlugins,
$wgPoolCounterConf,
$wgCirrusSearchMasterTimeout,
- $wgCirrusSearchMaxShardsPerNode;
+ $wgCirrusSearchMaxShardsPerNode,
+ $wgCirrusSearchCompletionDefaultScore;
$this->masterTimeout = $this->getOption( 'masterTimeout',
$wgCirrusSearchMasterTimeout );
$this->indexTypeName = Connection::TITLE_SUGGEST_TYPE;
@@ -213,6 +219,10 @@
$this->utils->checkElasticsearchVersion();
$this->maxShardsPerNode = isset(
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] ) ?
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] : 'unlimited';
+
+ $this->scoreMethodName = $this->getOption( 'scoringMethod',
$wgCirrusSearchCompletionDefaultScore );
+ $this->scoreMethod =
SuggestScoringMethodFactory::getScoringMethod( $this->scoreMethodName );
+ $this->builder = new SuggestBuilder( $this->scoreMethod,
$this->withGeo );
try {
// If the version does not exist it's certainly because
nothing has been indexed.
@@ -503,20 +513,6 @@
}
private function indexData() {
- global $wgCirrusSearchCompletionDefaultScore;
- $scoreMethodName = $this->getOption( 'scoringMethod',
$wgCirrusSearchCompletionDefaultScore );
- if ( $this->scoreMethod == null ) {
- $this->scoreMethod =
SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName );
- }
- if ( $this->builder == null ) {
- // NOTE: the builder stores a batchId value to flag
- // documents indexed by this builder. Make sure to
- // reuse the same instance when building docs otherwise
- // the batchId might be regenerated and can cause data
- // loss when recycling the index.
- $this->builder = new SuggestBuilder(
$this->scoreMethod, $this->withGeo );
- }
-
// We build the suggestions by reading CONTENT and GENERAL
indices.
// This does not support extra indices like FILES on commons.
$sourceIndexTypes = array( Connection::CONTENT_INDEX_TYPE,
Connection::GENERAL_INDEX_TYPE );
@@ -574,7 +570,7 @@
$totalDocsToDump = $totalDocsInIndex;
$docsDumped = 0;
- $this->log( "Indexing $totalDocsToDump documents from
$sourceIndexType ($totalDocsInIndex in the index) with batchId:
{$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" );
+ $this->log( "Indexing $totalDocsToDump documents from
$sourceIndexType ($totalDocsInIndex in the index) with batchId:
{$this->builder->getBatchId()} and scoring method: {$this->scoreMethodName}\n"
);
$destinationType = $this->getIndex()->getType(
Connection::TITLE_SUGGEST_TYPE_NAME );
diff --git a/tests/unit/SuggestScoringTest.php
b/tests/unit/SuggestScoringTest.php
index c0d6b71..8122d8f 100644
--- a/tests/unit/SuggestScoringTest.php
+++ b/tests/unit/SuggestScoringTest.php
@@ -2,7 +2,9 @@
namespace CirrusSearch;
+use CirrusSearch\BuildDocument\IncomingLinksScoringMethod;
use CirrusSearch\BuildDocument\QualityScore;
+use CirrusSearch\BuildDocument\PQScore;
/**
* test suggest scoring functions.
@@ -27,8 +29,8 @@
$qs = new QualityScore();
$qs->setMaxDocs( 10000 );
for( $i = 0; $i < 1000; $i++ ) {
- $value = rand( 0, 1000000 );
- $norm = rand( 1, 1000000 );
+ $value = mt_rand( 0, 1000000 );
+ $norm = mt_rand( 1, 1000000 );
$score = $qs->scoreNorm( $value, $norm );
$this->assertLessThanOrEqual( 1, $score, "scoreNorm
cannot produce a score greater than 1" );
$this->assertGreaterThanOrEqual( 0, $score, "scoreNorm
cannot produce a score lower than 0" );
@@ -59,8 +61,8 @@
public function testQualityScoreBoostFunction() {
$qs = new QualityScore();
for( $i = 0; $i < 1000; $i++ ) {
- $score = (float) rand() / (float) mt_getrandmax();
- $boost = (float) rand( 0, 10000 ) / rand( 1, 10000 );
+ $score = (float) mt_rand() / (float) mt_getrandmax();
+ $boost = (float) mt_rand( 0, 10000 ) / mt_rand( 1,
10000 );
$res = $qs->boost( $score, $boost );
$this->assertLessThanOrEqual( 1, $score, "boost cannot
produce a score greater than 1" );
$this->assertGreaterThanOrEqual( 0, $score, "boost
cannot produce a score lower than 0" );
@@ -219,12 +221,12 @@
for( $i = 0; $i < 1000; $i++ ) {
$page = array(
- 'incoming_links' => rand( 0, 2^31-1 ),
- 'external_link' => array_fill( 0, rand( 1, 2000
), null ),
- 'text_bytes' => rand( 1, 400000 ),
- 'heading' => array_fill( 0, rand( 1, 1000 ),
null ),
- 'redirect' => array_fill( 0, rand( 1, 1000 ),
null ),
- 'template' => rand( 0, 1 ) == 1 ? array( 'Good'
) : array('Bad')
+ 'incoming_links' => mt_rand( 0, 2^31-1 ),
+ 'external_link' => array_fill( 0, mt_rand( 1,
2000 ), null ),
+ 'text_bytes' => mt_rand( 1, 400000 ),
+ 'heading' => array_fill( 0, mt_rand( 1, 1000 ),
null ),
+ 'redirect' => array_fill( 0, mt_rand( 1, 1000
), null ),
+ 'template' => mt_rand( 0, 1 ) == 1 ? array(
'Good' ) : array('Bad')
);
$this->assertGreaterThan( 0, $qs->score( $page ),
"Score is always greater than 0" );
$this->assertLessThan( QualityScore::SCORE_RANGE,
$qs->score( $page ), "Score is always lower than " . QualityScore::SCORE_RANGE
);
@@ -279,4 +281,54 @@
);
$this->assertEquals( QualityScore::SCORE_RANGE, $qs->score(
$page ), "With a zero page wiki the highest score is also " .
QualityScore::SCORE_RANGE );
}
+
+ public function testRobustness() {
+ $templates = array( 'Good' => 2, 'Bad' => 0.5 );
+ $all_templates = array_keys( $templates );
+ $all_templates += array( 'Foo', 'Bar' );
+ for ( $i = 0; $i < 5000; $i++ ) {
+ $scorers = array();
+ $scorers[] = new PQScore( array( 'Good' => 2, 'Bad' =>
0.5 ) );
+ $scorers[] = new QualityScore( array( 'Good' => 2,
'Bad' => 0.5 ) );
+ $scorers[] = new IncomingLinksScoringMethod();
+ $tmpl = array();
+ for ( $j = mt_rand( 0, count( $all_templates ) - 1 );
$j >= 0; $j-- ) {
+ $tmpl[] = $all_templates[$j];
+ }
+ $page = array();
+ $page['incoming_links'] = mt_rand( 0, 1 ) ? mt_rand( 0,
200 ) : null;
+ $page['external_link'] = $this->randomArray( 200 );
+ $page['text_bytes'] = mt_rand( 0, 1 ) ? (string)
mt_rand( 0, 230000 ) : null;
+ $page['heading'] = $this->randomArray( 30 );
+ $page['redirect'] = $this->randomArray( 100 );
+ $page['popularity_score'] = mt_rand( 0, 1 ) ? 1 /
mt_rand( 1, 1800000 ) : null;
+ $page['templates'] = mt_rand( 0, 1 ) ? $tmpl : null;
+
+ $maxDocs = mt_rand( 0, 100 );
+ foreach( $scorers as $scorer ) {
+ $scorer->setMaxDocs( $maxDocs );
+ $score = $scorer->score( $page );
+ $pagedebug = print_r( $page, true );
+
+ $this->assertTrue( is_int( $score ), "Score is
always an integer for " . get_class( $scorer ) . " with these values
$pagedebug" );
+ $this->assertTrue( $score >= 0, "Score is
always positive " . get_class( $scorer ) . " with these values $pagedebug" );
+ $this->assertTrue( $score <=
QualityScore::SCORE_RANGE, "Score is always lower than
QualityScore::SCORE_RANGE " . get_class( $scorer ) . " with these values
$pagedebug" );
+ }
+ }
+ }
+
+ /**
+ * @param $max integer max element in the array
+ * @return array|null randomly null or an array of size [0, $max]
+ */
+ private function randomArray( $max ) {
+ if ( mt_rand( 0, 1 ) ) {
+ $size = mt_rand( 0, $max );
+ if ( $size === 0 ) {
+ return array();
+ }
+ return array_fill( 0, $size, null );
+ }
+ return null;
+ }
}
--
To view, visit https://gerrit.wikimedia.org/r/292574
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits