jenkins-bot has submitted this change and it was merged. Change subject: Fix popqual score for very small wikis ......................................................................
Fix popqual score for very small wikis log base 1 is never defined. Bug: T136940 Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251 --- M includes/BuildDocument/SuggestScoring.php M maintenance/updateSuggesterIndex.php M tests/unit/SuggestScoringTest.php 3 files changed, 82 insertions(+), 28 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve EBernhardson: Looks good to me, approved jenkins-bot: Verified diff --git a/includes/BuildDocument/SuggestScoring.php b/includes/BuildDocument/SuggestScoring.php index 54d7cd6..ac62d7d 100644 --- a/includes/BuildDocument/SuggestScoring.php +++ b/includes/BuildDocument/SuggestScoring.php @@ -305,8 +305,14 @@ if ( $pop > self::POPULARITY_MAX ) { $pop = 1; } else { - // @fixme: rough log scale by using maxDocs... - $pop = log ( 1 + ( $pop * $this->maxDocs ), 1 + ( self::POPULARITY_MAX * $this->maxDocs ) ); + $logBase = 1 + self::POPULARITY_MAX * $this->maxDocs; + // log₁(x) is undefined + if ( $logBase > 1 ) { + // @fixme: rough log scale by using maxDocs... + $pop = log ( 1 + ( $pop * $this->maxDocs ), $logBase ); + } else { + $pop = 0; + } } $score += $pop * self::POPULARITY_WEIGHT; diff --git a/maintenance/updateSuggesterIndex.php b/maintenance/updateSuggesterIndex.php index 949e8ab..1f15196 100644 --- a/maintenance/updateSuggesterIndex.php +++ b/maintenance/updateSuggesterIndex.php @@ -76,6 +76,11 @@ private $indexIdentifier; /** + * @var string the score method name to use. + */ + private $scoreMethodName; + + /** * @var SuggestScoringMethod the score function to use. */ private $scoreMethod; @@ -176,7 +181,8 @@ $wgCirrusSearchBannedPlugins, $wgPoolCounterConf, $wgCirrusSearchMasterTimeout, - $wgCirrusSearchMaxShardsPerNode; + $wgCirrusSearchMaxShardsPerNode, + $wgCirrusSearchCompletionDefaultScore; $this->masterTimeout = $this->getOption( 'masterTimeout', $wgCirrusSearchMasterTimeout ); $this->indexTypeName = Connection::TITLE_SUGGEST_TYPE; @@ -213,6 +219,10 @@ $this->utils->checkElasticsearchVersion(); $this->maxShardsPerNode = isset( $wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] ) ? $wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] : 'unlimited'; + + $this->scoreMethodName = $this->getOption( 'scoringMethod', $wgCirrusSearchCompletionDefaultScore ); + $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod( $this->scoreMethodName ); + $this->builder = new SuggestBuilder( $this->scoreMethod, $this->withGeo ); try { // If the version does not exist it's certainly because nothing has been indexed. @@ -503,20 +513,6 @@ } private function indexData() { - global $wgCirrusSearchCompletionDefaultScore; - $scoreMethodName = $this->getOption( 'scoringMethod', $wgCirrusSearchCompletionDefaultScore ); - if ( $this->scoreMethod == null ) { - $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName ); - } - if ( $this->builder == null ) { - // NOTE: the builder stores a batchId value to flag - // documents indexed by this builder. Make sure to - // reuse the same instance when building docs otherwise - // the batchId might be regenerated and can cause data - // loss when recycling the index. - $this->builder = new SuggestBuilder( $this->scoreMethod, $this->withGeo ); - } - // We build the suggestions by reading CONTENT and GENERAL indices. // This does not support extra indices like FILES on commons. $sourceIndexTypes = array( Connection::CONTENT_INDEX_TYPE, Connection::GENERAL_INDEX_TYPE ); @@ -574,7 +570,7 @@ $totalDocsToDump = $totalDocsInIndex; $docsDumped = 0; - $this->log( "Indexing $totalDocsToDump documents from $sourceIndexType ($totalDocsInIndex in the index) with batchId: {$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" ); + $this->log( "Indexing $totalDocsToDump documents from $sourceIndexType ($totalDocsInIndex in the index) with batchId: {$this->builder->getBatchId()} and scoring method: {$this->scoreMethodName}\n" ); $destinationType = $this->getIndex()->getType( Connection::TITLE_SUGGEST_TYPE_NAME ); diff --git a/tests/unit/SuggestScoringTest.php b/tests/unit/SuggestScoringTest.php index c0d6b71..8122d8f 100644 --- a/tests/unit/SuggestScoringTest.php +++ b/tests/unit/SuggestScoringTest.php @@ -2,7 +2,9 @@ namespace CirrusSearch; +use CirrusSearch\BuildDocument\IncomingLinksScoringMethod; use CirrusSearch\BuildDocument\QualityScore; +use CirrusSearch\BuildDocument\PQScore; /** * test suggest scoring functions. @@ -27,8 +29,8 @@ $qs = new QualityScore(); $qs->setMaxDocs( 10000 ); for( $i = 0; $i < 1000; $i++ ) { - $value = rand( 0, 1000000 ); - $norm = rand( 1, 1000000 ); + $value = mt_rand( 0, 1000000 ); + $norm = mt_rand( 1, 1000000 ); $score = $qs->scoreNorm( $value, $norm ); $this->assertLessThanOrEqual( 1, $score, "scoreNorm cannot produce a score greater than 1" ); $this->assertGreaterThanOrEqual( 0, $score, "scoreNorm cannot produce a score lower than 0" ); @@ -59,8 +61,8 @@ public function testQualityScoreBoostFunction() { $qs = new QualityScore(); for( $i = 0; $i < 1000; $i++ ) { - $score = (float) rand() / (float) mt_getrandmax(); - $boost = (float) rand( 0, 10000 ) / rand( 1, 10000 ); + $score = (float) mt_rand() / (float) mt_getrandmax(); + $boost = (float) mt_rand( 0, 10000 ) / mt_rand( 1, 10000 ); $res = $qs->boost( $score, $boost ); $this->assertLessThanOrEqual( 1, $score, "boost cannot produce a score greater than 1" ); $this->assertGreaterThanOrEqual( 0, $score, "boost cannot produce a score lower than 0" ); @@ -219,12 +221,12 @@ for( $i = 0; $i < 1000; $i++ ) { $page = array( - 'incoming_links' => rand( 0, 2^31-1 ), - 'external_link' => array_fill( 0, rand( 1, 2000 ), null ), - 'text_bytes' => rand( 1, 400000 ), - 'heading' => array_fill( 0, rand( 1, 1000 ), null ), - 'redirect' => array_fill( 0, rand( 1, 1000 ), null ), - 'template' => rand( 0, 1 ) == 1 ? array( 'Good' ) : array('Bad') + 'incoming_links' => mt_rand( 0, 2^31-1 ), + 'external_link' => array_fill( 0, mt_rand( 1, 2000 ), null ), + 'text_bytes' => mt_rand( 1, 400000 ), + 'heading' => array_fill( 0, mt_rand( 1, 1000 ), null ), + 'redirect' => array_fill( 0, mt_rand( 1, 1000 ), null ), + 'template' => mt_rand( 0, 1 ) == 1 ? array( 'Good' ) : array('Bad') ); $this->assertGreaterThan( 0, $qs->score( $page ), "Score is always greater than 0" ); $this->assertLessThan( QualityScore::SCORE_RANGE, $qs->score( $page ), "Score is always lower than " . QualityScore::SCORE_RANGE ); @@ -279,4 +281,54 @@ ); $this->assertEquals( QualityScore::SCORE_RANGE, $qs->score( $page ), "With a zero page wiki the highest score is also " . QualityScore::SCORE_RANGE ); } + + public function testRobustness() { + $templates = array( 'Good' => 2, 'Bad' => 0.5 ); + $all_templates = array_keys( $templates ); + $all_templates += array( 'Foo', 'Bar' ); + for ( $i = 0; $i < 5000; $i++ ) { + $scorers = array(); + $scorers[] = new PQScore( array( 'Good' => 2, 'Bad' => 0.5 ) ); + $scorers[] = new QualityScore( array( 'Good' => 2, 'Bad' => 0.5 ) ); + $scorers[] = new IncomingLinksScoringMethod(); + $tmpl = array(); + for ( $j = mt_rand( 0, count( $all_templates ) - 1 ); $j >= 0; $j-- ) { + $tmpl[] = $all_templates[$j]; + } + $page = array(); + $page['incoming_links'] = mt_rand( 0, 1 ) ? mt_rand( 0, 200 ) : null; + $page['external_link'] = $this->randomArray( 200 ); + $page['text_bytes'] = mt_rand( 0, 1 ) ? (string) mt_rand( 0, 230000 ) : null; + $page['heading'] = $this->randomArray( 30 ); + $page['redirect'] = $this->randomArray( 100 ); + $page['popularity_score'] = mt_rand( 0, 1 ) ? 1 / mt_rand( 1, 1800000 ) : null; + $page['templates'] = mt_rand( 0, 1 ) ? $tmpl : null; + + $maxDocs = mt_rand( 0, 100 ); + foreach( $scorers as $scorer ) { + $scorer->setMaxDocs( $maxDocs ); + $score = $scorer->score( $page ); + $pagedebug = print_r( $page, true ); + + $this->assertTrue( is_int( $score ), "Score is always an integer for " . get_class( $scorer ) . " with these values $pagedebug" ); + $this->assertTrue( $score >= 0, "Score is always positive " . get_class( $scorer ) . " with these values $pagedebug" ); + $this->assertTrue( $score <= QualityScore::SCORE_RANGE, "Score is always lower than QualityScore::SCORE_RANGE " . get_class( $scorer ) . " with these values $pagedebug" ); + } + } + } + + /** + * @param $max integer max element in the array + * @return array|null randomly null or an array of size [0, $max] + */ + private function randomArray( $max ) { + if ( mt_rand( 0, 1 ) ) { + $size = mt_rand( 0, $max ); + if ( $size === 0 ) { + return array(); + } + return array_fill( 0, $size, null ); + } + return null; + } } -- To view, visit https://gerrit.wikimedia.org/r/292574 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251 Gerrit-PatchSet: 6 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Gehel <gleder...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits