jenkins-bot has submitted this change and it was merged.

Change subject: Fix popqual score for very small wikis
......................................................................


Fix popqual score for very small wikis

log base 1 is never defined.

Bug: T136940
Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251
---
M includes/BuildDocument/SuggestScoring.php
M maintenance/updateSuggesterIndex.php
M tests/unit/SuggestScoringTest.php
3 files changed, 82 insertions(+), 28 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  EBernhardson: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/includes/BuildDocument/SuggestScoring.php 
b/includes/BuildDocument/SuggestScoring.php
index 54d7cd6..ac62d7d 100644
--- a/includes/BuildDocument/SuggestScoring.php
+++ b/includes/BuildDocument/SuggestScoring.php
@@ -305,8 +305,14 @@
                if ( $pop > self::POPULARITY_MAX ) {
                        $pop = 1;
                } else {
-                       // @fixme: rough log scale by using maxDocs...
-                       $pop = log ( 1 + ( $pop * $this->maxDocs ), 1 + ( 
self::POPULARITY_MAX * $this->maxDocs ) );
+                       $logBase = 1 + self::POPULARITY_MAX * $this->maxDocs;
+                       // log₁(x) is undefined
+                       if ( $logBase > 1 ) {
+                               // @fixme: rough log scale by using maxDocs...
+                               $pop = log ( 1 + ( $pop * $this->maxDocs ), 
$logBase );
+                       } else {
+                               $pop = 0;
+                       }
                }
 
                $score += $pop * self::POPULARITY_WEIGHT;
diff --git a/maintenance/updateSuggesterIndex.php 
b/maintenance/updateSuggesterIndex.php
index 949e8ab..1f15196 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -76,6 +76,11 @@
        private $indexIdentifier;
 
        /**
+        * @var string the score method name to use.
+        */
+       private $scoreMethodName;
+
+       /**
         * @var SuggestScoringMethod the score function to use.
         */
        private $scoreMethod;
@@ -176,7 +181,8 @@
                        $wgCirrusSearchBannedPlugins,
                        $wgPoolCounterConf,
                        $wgCirrusSearchMasterTimeout,
-                       $wgCirrusSearchMaxShardsPerNode;
+                       $wgCirrusSearchMaxShardsPerNode,
+                       $wgCirrusSearchCompletionDefaultScore;
 
                $this->masterTimeout = $this->getOption( 'masterTimeout', 
$wgCirrusSearchMasterTimeout );
                $this->indexTypeName = Connection::TITLE_SUGGEST_TYPE;
@@ -213,6 +219,10 @@
                $this->utils->checkElasticsearchVersion();
 
                $this->maxShardsPerNode = isset( 
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] ) ? 
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] : 'unlimited';
+
+               $this->scoreMethodName = $this->getOption( 'scoringMethod', 
$wgCirrusSearchCompletionDefaultScore );
+               $this->scoreMethod = 
SuggestScoringMethodFactory::getScoringMethod( $this->scoreMethodName );
+               $this->builder = new SuggestBuilder( $this->scoreMethod, 
$this->withGeo );
 
                try {
                        // If the version does not exist it's certainly because 
nothing has been indexed.
@@ -503,20 +513,6 @@
        }
 
        private function indexData() {
-               global $wgCirrusSearchCompletionDefaultScore;
-               $scoreMethodName = $this->getOption( 'scoringMethod', 
$wgCirrusSearchCompletionDefaultScore );
-               if ( $this->scoreMethod == null ) {
-                       $this->scoreMethod = 
SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName );
-               }
-               if ( $this->builder == null ) {
-                       // NOTE: the builder stores a batchId value to flag
-                       // documents indexed by this builder. Make sure to
-                       // reuse the same instance when building docs otherwise
-                       // the batchId might be regenerated and can cause data
-                       // loss when recycling the index.
-                       $this->builder = new SuggestBuilder( 
$this->scoreMethod, $this->withGeo );
-               }
-
                // We build the suggestions by reading CONTENT and GENERAL 
indices.
                // This does not support extra indices like FILES on commons.
                $sourceIndexTypes = array( Connection::CONTENT_INDEX_TYPE, 
Connection::GENERAL_INDEX_TYPE );
@@ -574,7 +570,7 @@
                        $totalDocsToDump = $totalDocsInIndex;
 
                        $docsDumped = 0;
-                       $this->log( "Indexing $totalDocsToDump documents from 
$sourceIndexType ($totalDocsInIndex in the index) with batchId: 
{$this->builder->getBatchId()} and scoring method: $scoreMethodName\n" );
+                       $this->log( "Indexing $totalDocsToDump documents from 
$sourceIndexType ($totalDocsInIndex in the index) with batchId: 
{$this->builder->getBatchId()} and scoring method: {$this->scoreMethodName}\n" 
);
 
                        $destinationType = $this->getIndex()->getType( 
Connection::TITLE_SUGGEST_TYPE_NAME );
 
diff --git a/tests/unit/SuggestScoringTest.php 
b/tests/unit/SuggestScoringTest.php
index c0d6b71..8122d8f 100644
--- a/tests/unit/SuggestScoringTest.php
+++ b/tests/unit/SuggestScoringTest.php
@@ -2,7 +2,9 @@
 
 namespace CirrusSearch;
 
+use CirrusSearch\BuildDocument\IncomingLinksScoringMethod;
 use CirrusSearch\BuildDocument\QualityScore;
+use CirrusSearch\BuildDocument\PQScore;
 
 /**
  * test suggest scoring functions.
@@ -27,8 +29,8 @@
                $qs = new QualityScore();
                $qs->setMaxDocs( 10000 );
                for( $i = 0; $i < 1000; $i++ ) {
-                       $value = rand( 0, 1000000 );
-                       $norm = rand( 1, 1000000 );
+                       $value = mt_rand( 0, 1000000 );
+                       $norm = mt_rand( 1, 1000000 );
                        $score = $qs->scoreNorm( $value, $norm );
                        $this->assertLessThanOrEqual( 1, $score, "scoreNorm 
cannot produce a score greater than 1" );
                        $this->assertGreaterThanOrEqual( 0, $score, "scoreNorm 
cannot produce a score lower than 0" );
@@ -59,8 +61,8 @@
        public function testQualityScoreBoostFunction() {
                $qs = new QualityScore();
                for( $i = 0; $i < 1000; $i++ ) {
-                       $score = (float) rand() / (float) mt_getrandmax();
-                       $boost = (float) rand( 0, 10000 ) / rand( 1, 10000 );
+                       $score = (float) mt_rand() / (float) mt_getrandmax();
+                       $boost = (float) mt_rand( 0, 10000 ) / mt_rand( 1, 
10000 );
                        $res = $qs->boost( $score, $boost );
                        $this->assertLessThanOrEqual( 1, $score, "boost cannot 
produce a score greater than 1" );
                        $this->assertGreaterThanOrEqual( 0, $score, "boost 
cannot produce a score lower than 0" );
@@ -219,12 +221,12 @@
 
                for( $i = 0; $i < 1000; $i++ ) {
                        $page = array(
-                               'incoming_links' => rand( 0, 2^31-1 ),
-                               'external_link' => array_fill( 0, rand( 1, 2000 
), null ),
-                               'text_bytes' => rand( 1, 400000 ),
-                               'heading' => array_fill( 0, rand( 1, 1000 ), 
null ),
-                               'redirect' => array_fill( 0, rand( 1, 1000 ), 
null ),
-                               'template' => rand( 0, 1 ) == 1 ? array( 'Good' 
) : array('Bad')
+                               'incoming_links' => mt_rand( 0, 2^31-1 ),
+                               'external_link' => array_fill( 0, mt_rand( 1, 
2000 ), null ),
+                               'text_bytes' => mt_rand( 1, 400000 ),
+                               'heading' => array_fill( 0, mt_rand( 1, 1000 ), 
null ),
+                               'redirect' => array_fill( 0, mt_rand( 1, 1000 
), null ),
+                               'template' => mt_rand( 0, 1 ) == 1 ? array( 
'Good' ) : array('Bad')
                        );
                        $this->assertGreaterThan( 0, $qs->score( $page ), 
"Score is always greater than 0" );
                        $this->assertLessThan( QualityScore::SCORE_RANGE, 
$qs->score( $page ), "Score is always lower than " . QualityScore::SCORE_RANGE 
);
@@ -279,4 +281,54 @@
                );
                $this->assertEquals( QualityScore::SCORE_RANGE, $qs->score( 
$page ), "With a zero page wiki the highest score is also " . 
QualityScore::SCORE_RANGE );
        }
+
+       public function testRobustness() {
+               $templates =  array( 'Good' => 2, 'Bad' => 0.5 );
+               $all_templates = array_keys( $templates );
+               $all_templates += array( 'Foo', 'Bar' );
+               for ( $i = 0; $i < 5000; $i++ ) {
+                       $scorers = array();
+                       $scorers[] = new PQScore( array( 'Good' => 2, 'Bad' => 
0.5 ) );
+                       $scorers[] = new QualityScore( array( 'Good' => 2, 
'Bad' => 0.5 ) );
+                       $scorers[] = new IncomingLinksScoringMethod();
+                       $tmpl = array();
+                       for ( $j = mt_rand( 0, count( $all_templates ) - 1 ); 
$j >= 0; $j-- ) {
+                               $tmpl[] = $all_templates[$j];
+                       }
+                       $page = array();
+                       $page['incoming_links'] = mt_rand( 0, 1 ) ? mt_rand( 0, 
200 ) : null;
+                       $page['external_link'] = $this->randomArray( 200 );
+                       $page['text_bytes'] = mt_rand( 0, 1 ) ? (string) 
mt_rand( 0, 230000 ) : null;
+                       $page['heading'] = $this->randomArray( 30 );
+                       $page['redirect'] = $this->randomArray( 100 );
+                       $page['popularity_score'] = mt_rand( 0, 1 ) ? 1 / 
mt_rand( 1, 1800000 ) : null;
+                       $page['templates'] = mt_rand( 0, 1 ) ? $tmpl : null;
+
+                       $maxDocs = mt_rand( 0, 100 );
+                       foreach( $scorers as $scorer ) {
+                               $scorer->setMaxDocs( $maxDocs );
+                               $score = $scorer->score( $page );
+                               $pagedebug = print_r( $page, true );
+
+                               $this->assertTrue( is_int( $score ), "Score is 
always an integer for " . get_class( $scorer ) . " with these values 
$pagedebug" );
+                               $this->assertTrue( $score >= 0, "Score is 
always positive " . get_class( $scorer ) . " with these values $pagedebug" );
+                               $this->assertTrue( $score <= 
QualityScore::SCORE_RANGE, "Score is always lower than 
QualityScore::SCORE_RANGE " . get_class( $scorer ) . " with these values 
$pagedebug" );
+                       }
+               }
+       }
+
+       /**
+        * @param $max integer max element in the array
+        * @return array|null randomly null or an array of size [0, $max]
+        */
+       private function randomArray( $max ) {
+               if ( mt_rand( 0, 1 ) ) {
+                       $size = mt_rand( 0, $max );
+                       if ( $size === 0 ) {
+                               return array();
+                       }
+                       return array_fill( 0, $size, null );
+               }
+               return null;
+       }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/292574
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <gleder...@wikimedia.org>
Gerrit-Reviewer: Manybubbles <never...@wikimedia.org>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to