DCausse has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/292574

Change subject: Fix popqual score for very small wikis
......................................................................

Fix popqual score for very small wikis

A logarithm with base 1 is undefined.

Bug: T136940
Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251
---
M includes/BuildDocument/SuggestScoring.php
M maintenance/updateSuggesterIndex.php
M tests/unit/SuggestScoringTest.php
3 files changed, 49 insertions(+), 17 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/74/292574/1

diff --git a/includes/BuildDocument/SuggestScoring.php 
b/includes/BuildDocument/SuggestScoring.php
index 54d7cd6..ac62d7d 100644
--- a/includes/BuildDocument/SuggestScoring.php
+++ b/includes/BuildDocument/SuggestScoring.php
@@ -305,8 +305,14 @@
                if ( $pop > self::POPULARITY_MAX ) {
                        $pop = 1;
                } else {
-                       // @fixme: rough log scale by using maxDocs...
-                       $pop = log ( 1 + ( $pop * $this->maxDocs ), 1 + ( 
self::POPULARITY_MAX * $this->maxDocs ) );
+                       $logBase = 1 + self::POPULARITY_MAX * $this->maxDocs;
+                       // log₁(x) is undefined
+                       if ( $logBase > 1 ) {
+                               // @fixme: rough log scale by using maxDocs...
+                               $pop = log ( 1 + ( $pop * $this->maxDocs ), 
$logBase );
+                       } else {
+                               $pop = 0;
+                       }
                }
 
                $score += $pop * self::POPULARITY_WEIGHT;
diff --git a/maintenance/updateSuggesterIndex.php 
b/maintenance/updateSuggesterIndex.php
index 949e8ab..a199d89 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -76,6 +76,11 @@
        private $indexIdentifier;
 
        /**
+        * @var string the score method name to use.
+        */
+       private $scoreMethodName;
+
+       /**
         * @var SuggestScoringMethod the score function to use.
         */
        private $scoreMethod;
@@ -176,7 +181,8 @@
                        $wgCirrusSearchBannedPlugins,
                        $wgPoolCounterConf,
                        $wgCirrusSearchMasterTimeout,
-                       $wgCirrusSearchMaxShardsPerNode;
+                       $wgCirrusSearchMaxShardsPerNode,
+                       $wgCirrusSearchCompletionDefaultScore;
 
                $this->masterTimeout = $this->getOption( 'masterTimeout', 
$wgCirrusSearchMasterTimeout );
                $this->indexTypeName = Connection::TITLE_SUGGEST_TYPE;
@@ -213,6 +219,10 @@
                $this->utils->checkElasticsearchVersion();
 
                $this->maxShardsPerNode = isset( 
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] ) ? 
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] : 'unlimited';
+
+               $this->scoreMethodName = $this->getOption( 'scoringMethod', $wgCirrusSearchCompletionDefaultScore );
+               // Use the property assigned just above: there is no local
+               // $scoreMethodName variable in this scope.
+               $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod( $this->scoreMethodName );
+               // NOTE: the builder stores a batchId value to flag documents
+               // indexed by this builder. Make sure to reuse this same
+               // instance when building docs, otherwise the batchId might be
+               // regenerated and can cause data loss when recycling the index.
+               $this->builder = new SuggestBuilder( $this->scoreMethod, $this->withGeo );
 
                try {
                        // If the version does not exist it's certainly because 
nothing has been indexed.
@@ -503,20 +513,6 @@
        }
 
        private function indexData() {
-               global $wgCirrusSearchCompletionDefaultScore;
-               $scoreMethodName = $this->getOption( 'scoringMethod', 
$wgCirrusSearchCompletionDefaultScore );
-               if ( $this->scoreMethod == null ) {
-                       $this->scoreMethod = 
SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName );
-               }
-               if ( $this->builder == null ) {
-                       // NOTE: the builder stores a batchId value to flag
-                       // documents indexed by this builder. Make sure to
-                       // reuse the same instance when building docs otherwise
-                       // the batchId might be regenerated and can cause data
-                       // loss when recycling the index.
-                       $this->builder = new SuggestBuilder( 
$this->scoreMethod, $this->withGeo );
-               }
-
                // We build the suggestions by reading CONTENT and GENERAL 
indices.
                // This does not support extra indices like FILES on commons.
                $sourceIndexTypes = array( Connection::CONTENT_INDEX_TYPE, 
Connection::GENERAL_INDEX_TYPE );
diff --git a/tests/unit/SuggestScoringTest.php 
b/tests/unit/SuggestScoringTest.php
index c0d6b71..eba25f9 100644
--- a/tests/unit/SuggestScoringTest.php
+++ b/tests/unit/SuggestScoringTest.php
@@ -3,6 +3,7 @@
 namespace CirrusSearch;
 
 use CirrusSearch\BuildDocument\QualityScore;
+use CirrusSearch\BuildDocument\PQScore;
 
 /**
  * test suggest scoring functions.
@@ -279,4 +280,33 @@
                );
                $this->assertEquals( QualityScore::SCORE_RANGE, $qs->score( 
$page ), "With a zero page wiki the highest score is also " . 
QualityScore::SCORE_RANGE );
        }
+
+       /**
+        * Fuzz PQScore with random maxDocs values (0..100, so 0 is included)
+        * and random page data, and check that the resulting score is always
+        * an integer within [0, QualityScore::SCORE_RANGE].
+        */
+       public function testPopQualScoreRanking() {
+               $templates = array( 'Good' => 2, 'Bad' => 0.5 );
+               // array_merge() rather than "+=": the union operator keeps the
+               // left-hand values for the existing keys 0 and 1, so 'Foo' and
+               // 'Bar' would never be added to the list.
+               $all_templates = array_merge( array_keys( $templates ), array( 'Foo', 'Bar' ) );
+               $qs = new PQScore( $templates );
+               for ( $i = 0; $i < 1000; $i++ ) {
+                       $maxDocs = rand( 0, 100 );
+                       $qs->setMaxDocs( $maxDocs );
+                       // Pick a random, non-empty prefix of the template list
+                       // (walked from the chosen index down to 0).
+                       $tmpl = array();
+                       for ( $j = rand( 0, count( $all_templates ) - 1 ); $j >= 0; $j-- ) {
+                               $tmpl[] = $all_templates[$j];
+                       }
+                       $page = array(
+                               'incoming_links' => rand( 0, 200 ),
+                               'external_link' => array_fill( 0, rand( 0, 200 ), null ),
+                               'text_bytes' => (string) rand( 0, 230000 ),
+                               'heading' => array_fill( 0, rand( 0, 30 ), null ),
+                               'redirect' => array_fill( 0, rand( 0, 100 ), null ),
+                               'template' => $tmpl
+                       );
+
+                       $score = $qs->score( $page );
+
+                       $this->assertTrue( is_int( $score ), "Score is always an integer" );
+                       // Dedicated range assertions report the offending value on failure.
+                       $this->assertGreaterThanOrEqual( 0, $score, "Score is always positive" );
+                       $this->assertLessThanOrEqual( QualityScore::SCORE_RANGE, $score, "Score is always lower than QualityScore::SCORE_RANGE" );
+               }
+       }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/292574
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie113e3bb524f2548f265a15ba871d735a234d251
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to