EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/235905

Change subject: Improve completion suggestion suggester (take 2)
......................................................................

Improve completion suggestion suggester (take 2)

* Lower score for suggestions based on redirects that do not start with the
  same letter.
* Fuzzy weirdness: run fuzzy with prefix len = 0 only if the query is longer
  than 4 chars.
* Do not remove stopwords from user query (to be or not to be).
* Better precision for punctuation (use whitespace analyzer for plain instead
  of standard analyzer).
* Enable ASCII folding only for plain_stop.

Bug: T110915
Change-Id: I7ed88ba76746bf0189d8ba70377778122b554403
(cherry picked from commit 660f8c44044101f02c443ccdb40a3cd1fdba0334)
---
M includes/BuildDocument/SuggestBuilder.php
M includes/ElasticsearchIntermediary.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M includes/Searcher.php
M profiles/SuggestProfiles.php
M tests/unit/SuggestBuilderTest.php
6 files changed, 156 insertions(+), 28 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/05/235905/1

diff --git a/includes/BuildDocument/SuggestBuilder.php 
b/includes/BuildDocument/SuggestBuilder.php
index 9570b2f..45824f9 100644
--- a/includes/BuildDocument/SuggestBuilder.php
+++ b/includes/BuildDocument/SuggestBuilder.php
@@ -42,6 +42,13 @@
        const REDIRECT_DISCOUNT = 0.1;
 
        /**
+        * Number of common prefix chars a redirect must share with the title 
to be
+        * promoted as a title suggestion.
+        * This avoids promoting Eraq as a title suggestion for Iraq.
+        */
+       const REDIRECT_COMMON_PREFIX_LEN = 1;
+
+       /**
         * @var SuggestScoringMethod the scoring function
         */
        private $scoringMethod;
@@ -301,6 +308,25 @@
                $a = mb_strtolower( $a );
                $b = mb_strtolower( $b );
 
+               $aLength = mb_strlen( $a );
+               $bLength = mb_strlen( $b );
+
+               $commonPrefixLen = self::REDIRECT_COMMON_PREFIX_LEN;
+
+               if ( $aLength < $commonPrefixLen ) {
+                       $commonPrefixLen = $aLength;
+               }
+               if( $bLength < $commonPrefixLen ) {
+                       $commonPrefixLen = $bLength;
+               }
+
+               // check the common prefix
+               if ( mb_substr( $a, 0, $commonPrefixLen ) != mb_substr( $b, 0, 
$commonPrefixLen ) ) {
+                       return PHP_INT_MAX;
+               }
+
+               // TODO: switch to a ratio instead of raw distance would help 
to group
+               // longer strings
                return levenshtein( $a, $b );
        }
 }
diff --git a/includes/ElasticsearchIntermediary.php 
b/includes/ElasticsearchIntermediary.php
index 10701d4..7b62813 100644
--- a/includes/ElasticsearchIntermediary.php
+++ b/includes/ElasticsearchIntermediary.php
@@ -313,8 +313,8 @@
                $this->searchMetrics['wgCirrusStartTime'] = $this->requestStart;
                $this->searchMetrics['wgCirrusEndTime'] = $endTime;
                $logContext = $this->buildLogContext( $took );
-               if ( isset( $logContext['elasticTook'] ) ) {
-                       $this->searchMetrics['wgCirrusElasticTime'] = 
$logContext['elasticTook'];
+               if ( isset( $logContext['elasticTookMs'] ) ) {
+                       $this->searchMetrics['wgCirrusElasticTime'] = 
$logContext['elasticTookMs'];
                }
                if ( $wgCirrusSearchLogElasticRequests ) {
                        $logMessage = $this->buildLogMessage( $logContext );
@@ -341,6 +341,9 @@
                $message .= " against {index} took {tookMs} millis";
                if ( isset( $context['elasticTookMs'] ) ) {
                        $message .= " and {elasticTookMs} Elasticsearch millis";
+                       if ( isset( $context['elasticTook2PassMs'] ) ) {
+                               $message .= " (with 2nd pass: 
{elasticTook2PassMs} ms)";
+                       }
                }
                if ( isset( $context['hitsTotal'] ) ){
                        $message .= ". Found {hitsTotal} total results";
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php 
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index eaf9ecc..7367dd5 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -43,16 +43,39 @@
         */
        protected function defaults() {
                $defaults = array(
+                       'char_filter' => array(
+                               'word_break_helper' => array(
+                                       'type' => 'mapping',
+                                       'mappings' => array(
+                                               '_=>\u0020', // a space for mw
+                                               ',=>\u0020', // useful for 
"Lastname, Firstname"
+                                               '"=>\u0020', // " certainly 
phrase search?
+                                               '-=>\u0020', // useful for 
hyphenated names
+                                               "'=>\u0020",       // Useful 
for finding names
+                                               '\u2019=>\u0020',  // Unicode 
right single quote
+                                               '\u02BC=>\u0020',  // Unicode 
modifier letter apostrophe
+                                               // Not sure about ( and )...
+                                               // very useful to search for :
+                                               //   "john smith explo" instead 
of "john smith (expl"
+                                               // but annoying to search for 
"(C)"
+                                               // ')=>\u0020',
+                                               // '(=>\u0020',
+                                               // Others are the ones ignored 
by common search engines
+                                               ':=>\u0020',
+                                               ';=>\u0020',
+                                               '\\[=>\u0020',
+                                               '\\]=>\u0020',
+                                               '{=>\u0020',
+                                               '}=>\u0020',
+                                               '\\\\=>\u0020'
+                                       ),
+                               ),
+                       ),
                        'filter' => array(
                                "stop_filter" => array(
                                        "type" => "stop",
                                        "stopwords" => "_none_",
                                        "remove_trailing" => "true"
-                               ),
-                               "stop_filter_search" => array(
-                                       "type" => "stop",
-                                       "stopwords" => "_none_",
-                                       "remove_trailing" => "false"
                                ),
                                "asciifolding_preserve" => array(
                                        "type" => "asciifolding",
@@ -79,12 +102,14 @@
                                        ),
                                        "tokenizer" => "standard"
                                ),
+                               // We do not remove stop words when searching,
+                               // this leads to extremely weird behaviors while
+                               // writing "to be or not to be"
                                "stop_analyzer_search" => array(
                                        "type" => "custom",
                                        "filter" => array(
                                                "standard",
                                                "lowercase",
-                                               "stop_filter_search",
                                                "asciifolding_preserve",
                                                "token_limit"
                                        ),
@@ -92,23 +117,21 @@
                                ),
                                "plain" => array(
                                        "type" => "custom",
+                                       "char_filter" => array( 
'word_break_helper' ),
                                        "filter" => array(
-                                               "standard",
-                                               "icu_normalizer",
-                                               "asciifolding_preserve",
-                                               "token_limit"
+                                               "token_limit",
+                                               "lowercase"
                                        ),
-                                       "tokenizer" => "standard"
+                                       "tokenizer" => "whitespace"
                                ),
                                "plain_search" => array(
                                        "type" => "custom",
+                                       "char_filter" => array( 
'word_break_helper' ),
                                        "filter" => array(
-                                               "standard",
-                                               "icu_normalizer",
-                                               "asciifolding_preserve",
-                                               "token_limit"
+                                               "token_limit",
+                                               "lowercase"
                                        ),
-                                       "tokenizer" => "standard"
+                                       "tokenizer" => "whitespace"
                                )
                        ),
                );
@@ -118,9 +141,11 @@
        private function customize( $config ) {
                $defaultStopSet = $this->getDefaultStopSet( 
$this->getLanguage() );
                $config['filter']['stop_filter']['stopwords'] = $defaultStopSet;
-               $config['filter']['stop_filter_search']['stopwords'] = 
$defaultStopSet;
                if ( $this->isIcuAvailable() ) {
-                       foreach ( $config[ 'analyzer' ] as &$analyzer ) {
+                       foreach ( $config[ 'analyzer' ] as $k => &$analyzer ) {
+                               if ( $k != "stop_analyzer" && $k != 
"stop_analyzer_search" ) {
+                                       continue;
+                               }
                                if ( !isset( $analyzer[ 'filter'  ] ) ) {
                                        continue;
                                }
diff --git a/includes/Searcher.php b/includes/Searcher.php
index a95590e..2b90462 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -789,7 +789,7 @@
                $this->term = $text;
 
                $suggest = array( 'text' => $text );
-               $queryLen = mb_strlen( $text );
+               $queryLen = mb_strlen( trim( $text ) ); // Avoid cheating with 
spaces
                $profile = $wgCirrusSearchCompletionSettings;
 
                if ( $context != null && isset( $context['geo']['lat'] ) && 
isset( $context['geo']['lon'] )
@@ -801,6 +801,9 @@
 
                foreach ( $profile as $name => $config ) {
                        if ( $config['min_query_len'] > $queryLen ) {
+                               continue;
+                       }
+                       if ( isset( $config['max_query_len'] ) && $queryLen > 
$config['max_query_len'] ) {
                                continue;
                        }
                        $field = $config['field'];
@@ -896,6 +899,7 @@
         * @return Title[] List of suggested titles
         */
        protected function postProcessSuggest( $query, \Elastica\Response 
$response, $profile, $limit = -1 ) {
+               $this->logContext['elasticTookMs'] = intval( 
$response->getQueryTime() * 1000 );
                $data = $response->getData();
                unset( $data['_shards'] );
 
@@ -971,6 +975,7 @@
                                        array( 'ids' => $missingText ),
                                        array( '_source_include' => 'redirect' 
) );
                                if ( $redirResponse->isOk() ) {
+                                       $this->logContext['elasticTook2PassMs'] 
= intval( $redirResponse->getQueryTime() * 1000 );
                                        $docs = $redirResponse->getData();
                                        $docs = $docs['docs'];
                                        foreach ( $docs as $doc ) {
diff --git a/profiles/SuggestProfiles.php b/profiles/SuggestProfiles.php
index 94b94c4..c04a6f9 100644
--- a/profiles/SuggestProfiles.php
+++ b/profiles/SuggestProfiles.php
@@ -48,25 +48,51 @@
                'plain_stop' => array(
                        'field' => 'suggest-stop',
                        'min_query_len' => 0,
-                       'discount' => 0.1,
+                       'discount' => 0.001,
                        'fetch_limit_factor' => 2,
                ),
-               'plain_fuzzy' => array(
+               // Fuzzy query for query length (3 to 4) with prefix len 1
+               'plain_fuzzy_1' => array(
                        'field' => 'suggest',
                        'min_query_len' => 3,
+                       'max_query_len' => 4,
                        'discount' => 0.005,
-                       'fetch_limit_factor' => 2,
+                       'fetch_limit_factor' => 1,
+                       'fuzzy' => array(
+                               'fuzzyness' => 'AUTO',
+                               'prefix_length' => 1,
+                               'unicode_aware' => true,
+                       )
+               ),
+               'plain_stop_fuzzy_1' => array(
+                       'field' => 'suggest-stop',
+                       'min_query_len' => 3,
+                       'max_query_len' => 4,
+                       'discount' => 0.0001,
+                       'fetch_limit_factor' => 1,
+                       'fuzzy' => array(
+                               'fuzzyness' => 'AUTO',
+                               'prefix_length' => 1,
+                               'unicode_aware' => true,
+                       )
+               ),
+               // Fuzzy query for query length >= 5 with prefix len 0
+               'plain_fuzzy_0' => array(
+                       'field' => 'suggest',
+                       'min_query_len' => 5,
+                       'discount' => 0.005,
+                       'fetch_limit_factor' => 1,
                        'fuzzy' => array(
                                'fuzzyness' => 'AUTO',
                                'prefix_length' => 0,
                                'unicode_aware' => true,
                        )
                ),
-               'plain_stop_fuzzy' => array(
+               'plain_stop_fuzzy_0' => array(
                        'field' => 'suggest-stop',
-                       'min_query_len' => 3,
-                       'discount' => 0.001,
-                       'fetch_limit_factor' => 2,
+                       'min_query_len' => 5,
+                       'discount' => 0.0001,
+                       'fetch_limit_factor' => 1,
                        'fuzzy' => array(
                                'fuzzyness' => 'AUTO',
                                'prefix_length' => 0,
diff --git a/tests/unit/SuggestBuilderTest.php 
b/tests/unit/SuggestBuilderTest.php
index e361631..416d7f8 100644
--- a/tests/unit/SuggestBuilderTest.php
+++ b/tests/unit/SuggestBuilderTest.php
@@ -72,6 +72,49 @@
                $this->assertSame( $expected, $suggestions );
        }
 
+       public function testEraq() {
+               $builder = new SuggestBuilder( 
SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks', 1 ) );
+               $score = 10;
+               $redirScore = (int) ( $score * 
SuggestBuilder::REDIRECT_DISCOUNT );
+               $doc = array(
+                       'title' => 'Iraq',
+                       'redirect' => array(
+                               array( 'title' => "Eraq", 'namespace' => 0 ),
+                               array( 'title' => "Irak", 'namespace' => 0 ),
+                       ),
+                       'incoming_links' => $score
+               );
+
+               $expected = array(
+                       array(
+                               'suggest' => array(
+                                       'input' => array( 'Iraq', 'Irak' ),
+                                       'output' => '1:t:Iraq',
+                                       'weight' => $score
+                               ),
+                               'suggest-stop' => array(
+                                       'input' => array( 'Iraq', 'Irak' ),
+                                       'output' => '1:t:Iraq',
+                                       'weight' => $score
+                               )
+                       ),
+                       array(
+                               'suggest' => array(
+                                       'input' => array( 'Eraq' ),
+                                       'output' => '1:r',
+                                       'weight' => $redirScore
+                               ),
+                               'suggest-stop' => array(
+                                       'input' => array( 'Eraq' ),
+                                       'output' => '1:r',
+                                       'weight' => $redirScore
+                               )
+                       )
+               );
+               $suggestions = $builder->build( 1, $doc );
+               $this->assertSame( $expected, $suggestions );
+       }
+
        public function testUlm() {
                $builder = new SuggestBuilder( 
SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks', 1 ) );
                $score = 10;

-- 
To view, visit https://gerrit.wikimedia.org/r/235905
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I7ed88ba76746bf0189d8ba70377778122b554403
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: wmf/1.26wmf21
Gerrit-Owner: EBernhardson <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to