EBernhardson has uploaded a new change for review.
https://gerrit.wikimedia.org/r/235905
Change subject: Improve completion suggestion suggester (take 2)
......................................................................
Improve completion suggestion suggester (take 2)
* Lower score for suggestions based on redirects that do not start with the
same letter.
* Fuzzy weirdness: run fuzzy with prefix len = 0 only if the query is longer
than 4 chars.
* Do not remove stopwords from user query (to be or not to be).
* Better precision for punctuation (use whitespace analyzer for plain instead
of standard analyzer).
* Enable ASCII folding only for plain_stop.
Bug: T110915
Change-Id: I7ed88ba76746bf0189d8ba70377778122b554403
(cherry picked from commit 660f8c44044101f02c443ccdb40a3cd1fdba0334)
---
M includes/BuildDocument/SuggestBuilder.php
M includes/ElasticsearchIntermediary.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M includes/Searcher.php
M profiles/SuggestProfiles.php
M tests/unit/SuggestBuilderTest.php
6 files changed, 156 insertions(+), 28 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/05/235905/1
diff --git a/includes/BuildDocument/SuggestBuilder.php
b/includes/BuildDocument/SuggestBuilder.php
index 9570b2f..45824f9 100644
--- a/includes/BuildDocument/SuggestBuilder.php
+++ b/includes/BuildDocument/SuggestBuilder.php
@@ -42,6 +42,13 @@
const REDIRECT_DISCOUNT = 0.1;
/**
+ * Number of common prefix chars a redirect must share with the title
to be
+ * promoted as a title suggestion.
+ * This avoids promoting Eraq as a title suggestion for Iraq.
+ */
+ const REDIRECT_COMMON_PREFIX_LEN = 1;
+
+ /**
* @var SuggestScoringMethod the scoring function
*/
private $scoringMethod;
@@ -301,6 +308,25 @@
$a = mb_strtolower( $a );
$b = mb_strtolower( $b );
+ $aLength = mb_strlen( $a );
+ $bLength = mb_strlen( $b );
+
+ $commonPrefixLen = self::REDIRECT_COMMON_PREFIX_LEN;
+
+ if ( $aLength < $commonPrefixLen ) {
+ $commonPrefixLen = $aLength;
+ }
+ if( $bLength < $commonPrefixLen ) {
+ $commonPrefixLen = $bLength;
+ }
+
+ // check the common prefix
+ if ( mb_substr( $a, 0, $commonPrefixLen ) != mb_substr( $b, 0,
$commonPrefixLen ) ) {
+ return PHP_INT_MAX;
+ }
+
+ // TODO: switching to a ratio instead of a raw distance would help
to group
+ // longer strings
return levenshtein( $a, $b );
}
}
diff --git a/includes/ElasticsearchIntermediary.php
b/includes/ElasticsearchIntermediary.php
index 10701d4..7b62813 100644
--- a/includes/ElasticsearchIntermediary.php
+++ b/includes/ElasticsearchIntermediary.php
@@ -313,8 +313,8 @@
$this->searchMetrics['wgCirrusStartTime'] = $this->requestStart;
$this->searchMetrics['wgCirrusEndTime'] = $endTime;
$logContext = $this->buildLogContext( $took );
- if ( isset( $logContext['elasticTook'] ) ) {
- $this->searchMetrics['wgCirrusElasticTime'] =
$logContext['elasticTook'];
+ if ( isset( $logContext['elasticTookMs'] ) ) {
+ $this->searchMetrics['wgCirrusElasticTime'] =
$logContext['elasticTookMs'];
}
if ( $wgCirrusSearchLogElasticRequests ) {
$logMessage = $this->buildLogMessage( $logContext );
@@ -341,6 +341,9 @@
$message .= " against {index} took {tookMs} millis";
if ( isset( $context['elasticTookMs'] ) ) {
$message .= " and {elasticTookMs} Elasticsearch millis";
+ if ( isset( $context['elasticTook2PassMs'] ) ) {
+ $message .= " (with 2nd pass:
{elasticTook2PassMs} ms)";
+ }
}
if ( isset( $context['hitsTotal'] ) ){
$message .= ". Found {hitsTotal} total results";
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index eaf9ecc..7367dd5 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -43,16 +43,39 @@
*/
protected function defaults() {
$defaults = array(
+ 'char_filter' => array(
+ 'word_break_helper' => array(
+ 'type' => 'mapping',
+ 'mappings' => array(
+ '_=>\u0020', // a space for mw
+ ',=>\u0020', // useful for
"Lastname, Firstname"
+ '"=>\u0020', // " certainly
phrase search?
+ '-=>\u0020', // useful for
hyphenated names
+ "'=>\u0020", // Useful
for finding names
+ '\u2019=>\u0020', // Unicode
right single quote
+ '\u02BC=>\u0020', // Unicode
modifier letter apostrophe
+ // Not sure about ( and )...
+ // very useful to search for :
+ // "john smith explo" instead
of "john smith (expl"
+ // but annoying to search for
"(C)"
+ // ')=>\u0020',
+ // '(=>\u0020',
+ // Others are the ones ignored
by common search engines
+ ':=>\u0020',
+ ';=>\u0020',
+ '\\[=>\u0020',
+ '\\]=>\u0020',
+ '{=>\u0020',
+ '}=>\u0020',
+ '\\\\=>\u0020'
+ ),
+ ),
+ ),
'filter' => array(
"stop_filter" => array(
"type" => "stop",
"stopwords" => "_none_",
"remove_trailing" => "true"
- ),
- "stop_filter_search" => array(
- "type" => "stop",
- "stopwords" => "_none_",
- "remove_trailing" => "false"
),
"asciifolding_preserve" => array(
"type" => "asciifolding",
@@ -79,12 +102,14 @@
),
"tokenizer" => "standard"
),
+ // We do not remove stop words when searching,
+ // this leads to extremely weird behaviors while
+ // writing "to be or not to be"
"stop_analyzer_search" => array(
"type" => "custom",
"filter" => array(
"standard",
"lowercase",
- "stop_filter_search",
"asciifolding_preserve",
"token_limit"
),
@@ -92,23 +117,21 @@
),
"plain" => array(
"type" => "custom",
+ "char_filter" => array(
'word_break_helper' ),
"filter" => array(
- "standard",
- "icu_normalizer",
- "asciifolding_preserve",
- "token_limit"
+ "token_limit",
+ "lowercase"
),
- "tokenizer" => "standard"
+ "tokenizer" => "whitespace"
),
"plain_search" => array(
"type" => "custom",
+ "char_filter" => array(
'word_break_helper' ),
"filter" => array(
- "standard",
- "icu_normalizer",
- "asciifolding_preserve",
- "token_limit"
+ "token_limit",
+ "lowercase"
),
- "tokenizer" => "standard"
+ "tokenizer" => "whitespace"
)
),
);
@@ -118,9 +141,11 @@
private function customize( $config ) {
$defaultStopSet = $this->getDefaultStopSet(
$this->getLanguage() );
$config['filter']['stop_filter']['stopwords'] = $defaultStopSet;
- $config['filter']['stop_filter_search']['stopwords'] =
$defaultStopSet;
if ( $this->isIcuAvailable() ) {
- foreach ( $config[ 'analyzer' ] as &$analyzer ) {
+ foreach ( $config[ 'analyzer' ] as $k => &$analyzer ) {
+ if ( $k != "stop_analyzer" && $k !=
"stop_analyzer_search" ) {
+ continue;
+ }
if ( !isset( $analyzer[ 'filter' ] ) ) {
continue;
}
diff --git a/includes/Searcher.php b/includes/Searcher.php
index a95590e..2b90462 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -789,7 +789,7 @@
$this->term = $text;
$suggest = array( 'text' => $text );
- $queryLen = mb_strlen( $text );
+ $queryLen = mb_strlen( trim( $text ) ); // Avoid cheating with
spaces
$profile = $wgCirrusSearchCompletionSettings;
if ( $context != null && isset( $context['geo']['lat'] ) &&
isset( $context['geo']['lon'] )
@@ -801,6 +801,9 @@
foreach ( $profile as $name => $config ) {
if ( $config['min_query_len'] > $queryLen ) {
+ continue;
+ }
+ if ( isset( $config['max_query_len'] ) && $queryLen >
$config['max_query_len'] ) {
continue;
}
$field = $config['field'];
@@ -896,6 +899,7 @@
* @return Title[] List of suggested titles
*/
protected function postProcessSuggest( $query, \Elastica\Response
$response, $profile, $limit = -1 ) {
+ $this->logContext['elasticTookMs'] = intval(
$response->getQueryTime() * 1000 );
$data = $response->getData();
unset( $data['_shards'] );
@@ -971,6 +975,7 @@
array( 'ids' => $missingText ),
array( '_source_include' => 'redirect'
) );
if ( $redirResponse->isOk() ) {
+ $this->logContext['elasticTook2PassMs']
= intval( $redirResponse->getQueryTime() * 1000 );
$docs = $redirResponse->getData();
$docs = $docs['docs'];
foreach ( $docs as $doc ) {
diff --git a/profiles/SuggestProfiles.php b/profiles/SuggestProfiles.php
index 94b94c4..c04a6f9 100644
--- a/profiles/SuggestProfiles.php
+++ b/profiles/SuggestProfiles.php
@@ -48,25 +48,51 @@
'plain_stop' => array(
'field' => 'suggest-stop',
'min_query_len' => 0,
- 'discount' => 0.1,
+ 'discount' => 0.001,
'fetch_limit_factor' => 2,
),
- 'plain_fuzzy' => array(
+ // Fuzzy query for query length (3 to 4) with prefix len 1
+ 'plain_fuzzy_1' => array(
'field' => 'suggest',
'min_query_len' => 3,
+ 'max_query_len' => 4,
'discount' => 0.005,
- 'fetch_limit_factor' => 2,
+ 'fetch_limit_factor' => 1,
+ 'fuzzy' => array(
+ 'fuzzyness' => 'AUTO',
+ 'prefix_length' => 1,
+ 'unicode_aware' => true,
+ )
+ ),
+ 'plain_stop_fuzzy_1' => array(
+ 'field' => 'suggest-stop',
+ 'min_query_len' => 3,
+ 'max_query_len' => 4,
+ 'discount' => 0.0001,
+ 'fetch_limit_factor' => 1,
+ 'fuzzy' => array(
+ 'fuzzyness' => 'AUTO',
+ 'prefix_length' => 1,
+ 'unicode_aware' => true,
+ )
+ ),
+ // Fuzzy query for query length >= 5 with prefix len 0
+ 'plain_fuzzy_0' => array(
+ 'field' => 'suggest',
+ 'min_query_len' => 5,
+ 'discount' => 0.005,
+ 'fetch_limit_factor' => 1,
'fuzzy' => array(
'fuzzyness' => 'AUTO',
'prefix_length' => 0,
'unicode_aware' => true,
)
),
- 'plain_stop_fuzzy' => array(
+ 'plain_stop_fuzzy_0' => array(
'field' => 'suggest-stop',
- 'min_query_len' => 3,
- 'discount' => 0.001,
- 'fetch_limit_factor' => 2,
+ 'min_query_len' => 5,
+ 'discount' => 0.0001,
+ 'fetch_limit_factor' => 1,
'fuzzy' => array(
'fuzzyness' => 'AUTO',
'prefix_length' => 0,
diff --git a/tests/unit/SuggestBuilderTest.php
b/tests/unit/SuggestBuilderTest.php
index e361631..416d7f8 100644
--- a/tests/unit/SuggestBuilderTest.php
+++ b/tests/unit/SuggestBuilderTest.php
@@ -72,6 +72,49 @@
$this->assertSame( $expected, $suggestions );
}
+ public function testEraq() {
+ $builder = new SuggestBuilder(
SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks', 1 ) );
+ $score = 10;
+ $redirScore = (int) ( $score *
SuggestBuilder::REDIRECT_DISCOUNT );
+ $doc = array(
+ 'title' => 'Iraq',
+ 'redirect' => array(
+ array( 'title' => "Eraq", 'namespace' => 0 ),
+ array( 'title' => "Irak", 'namespace' => 0 ),
+ ),
+ 'incoming_links' => $score
+ );
+
+ $expected = array(
+ array(
+ 'suggest' => array(
+ 'input' => array( 'Iraq', 'Irak' ),
+ 'output' => '1:t:Iraq',
+ 'weight' => $score
+ ),
+ 'suggest-stop' => array(
+ 'input' => array( 'Iraq', 'Irak' ),
+ 'output' => '1:t:Iraq',
+ 'weight' => $score
+ )
+ ),
+ array(
+ 'suggest' => array(
+ 'input' => array( 'Eraq' ),
+ 'output' => '1:r',
+ 'weight' => $redirScore
+ ),
+ 'suggest-stop' => array(
+ 'input' => array( 'Eraq' ),
+ 'output' => '1:r',
+ 'weight' => $redirScore
+ )
+ )
+ );
+ $suggestions = $builder->build( 1, $doc );
+ $this->assertSame( $expected, $suggestions );
+ }
+
public function testUlm() {
$builder = new SuggestBuilder(
SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks', 1 ) );
$score = 10;
--
To view, visit https://gerrit.wikimedia.org/r/235905
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I7ed88ba76746bf0189d8ba70377778122b554403
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: wmf/1.26wmf21
Gerrit-Owner: EBernhardson <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits