jenkins-bot has submitted this change and it was merged.
Change subject: Optionally enable sloppy prefix matching
......................................................................
Optionally enable sloppy prefix matching
This prefix matching has tons of slop: "m p" matches "Main Page".
Bug: 54974
Change-Id: Ib7decd6fdb98e08240507b1248a7f7f2761b52f4
---
M CirrusSearch.php
M includes/CirrusSearchAnalysisConfigBuilder.php
M includes/CirrusSearchMappingConfigBuilder.php
M includes/CirrusSearchSearcher.php
4 files changed, 41 insertions(+), 5 deletions(-)
Approvals:
Chad: Looks good to me, approved
jenkins-bot: Verified
diff --git a/CirrusSearch.php b/CirrusSearch.php
index cd2fba4..0515b3a 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -51,6 +51,15 @@
// set to 1 for some redundancy, if not 2 for more redundancy.
$wgCirrusSearchContentReplicaCount = array( 'content' => 0, 'general' => 0 );
+// Is it ok if the prefix starts on any word in the title or just the first
word?
+// Defaults to false (first word only) because that is the wikipedia behavior
and so
+// what we expect users to expect. Does not affect the prefix: search filter
or
+// url parameter - that always starts with the first word. false -> true will
break
+// prefix searching until an in place reindex is complete. true -> false is
fine
+// any time and you can then go false -> true if you haven't run an in place
reindex
+// since the change.
+$wgCirrusSearchPrefixSearchStartsWithAnyWord = false;
+
// When searching for a phrase how many words not searched for can be in the
phrase
// before it doesn't match. If I search for "like yellow candy" then
phraseSlop of 0
// won't match "like brownish yellow candy" but phraseSlop of 1 will.
diff --git a/includes/CirrusSearchAnalysisConfigBuilder.php
b/includes/CirrusSearchAnalysisConfigBuilder.php
index 7a4bf00..73c359d 100644
--- a/includes/CirrusSearchAnalysisConfigBuilder.php
+++ b/includes/CirrusSearchAnalysisConfigBuilder.php
@@ -69,6 +69,11 @@
'tokenizer' => 'prefix',
'filter' => array( 'lowercase' )
),
+ 'word_prefix' => array(
+ 'type' => 'custom',
+ 'tokenizer' => 'standard',
+ 'filter' => array( 'standard',
'lowercase', 'prefix_ngram_filter' ),
+ ),
'lowercase_keyword' => array(
'type' => 'custom',
'tokenizer' => 'no_splitting',
@@ -85,11 +90,14 @@
'lowercase' => array(
'type' => 'lowercase',
),
-
'aggressive_splitting' => array(
'type' => 'word_delimiter',
'stem_english_possessive' => 'false',
// No need
- )
+ ),
+ 'prefix_ngram_filter' => array(
+ 'type' => 'edgeNGram',
+ 'max_gram' =>
CirrusSearchSearcher::MAX_PREFIX_SEARCH,
+ ),
),
'tokenizer' => array(
'prefix' => array(
@@ -98,7 +106,7 @@
),
'no_splitting' => array( // Just grab the whole
term.
'type' => 'keyword',
- )
+ ),
)
);
}
@@ -108,6 +116,7 @@
*/
private function customize( $config ) {
global $wgCirrusSearchUseAggressiveSplitting;
+
switch ( $this->language ) {
// Please add languages in alphabetical order.
case 'el':
diff --git a/includes/CirrusSearchMappingConfigBuilder.php
b/includes/CirrusSearchMappingConfigBuilder.php
index a62bdc6..2187d75 100644
--- a/includes/CirrusSearchMappingConfigBuilder.php
+++ b/includes/CirrusSearchMappingConfigBuilder.php
@@ -31,9 +31,15 @@
* @return array the mapping config
*/
public function buildConfig() {
+ global $wgCirrusSearchPrefixSearchStartsWithAnyWord;
global $wgCirrusSearchPhraseUseText;
// Note never to set something as type='object' here because
that isn't returned by elasticsearch
// and is inferred anyway.
+
+ $titleExtraAnalyzers = array( 'suggest', 'prefix' );
+ if ( $wgCirrusSearchPrefixSearchStartsWithAnyWord ) {
+ $titleExtraAnalyzers[] = 'word_prefix';
+ }
$textExtraAnalyzers = array();
if ( $wgCirrusSearchPhraseUseText ) {
@@ -42,7 +48,7 @@
return array(
'properties' => array(
- 'title' => $this->buildStringField( 'title',
array( 'suggest', 'prefix' ) ),
+ 'title' => $this->buildStringField( 'title',
$titleExtraAnalyzers ),
'text' => $this->buildStringField( 'text',
$textExtraAnalyzers ),
'category' =>
$this->buildLowercaseKeywordField(),
'heading' => $this->buildStringField( 'heading'
),
diff --git a/includes/CirrusSearchSearcher.php
b/includes/CirrusSearchSearcher.php
index c227915..86f2d93 100644
--- a/includes/CirrusSearchSearcher.php
+++ b/includes/CirrusSearchSearcher.php
@@ -84,6 +84,7 @@
* @param array(string) of titles
*/
public function prefixSearch( $search ) {
+ global $wgCirrusSearchPrefixSearchStartsWithAnyWord;
$requestLength = strlen( $search );
if ( $requestLength > self::MAX_PREFIX_SEARCH ) {
throw new UsageException( 'Prefix search requset was
longer longer than the maximum allowed length.' .
@@ -91,7 +92,18 @@
}
wfDebugLog( 'CirrusSearch', "Prefix searching: $search" );
- $this->filters[] = $this->buildPrefixFilter( $search );
+ if ( $wgCirrusSearchPrefixSearchStartsWithAnyWord ) {
+ $match = new \Elastica\Query\Match();
+ $match->setField( 'title.word_prefix', array(
+ 'query' => $search,
+ 'analyzer' => 'plain',
+ 'operator' => 'and',
+ ) );
+ $this->filters[] = new \Elastica\Filter\Query( $match );
+ } else {
+ $this->filters[] = $this->buildPrefixFilter( $search );
+ }
+
$this->description = "prefix search for '$search'";
$this->buildFullTextResults = false;
return $this->search();
--
To view, visit https://gerrit.wikimedia.org/r/90150
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: Ib7decd6fdb98e08240507b1248a7f7f2761b52f4
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits