Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/90150
Change subject: Optionally enable sloppy prefix matching
......................................................................
Optionally enable sloppy prefix matching
This prefix matching has tons of slop: "m p" matches "Main Page".
Bug: 54974
Change-Id: Ib7decd6fdb98e08240507b1248a7f7f2761b52f4
---
M CirrusSearch.php
M includes/CirrusSearchAnalysisConfigBuilder.php
M includes/CirrusSearchMappingConfigBuilder.php
M includes/CirrusSearchSearcher.php
4 files changed, 42 insertions(+), 5 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/50/90150/1
diff --git a/CirrusSearch.php b/CirrusSearch.php
index c6c1fcf..d287bba 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -51,6 +51,15 @@
// set to 1 for some redundancy, if not 2 for more redundancy.
$wgCirrusSearchContentReplicaCount = array( 'content' => 0, 'general' => 0 );
+// Is it ok if the prefix starts on any word in the title or just the first
word?
+// Defaults to false (first word only) because that is the Wikipedia behavior
and so
+// what we expect users to expect. Does not affect the prefix: search filter
or
+// url parameter - that always starts with the first word. false -> true will
break
+// prefix searching until an in place reindex is complete. true -> false is
fine
+// any time and you can then go false -> true if you haven't run an in place
reindex
+// since the change.
+$wgCirrusSearchPrefixSearchStartsWithAnyWord = false;
+
// When searching for a phrase how many words not searched for can be in the
phrase
// before it doesn't match. If I search for "like yellow candy" then
phraseSlop of 0
// won't match "like brownish yellow candy" but phraseSlop of 1 will.
diff --git a/includes/CirrusSearchAnalysisConfigBuilder.php
b/includes/CirrusSearchAnalysisConfigBuilder.php
index 7a4bf00..73c359d 100644
--- a/includes/CirrusSearchAnalysisConfigBuilder.php
+++ b/includes/CirrusSearchAnalysisConfigBuilder.php
@@ -69,6 +69,11 @@
'tokenizer' => 'prefix',
'filter' => array( 'lowercase' )
),
+ 'word_prefix' => array(
+ 'type' => 'custom',
+ 'tokenizer' => 'standard',
+ 'filter' => array( 'standard',
'lowercase', 'prefix_ngram_filter' ),
+ ),
'lowercase_keyword' => array(
'type' => 'custom',
'tokenizer' => 'no_splitting',
@@ -85,11 +90,14 @@
'lowercase' => array(
'type' => 'lowercase',
),
-
'aggressive_splitting' => array(
'type' => 'word_delimiter',
'stem_english_possessive' => 'false',
// No need
- )
+ ),
+ 'prefix_ngram_filter' => array(
+ 'type' => 'edgeNGram',
+ 'max_gram' =>
CirrusSearchSearcher::MAX_PREFIX_SEARCH,
+ ),
),
'tokenizer' => array(
'prefix' => array(
@@ -98,7 +106,7 @@
),
'no_splitting' => array( // Just grab the whole
term.
'type' => 'keyword',
- )
+ ),
)
);
}
@@ -108,6 +116,7 @@
*/
private function customize( $config ) {
global $wgCirrusSearchUseAggressiveSplitting;
+
switch ( $this->language ) {
// Please add languages in alphabetical order.
case 'el':
diff --git a/includes/CirrusSearchMappingConfigBuilder.php
b/includes/CirrusSearchMappingConfigBuilder.php
index 8bcb1f4..1f72c9d 100644
--- a/includes/CirrusSearchMappingConfigBuilder.php
+++ b/includes/CirrusSearchMappingConfigBuilder.php
@@ -31,11 +31,18 @@
* @return array the mapping config
*/
public function buildConfig() {
+ global $wgCirrusSearchPrefixSearchStartsWithAnyWord;
// Note never to set something as type='object' here because
that isn't returned by elasticsearch
// and is infered anyway.
+
+ $titleExtraAnalyzers = array( 'suggest', 'prefix' );
+ if ( $wgCirrusSearchPrefixSearchStartsWithAnyWord ) {
+ $titleExtraAnalyzers[] = 'word_prefix';
+ }
+
return array(
'properties' => array(
- 'title' => $this->buildStringField( 'title',
array( 'suggest', 'prefix' ) ),
+ 'title' => $this->buildStringField( 'title',
$titleExtraAnalyzers ),
'text' => $this->buildStringField( 'text',
array( 'suggest' ) ),
'category' =>
$this->buildLowercaseKeywordField(),
'heading' => $this->buildStringField( 'heading'
),
diff --git a/includes/CirrusSearchSearcher.php
b/includes/CirrusSearchSearcher.php
index 89606c5..72c9040 100644
--- a/includes/CirrusSearchSearcher.php
+++ b/includes/CirrusSearchSearcher.php
@@ -83,6 +83,7 @@
* @param array(string) of titles
*/
public function prefixSearch( $search ) {
+ global $wgCirrusSearchPrefixSearchStartsWithAnyWord;
$requestLength = strlen( $search );
if ( $requestLength > self::MAX_PREFIX_SEARCH ) {
throw new UsageException( 'Prefix search requset was
longer longer than the maximum allowed length.' .
@@ -90,7 +91,18 @@
}
wfDebugLog( 'CirrusSearch', "Prefix searching: $search" );
- $this->filters[] = $this->buildPrefixFilter( $search );
+ if ( $wgCirrusSearchPrefixSearchStartsWithAnyWord ) {
+ $match = new \Elastica\Query\Match();
+ $match->setField( 'title.word_prefix', array(
+ 'query' => $search,
+ 'analyzer' => 'plain',
+ 'operator' => 'and',
+ ) );
+ $this->filters[] = new \Elastica\Filter\Query( $match );
+ } else {
+ $this->filters[] = $this->buildPrefixFilter( $search );
+ }
+
$this->description = "prefix search for '$search'";
$this->buildFullTextResults = false;
return $this->search();
--
To view, visit https://gerrit.wikimedia.org/r/90150
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib7decd6fdb98e08240507b1248a7f7f2761b52f4
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits