jenkins-bot has submitted this change and it was merged.
Change subject: Prefix search always squashes accents
......................................................................
Prefix search always squashes accents
Enable accent squashing for all prefix searches. The argument is that
even if you speak a language that prefers accents not be squashed, you
might be on a keyboard without accents. We're not taking this argument
further by adding accent squashing to all searches at this point (or
maybe ever).
Bug: 67521
Change-Id: I1e9dc357b861edf1ff3c959e3ac3b287292c2bea
---
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/MappingConfigBuilder.php
M includes/Search/ResultsType.php
M includes/Searcher.php
M tests/browser/features/prefix_search.feature
M tests/browser/features/relevancy.feature
M tests/browser/features/step_definitions/search_steps.rb
M tests/browser/features/step_definitions/simple_search_steps.rb
M tests/browser/features/support/hooks.rb
M tests/browser/features/support/pages/search_page.rb
10 files changed, 122 insertions(+), 38 deletions(-)
Approvals:
Chad: Looks good to me, approved
jenkins-bot: Verified
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php
b/includes/Maintenance/AnalysisConfigBuilder.php
index 6474627..cdc5ed4 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -30,7 +30,7 @@
* and change the minor version when it changes but isn't
* incompatible
*/
- const VERSION = '0.9';
+ const VERSION = '0.10';
/**
* Language code we're building analysis for
@@ -112,10 +112,22 @@
'filter' => array( 'lowercase' ),
'char_filter' => array(
'near_space_flattener' ),
),
+ 'near_match_asciifolding' => array(
+ 'type' => 'custom',
+ 'tokenizer' => 'no_splitting',
+ 'filter' => array( 'lowercase',
'asciifolding' ),
+ 'char_filter' => array(
'near_space_flattener' ),
+ ),
'prefix' => array(
'type' => 'custom',
'tokenizer' => 'prefix',
'filter' => array( 'lowercase' ),
+ 'char_filter' => array(
'near_space_flattener' ),
+ ),
+ 'prefix_asciifolding' => array(
+ 'type' => 'custom',
+ 'tokenizer' => 'prefix',
+ 'filter' => array( 'lowercase',
'asciifolding' ),
'char_filter' => array(
'near_space_flattener' ),
),
'word_prefix' => array(
@@ -157,6 +169,10 @@
'max_gram' =>
Searcher::MAX_TITLE_SEARCH,
),
'asciifolding' => array(
+ 'type' => 'asciifolding',
+ 'preserve_original' => false
+ ),
+ 'asciifolding_preserve' => array(
'type' => 'asciifolding',
'preserve_original' => true
),
@@ -245,15 +261,13 @@
$filters[] = 'stop';
$filters[] = 'kstem';
$filters[] = 'custom_stem';
- $filters[] = 'asciifolding';
+ $filters[] = 'asciifolding_preserve';
$config[ 'analyzer' ][ 'text' ][ 'filter' ] = $filters;
- // Add asciifolding to the the plain analyzer as well
(but not plain_search)
- $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] =
'asciifolding';
- // Add asciifolding to the prefix queries and
incategory filters
- $config[ 'analyzer' ][ 'prefix' ][ 'filter' ][] =
'asciifolding';
- $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter'
][] = 'asciifolding';
- $config[ 'analyzer' ][ 'near_match' ][ 'filter' ][] =
'asciifolding';
+ // Add asciifolding_preserve to the the plain analyzer
as well (but not plain_search)
+ $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] =
'asciifolding_preserve';
+ // Add asciifolding_preserve filters
+ $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter'
][] = 'asciifolding_preserve';
// In English text_search is just a copy of text
$config[ 'analyzer' ][ 'text_search' ] = $config[
'analyzer' ][ 'text' ];
@@ -268,10 +282,8 @@
);
break;
case 'french':
- // Add asciifolding to the prefix queries and
incategory filters
- $config[ 'analyzer' ][ 'prefix' ][ 'filter' ][] =
'asciifolding';
- $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter'
][] = 'asciifolding';
- $config[ 'analyzer' ][ 'near_match' ][ 'filter' ][] =
'asciifolding';
+ // Add asciifolding_preserve to filters
+ $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter'
][] = 'asciifolding_preserve';
break;
case 'italian':
$config[ 'filter' ][ 'italian_elision' ] = array(
@@ -324,12 +336,10 @@
$filters[] = 'asciifolding';
$config[ 'analyzer' ][ 'text' ][ 'filter' ] = $filters;
- // Add asciifolding to the the plain analyzer as well
(but not plain_search)
- $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] =
'asciifolding';
- // Add asciifolding to the prefix queries and
incategory filters
- $config[ 'analyzer' ][ 'prefix' ][ 'filter' ][] =
'asciifolding';
- $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter'
][] = 'asciifolding';
- $config[ 'analyzer' ][ 'near_match' ][ 'filter' ][] =
'asciifolding';
+ // Add asciifolding_preserve to the the plain analyzer
as well (but not plain_search)
+ $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] =
'asciifolding_preserve';
+ // Add asciifolding_preserve to filters
+ $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter'
][] = 'asciifolding_preserve';
// In Italian text_search is just a copy of text
$config[ 'analyzer' ][ 'text_search' ] = $config[
'analyzer' ][ 'text' ];
diff --git a/includes/Maintenance/MappingConfigBuilder.php
b/includes/Maintenance/MappingConfigBuilder.php
index 067632a..6696783 100644
--- a/includes/Maintenance/MappingConfigBuilder.php
+++ b/includes/Maintenance/MappingConfigBuilder.php
@@ -44,7 +44,7 @@
* and change the minor version when it changes but isn't
* incompatible
*/
- const VERSION = '1.7';
+ const VERSION = '1.8';
/**
* Whether to allow prefix searches to match on any word
@@ -90,7 +90,9 @@
$titleExtraAnalyzers = array(
$suggestExtra,
array( 'index_analyzer' => 'prefix', 'search_analyzer'
=> 'near_match', 'index_options' => 'docs' ),
+ array( 'index_analyzer' => 'prefix_asciifolding',
'search_analyzer' => 'near_match_asciifolding', 'index_options' => 'docs' ),
array( 'analyzer' => 'near_match', 'index_options' =>
'docs' ),
+ array( 'analyzer' => 'near_match_asciifolding',
'index_options' => 'docs' ),
array( 'analyzer' => 'keyword', 'index_options' =>
'docs' ),
);
if ( $this->prefixSearchStartsWithAnyWord ) {
@@ -195,6 +197,15 @@
'index_options' => 'freqs',
'position_offset_gap' =>
self::POSITION_OFFSET_GAP,
'norms' => array( 'enabled' => false ),
+ 'fields' => array(
+ 'asciifolding' => array(
+ 'type' => 'string',
+ 'analyzer' =>
'near_match_asciifolding',
+ 'index_options' => 'freqs',
+ 'position_offset_gap' =>
self::POSITION_OFFSET_GAP,
+ 'norms' => array( 'enabled' =>
false ),
+ ),
+ ),
);
$nearMatchFields = array(
'title' => $wgCirrusSearchWeights[ 'title' ],
diff --git a/includes/Search/ResultsType.php b/includes/Search/ResultsType.php
index 31655ec..fea4d6a 100644
--- a/includes/Search/ResultsType.php
+++ b/includes/Search/ResultsType.php
@@ -125,10 +125,13 @@
'post_tags' => array( Searcher::HIGHLIGHT_POST ),
'fields' => array(
"title.$this->matchedAnalyzer" => $entireValue,
+ "title.{$this->matchedAnalyzer}_asciifolding"
=> $entireValue,
"redirect.title.$this->matchedAnalyzer" =>
$manyValues,
- )
+
"redirect.title.{$this->matchedAnalyzer}_asciifolding" => $manyValues,
+ ),
);
}
+
/**
* Convert the results to titles.
* @return array with optional keys:
@@ -148,9 +151,20 @@
// though.
if ( isset( $highlights[ "title.$this->matchedAnalyzer"
] ) ) {
$resultForTitle[ 'titleMatch' ] = $title;
+ } else if ( isset( $highlights[
"title.{$this->matchedAnalyzer}_asciifolding" ] ) ) {
+ $resultForTitle[ 'titleMatch' ] = $title;
+ }
+ $redirectHighlights = array();
+
+ if ( isset( $highlights[
"redirect.title.$this->matchedAnalyzer" ] ) ) {
+ $redirectHighlights = $highlights[
"redirect.title.$this->matchedAnalyzer" ];
}
if ( isset( $highlights[
"redirect.title.$this->matchedAnalyzer" ] ) ) {
- foreach ( $highlights[
"redirect.title.$this->matchedAnalyzer" ] as $redirectTitle ) {
+ $redirectHighlights = array_merge(
$redirectHighlights,
+ $highlights[
"redirect.title.{$this->matchedAnalyzer}_asciifolding" ] );
+ }
+ if ( count( $redirectHighlights ) !== 0 ) {
+ foreach ( $redirectHighlights as $redirectTitle
) {
// The match was against a redirect so
we should replace the $title with one that
// represents the redirect.
// The first step is to strip the
actual highlighting from the title.
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 603bc3f..a170f9f 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -253,17 +253,19 @@
// from the default query we make so we feed it exactly the
right query to highlight.
$this->highlightQuery = new \Elastica\Query\MultiMatch();
$this->highlightQuery->setQuery( $search );
- $this->highlightQuery->setFields( array( 'title.near_match',
'redirect.title.near_match' ) );
+ $this->highlightQuery->setFields( array(
+ 'title.near_match', 'redirect.title.near_match',
+ 'title.near_match_asciifolding',
'redirect.title.near_match_asciifolding',
+ ) );
if ( $wgCirrusSearchAllFields[ 'use' ] ) {
// Inseat of using the highlight query we need to make
one like it that uses the all_near_match field.
$allQuery = new \Elastica\Query\MultiMatch();
$allQuery->setQuery( $search );
- $allQuery->setFields( array( 'all_near_match' ) );
+ $allQuery->setFields( array( 'all_near_match',
'all_near_match.asciifolding' ) );
$this->filters[] = new \Elastica\Filter\Query(
$allQuery );
} else {
$this->filters[] = new \Elastica\Filter\Query(
$this->highlightQuery );
}
- $this->boostLinks = ''; // No boost
return $this->search( 'near_match', $search );
}
@@ -291,17 +293,15 @@
} else {
// Elasticsearch seems to have trouble extracting the
proper terms to highlight
// from the default query we make so we feed it exactly
the right query to highlight.
- $this->highlightQuery = new
\Elastica\Query\MultiMatch();
- $this->highlightQuery->setQuery( $search );
- $this->highlightQuery->setFields( array(
'title.prefix', 'redirect.title.prefix' ) );
- $this->filters[] = new \Elastica\Filter\Query(
$this->highlightQuery );
+ $this->query = new \Elastica\Query\MultiMatch();
+ $this->query->setQuery( $search );
+ $this->query->setFields( array(
+ 'title.prefix^10', 'redirect.title.prefix^10',
+ 'title.prefix_asciifolding',
'redirect.title.prefix_asciifolding'
+ ) );
}
$this->boostTemplates = self::getDefaultBoostTemplates();
- // If there aren't any boost templates then we can use a sort
for ordering
- // rather than a boost.
- if ( count( $this->boostTemplates ) === 0 ) {
- $this->sort = 'incoming_links_desc';
- }
+ $this->boostLinks = true;
return $this->search( 'prefix', $search );
}
diff --git a/tests/browser/features/prefix_search.feature
b/tests/browser/features/prefix_search.feature
index 74fd968..d4a7058 100644
--- a/tests/browser/features/prefix_search.feature
+++ b/tests/browser/features/prefix_search.feature
@@ -67,3 +67,16 @@
| Africa | África | África
|
| AlphaBeta | AlphaBeta | AlphaBeta
|
| ÁlphaBeta | AlphaBeta | AlphaBeta
|
+
+ @accent_squashing
+ Scenario Outline: Search suggestions with accents
+ When I type <term> into the search box
+ Then suggestions should appear
+ And <first_suggestion> is the first suggestion
+ And <second_suggestion> is the second suggestion
+ Examples:
+ | term | first_suggestion | second_suggestion |
+ | Áccent Sorting | Áccent Sorting | Accent Sorting |
+ | áccent Sorting | Áccent Sorting | Accent Sorting |
+ | Accent Sorting | Accent Sorting | Áccent Sorting |
+ | accent Sorting | Accent Sorting | Áccent Sorting |
diff --git a/tests/browser/features/relevancy.feature
b/tests/browser/features/relevancy.feature
index 9870b7e..b1db81e 100644
--- a/tests/browser/features/relevancy.feature
+++ b/tests/browser/features/relevancy.feature
@@ -62,4 +62,18 @@
Scenario: Redirects count as incoming links
When I search for Relevancyredirecttest
Then Relevancyredirecttest Larger is the first search result
- And Relevancyredirecttest Smaller is the second search result
\ No newline at end of file
+ And Relevancyredirecttest Smaller is the second search result
+
+ Scenario: Results are sorted based on how close the match is
+ When I search for Relevancyclosetest Foô
+ And I disable incoming links in the weighting
+ Then Relevancyclosetest Foô is the first search result
+ And Relevancyclosetest Foo is the second search result
+ And Foo Relevancyclosetest is the third search result
+
+ Scenario: Results are sorted based on how close the match is (backwards this
time)
+ When I search for Relevancyclosetest Foo
+ And I disable incoming links in the weighting
+ Then Relevancyclosetest Foo is the first search result
+ And Relevancyclosetest Foô is the second search result
+ And Foo Relevancyclosetest is the third search result
diff --git a/tests/browser/features/step_definitions/search_steps.rb
b/tests/browser/features/step_definitions/search_steps.rb
index 9254641..8d2cb25 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -105,9 +105,16 @@
end
Then(/^(.+) is the first suggestion$/) do |title|
if title == "none"
- on(SearchPage).one_result_element.should_not be_visible
+ on(SearchPage).first_result_element.should_not be_visible
else
- on(SearchPage).one_result.should == title
+ on(SearchPage).first_result.should == title
+ end
+end
+Then(/^(.+) is the second suggestion$/) do |title|
+ if title == "none"
+ on(SearchPage).second_result_element.should_not be_visible
+ else
+ on(SearchPage).second_result.should == title
end
end
Then(/^(.+) is not in the suggestions$/) do |title|
diff --git a/tests/browser/features/step_definitions/simple_search_steps.rb
b/tests/browser/features/step_definitions/simple_search_steps.rb
index 2a4c2e7..8c869c2 100644
--- a/tests/browser/features/step_definitions/simple_search_steps.rb
+++ b/tests/browser/features/step_definitions/simple_search_steps.rb
@@ -21,7 +21,7 @@
@browser.url.should match Regexp.escape("&title=Special%3ASearch")
end
Then(/^(.+) should be the first result$/) do |page_name|
- on(SearchPage).one_result.should == page_name
+ on(SearchPage).first_result.should == page_name
end
Then(/^the page I arrive on has title (.+)$/) do |title|
diff --git a/tests/browser/features/support/hooks.rb
b/tests/browser/features/support/hooks.rb
index 2de79e2..c54b5a1 100644
--- a/tests/browser/features/support/hooks.rb
+++ b/tests/browser/features/support/hooks.rb
@@ -474,6 +474,9 @@
And a page named Relevancyredirecttest Larger/A exists with contents
[[Relevancyredirecttest Larger]]
And a page named Relevancyredirecttest Larger/B exists with contents
[[Relevancyredirecttest Larger/Redirect]]
And a page named Relevancyredirecttest Larger/C exists with contents
[[Relevancyredirecttest Larger/Redirect]]
+ And a page named Relevancyclosetest Foô exists
+ And a page named Relevancyclosetest Foo exists
+ And a page named Foo Relevancyclosetest exists
)
end
relevancy = true
@@ -564,3 +567,14 @@
removed_text = true
end
end
+
+accent_squashing = false
+Before("@accent_squashing") do
+ unless accent_squashing
+ steps %(
+ Given a page named Áccent Sorting exists
+ And a page named Accent Sorting exists
+ )
+ accent_squashing = true
+ end
+end
diff --git a/tests/browser/features/support/pages/search_page.rb
b/tests/browser/features/support/pages/search_page.rb
index 429124d..1615e99 100644
--- a/tests/browser/features/support/pages/search_page.rb
+++ b/tests/browser/features/support/pages/search_page.rb
@@ -6,6 +6,7 @@
text_field(:search_input, id: "searchInput")
div(:search_results, class: "suggestions-results")
div(:search_special, class: "suggestions-special")
- div(:one_result, class: "suggestions-result")
+ div(:first_result, class: "suggestions-result", index: 0)
+ div(:second_result, class: "suggestions-result", index: 1)
links(:all_results, class: "suggestions-result") { |page|
page.search_results_element.link_elements }
end
--
To view, visit https://gerrit.wikimedia.org/r/168071
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I1e9dc357b861edf1ff3c959e3ac3b287292c2bea
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits