jenkins-bot has submitted this change and it was merged.

Change subject: Prefix search always squashes accents
......................................................................


Prefix search always squashes accents

Enable accent squashing for all prefix searches.  The argument is that,
even if you speak a language that prefers accents not be squashed, you
might be on a keyboard without accents.  We're not taking this argument
further by adding accent squashing to all searches at this point (or
maybe ever).

Bug: 67521
Change-Id: I1e9dc357b861edf1ff3c959e3ac3b287292c2bea
---
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/MappingConfigBuilder.php
M includes/Search/ResultsType.php
M includes/Searcher.php
M tests/browser/features/prefix_search.feature
M tests/browser/features/relevancy.feature
M tests/browser/features/step_definitions/search_steps.rb
M tests/browser/features/step_definitions/simple_search_steps.rb
M tests/browser/features/support/hooks.rb
M tests/browser/features/support/pages/search_page.rb
10 files changed, 122 insertions(+), 38 deletions(-)

Approvals:
  Chad: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 6474627..cdc5ed4 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -30,7 +30,7 @@
         * and change the minor version when it changes but isn't
         * incompatible
         */
-       const VERSION = '0.9';
+       const VERSION = '0.10';
 
        /**
         * Language code we're building analysis for
@@ -112,10 +112,22 @@
                                        'filter' => array( 'lowercase' ),
                                        'char_filter' => array( 
'near_space_flattener' ),
                                ),
+                               'near_match_asciifolding' => array(
+                                       'type' => 'custom',
+                                       'tokenizer' => 'no_splitting',
+                                       'filter' => array( 'lowercase', 
'asciifolding' ),
+                                       'char_filter' => array( 
'near_space_flattener' ),
+                               ),
                                'prefix' => array(
                                        'type' => 'custom',
                                        'tokenizer' => 'prefix',
                                        'filter' => array( 'lowercase' ),
+                                       'char_filter' => array( 
'near_space_flattener' ),
+                               ),
+                               'prefix_asciifolding' => array(
+                                       'type' => 'custom',
+                                       'tokenizer' => 'prefix',
+                                       'filter' => array( 'lowercase', 
'asciifolding' ),
                                        'char_filter' => array( 
'near_space_flattener' ),
                                ),
                                'word_prefix' => array(
@@ -157,6 +169,10 @@
                                        'max_gram' => 
Searcher::MAX_TITLE_SEARCH,
                                ),
                                'asciifolding' => array(
+                                       'type' => 'asciifolding',
+                                       'preserve_original' => false
+                               ),
+                               'asciifolding_preserve' => array(
                                        'type' => 'asciifolding',
                                        'preserve_original' => true
                                ),
@@ -245,15 +261,13 @@
                        $filters[] = 'stop';
                        $filters[] = 'kstem';
                        $filters[] = 'custom_stem';
-                       $filters[] = 'asciifolding';
+                       $filters[] = 'asciifolding_preserve';
                        $config[ 'analyzer' ][ 'text' ][ 'filter' ] = $filters;
 
-                       // Add asciifolding to the the plain analyzer as well 
(but not plain_search)
-                       $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 
'asciifolding';
-                       // Add asciifolding to the prefix queries and 
incategory filters
-                       $config[ 'analyzer' ][ 'prefix' ][ 'filter' ][] = 
'asciifolding';
-                       $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' 
][] = 'asciifolding';
-                       $config[ 'analyzer' ][ 'near_match' ][ 'filter' ][] = 
'asciifolding';
+                       // Add asciifolding_preserve to the the plain analyzer 
as well (but not plain_search)
+                       $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 
'asciifolding_preserve';
+                       // Add asciifolding_preserve filters
+                       $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' 
][] = 'asciifolding_preserve';
 
                        // In English text_search is just a copy of text
                        $config[ 'analyzer' ][ 'text_search' ] = $config[ 
'analyzer' ][ 'text' ];
@@ -268,10 +282,8 @@
                        );
                        break;
                case 'french':
-                       // Add asciifolding to the prefix queries and 
incategory filters
-                       $config[ 'analyzer' ][ 'prefix' ][ 'filter' ][] = 
'asciifolding';
-                       $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' 
][] = 'asciifolding';
-                       $config[ 'analyzer' ][ 'near_match' ][ 'filter' ][] = 
'asciifolding';
+                       // Add asciifolding_preserve to filters
+                       $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' 
][] = 'asciifolding_preserve';
                        break;
                case 'italian':
                        $config[ 'filter' ][ 'italian_elision' ] = array(
@@ -324,12 +336,10 @@
                        $filters[] = 'asciifolding';
                        $config[ 'analyzer' ][ 'text' ][ 'filter' ] = $filters;
 
-                       // Add asciifolding to the the plain analyzer as well 
(but not plain_search)
-                       $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 
'asciifolding';
-                       // Add asciifolding to the prefix queries and 
incategory filters
-                       $config[ 'analyzer' ][ 'prefix' ][ 'filter' ][] = 
'asciifolding';
-                       $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' 
][] = 'asciifolding';
-                       $config[ 'analyzer' ][ 'near_match' ][ 'filter' ][] = 
'asciifolding';
+                       // Add asciifolding_preserve to the the plain analyzer 
as well (but not plain_search)
+                       $config[ 'analyzer' ][ 'plain' ][ 'filter' ][] = 
'asciifolding_preserve';
+                       // Add asciifolding_preserve to filters
+                       $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' 
][] = 'asciifolding_preserve';
 
                        // In Italian text_search is just a copy of text
                        $config[ 'analyzer' ][ 'text_search' ] = $config[ 
'analyzer' ][ 'text' ];
diff --git a/includes/Maintenance/MappingConfigBuilder.php 
b/includes/Maintenance/MappingConfigBuilder.php
index 067632a..6696783 100644
--- a/includes/Maintenance/MappingConfigBuilder.php
+++ b/includes/Maintenance/MappingConfigBuilder.php
@@ -44,7 +44,7 @@
         * and change the minor version when it changes but isn't
         * incompatible
         */
-       const VERSION = '1.7';
+       const VERSION = '1.8';
 
        /**
         * Whether to allow prefix searches to match on any word
@@ -90,7 +90,9 @@
                $titleExtraAnalyzers = array(
                        $suggestExtra,
                        array( 'index_analyzer' => 'prefix', 'search_analyzer' 
=> 'near_match', 'index_options' => 'docs' ),
+                       array( 'index_analyzer' => 'prefix_asciifolding', 
'search_analyzer' => 'near_match_asciifolding', 'index_options' => 'docs' ),
                        array( 'analyzer' => 'near_match', 'index_options' => 
'docs' ),
+                       array( 'analyzer' => 'near_match_asciifolding', 
'index_options' => 'docs' ),
                        array( 'analyzer' => 'keyword', 'index_options' => 
'docs' ),
                );
                if ( $this->prefixSearchStartsWithAnyWord ) {
@@ -195,6 +197,15 @@
                                'index_options' => 'freqs',
                                'position_offset_gap' => 
self::POSITION_OFFSET_GAP,
                                'norms' => array( 'enabled' => false ),
+                               'fields' => array(
+                                       'asciifolding' => array(
+                                               'type' => 'string',
+                                               'analyzer' => 
'near_match_asciifolding',
+                                               'index_options' => 'freqs',
+                                               'position_offset_gap' => 
self::POSITION_OFFSET_GAP,
+                                               'norms' => array( 'enabled' => 
false ),
+                                       ),
+                               ),
                        );
                        $nearMatchFields = array(
                                'title' => $wgCirrusSearchWeights[ 'title' ],
diff --git a/includes/Search/ResultsType.php b/includes/Search/ResultsType.php
index 31655ec..fea4d6a 100644
--- a/includes/Search/ResultsType.php
+++ b/includes/Search/ResultsType.php
@@ -125,10 +125,13 @@
                        'post_tags' => array( Searcher::HIGHLIGHT_POST ),
                        'fields' => array(
                                "title.$this->matchedAnalyzer" => $entireValue,
+                               "title.{$this->matchedAnalyzer}_asciifolding" 
=> $entireValue,
                                "redirect.title.$this->matchedAnalyzer" => 
$manyValues,
-                       )
+                               
"redirect.title.{$this->matchedAnalyzer}_asciifolding" => $manyValues,
+                       ),
                );
        }
+
        /**
         * Convert the results to titles.
         * @return array with optional keys:
@@ -148,9 +151,20 @@
                        // though.
                        if ( isset( $highlights[ "title.$this->matchedAnalyzer" 
] ) ) {
                                $resultForTitle[ 'titleMatch' ] = $title;
+                       } else if ( isset( $highlights[ 
"title.{$this->matchedAnalyzer}_asciifolding" ] ) ) {
+                               $resultForTitle[ 'titleMatch' ] = $title;
+                       }
+                       $redirectHighlights = array();
+
+                       if ( isset( $highlights[ 
"redirect.title.$this->matchedAnalyzer" ] ) ) {
+                               $redirectHighlights = $highlights[ 
"redirect.title.$this->matchedAnalyzer" ];
                        }
                        if ( isset( $highlights[ 
"redirect.title.$this->matchedAnalyzer" ] ) ) {
-                               foreach ( $highlights[ 
"redirect.title.$this->matchedAnalyzer" ] as $redirectTitle ) {
+                               $redirectHighlights = array_merge( 
$redirectHighlights,
+                                       $highlights[ 
"redirect.title.{$this->matchedAnalyzer}_asciifolding" ] );
+                       }
+                       if ( count( $redirectHighlights ) !== 0 ) {
+                               foreach ( $redirectHighlights as $redirectTitle 
) {
                                        // The match was against a redirect so 
we should replace the $title with one that
                                        // represents the redirect.
                                        // The first step is to strip the 
actual highlighting from the title.
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 603bc3f..a170f9f 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -253,17 +253,19 @@
                // from the default query we make so we feed it exactly the 
right query to highlight.
                $this->highlightQuery = new \Elastica\Query\MultiMatch();
                $this->highlightQuery->setQuery( $search );
-               $this->highlightQuery->setFields( array( 'title.near_match', 
'redirect.title.near_match' ) );
+               $this->highlightQuery->setFields( array(
+                       'title.near_match', 'redirect.title.near_match',
+                       'title.near_match_asciifolding', 
'redirect.title.near_match_asciifolding',
+               ) );
                if ( $wgCirrusSearchAllFields[ 'use' ] ) {
                        // Inseat of using the highlight query we need to make 
one like it that uses the all_near_match field.
                        $allQuery = new \Elastica\Query\MultiMatch();
                        $allQuery->setQuery( $search );
-                       $allQuery->setFields( array( 'all_near_match' ) );
+                       $allQuery->setFields( array( 'all_near_match', 
'all_near_match.asciifolding' ) );
                        $this->filters[] = new \Elastica\Filter\Query( 
$allQuery );
                } else {
                        $this->filters[] = new \Elastica\Filter\Query( 
$this->highlightQuery );
                }
-               $this->boostLinks = ''; // No boost
 
                return $this->search( 'near_match', $search );
        }
@@ -291,17 +293,15 @@
                } else {
                        // Elasticsearch seems to have trouble extracting the 
proper terms to highlight
                        // from the default query we make so we feed it exactly 
the right query to highlight.
-                       $this->highlightQuery = new 
\Elastica\Query\MultiMatch();
-                       $this->highlightQuery->setQuery( $search );
-                       $this->highlightQuery->setFields( array( 
'title.prefix', 'redirect.title.prefix' ) );
-                       $this->filters[] = new \Elastica\Filter\Query( 
$this->highlightQuery );
+                       $this->query = new \Elastica\Query\MultiMatch();
+                       $this->query->setQuery( $search );
+                       $this->query->setFields( array(
+                               'title.prefix^10', 'redirect.title.prefix^10',
+                               'title.prefix_asciifolding', 
'redirect.title.prefix_asciifolding'
+                       ) );
                }
                $this->boostTemplates = self::getDefaultBoostTemplates();
-               // If there aren't any boost templates then we can use a sort 
for ordering
-               // rather than a boost.
-               if ( count( $this->boostTemplates ) === 0 ) {
-                       $this->sort = 'incoming_links_desc';
-               }
+               $this->boostLinks = true;
 
                return $this->search( 'prefix', $search );
        }
diff --git a/tests/browser/features/prefix_search.feature 
b/tests/browser/features/prefix_search.feature
index 74fd968..d4a7058 100644
--- a/tests/browser/features/prefix_search.feature
+++ b/tests/browser/features/prefix_search.feature
@@ -67,3 +67,16 @@
     | Africa                 | África                 | África                 
|
     | AlphaBeta              | AlphaBeta              | AlphaBeta              
|
     | ÁlphaBeta              | AlphaBeta              | AlphaBeta              
|
+
+  @accent_squashing
+  Scenario Outline: Search suggestions with accents
+    When I type <term> into the search box
+    Then suggestions should appear
+      And <first_suggestion> is the first suggestion
+      And <second_suggestion> is the second suggestion
+  Examples:
+    |      term      | first_suggestion | second_suggestion |
+    | Áccent Sorting | Áccent Sorting   | Accent Sorting    |
+    | áccent Sorting | Áccent Sorting   | Accent Sorting    |
+    | Accent Sorting | Accent Sorting   | Áccent Sorting    |
+    | accent Sorting | Accent Sorting   | Áccent Sorting    |
diff --git a/tests/browser/features/relevancy.feature 
b/tests/browser/features/relevancy.feature
index 9870b7e..b1db81e 100644
--- a/tests/browser/features/relevancy.feature
+++ b/tests/browser/features/relevancy.feature
@@ -62,4 +62,18 @@
   Scenario: Redirects count as incoming links
     When I search for Relevancyredirecttest
     Then Relevancyredirecttest Larger is the first search result
-      And Relevancyredirecttest Smaller is the second search result
\ No newline at end of file
+      And Relevancyredirecttest Smaller is the second search result
+
+  Scenario: Results are sorted based on how close the match is
+    When I search for Relevancyclosetest Foô
+      And I disable incoming links in the weighting
+    Then Relevancyclosetest Foô is the first search result
+      And Relevancyclosetest Foo is the second search result
+      And Foo Relevancyclosetest is the third search result
+
+  Scenario: Results are sorted based on how close the match is (backwards this 
time)
+    When I search for Relevancyclosetest Foo
+      And I disable incoming links in the weighting
+    Then Relevancyclosetest Foo is the first search result
+      And Relevancyclosetest Foô is the second search result
+      And Foo Relevancyclosetest is the third search result
diff --git a/tests/browser/features/step_definitions/search_steps.rb 
b/tests/browser/features/step_definitions/search_steps.rb
index 9254641..8d2cb25 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -105,9 +105,16 @@
 end
 Then(/^(.+) is the first suggestion$/) do |title|
   if title == "none"
-    on(SearchPage).one_result_element.should_not be_visible
+    on(SearchPage).first_result_element.should_not be_visible
   else
-    on(SearchPage).one_result.should == title
+    on(SearchPage).first_result.should == title
+  end
+end
+Then(/^(.+) is the second suggestion$/) do |title|
+  if title == "none"
+    on(SearchPage).second_result_element.should_not be_visible
+  else
+    on(SearchPage).second_result.should == title
   end
 end
 Then(/^(.+) is not in the suggestions$/) do |title|
diff --git a/tests/browser/features/step_definitions/simple_search_steps.rb 
b/tests/browser/features/step_definitions/simple_search_steps.rb
index 2a4c2e7..8c869c2 100644
--- a/tests/browser/features/step_definitions/simple_search_steps.rb
+++ b/tests/browser/features/step_definitions/simple_search_steps.rb
@@ -21,7 +21,7 @@
   @browser.url.should match Regexp.escape("&title=Special%3ASearch")
 end
 Then(/^(.+) should be the first result$/) do |page_name|
-  on(SearchPage).one_result.should == page_name
+  on(SearchPage).first_result.should == page_name
 end
 
 Then(/^the page I arrive on has title (.+)$/) do |title|
diff --git a/tests/browser/features/support/hooks.rb 
b/tests/browser/features/support/hooks.rb
index 2de79e2..c54b5a1 100644
--- a/tests/browser/features/support/hooks.rb
+++ b/tests/browser/features/support/hooks.rb
@@ -474,6 +474,9 @@
       And a page named Relevancyredirecttest Larger/A exists with contents 
[[Relevancyredirecttest Larger]]
       And a page named Relevancyredirecttest Larger/B exists with contents 
[[Relevancyredirecttest Larger/Redirect]]
       And a page named Relevancyredirecttest Larger/C exists with contents 
[[Relevancyredirecttest Larger/Redirect]]
+      And a page named Relevancyclosetest Foô exists
+      And a page named Relevancyclosetest Foo exists
+      And a page named Foo Relevancyclosetest exists
         )
   end
   relevancy = true
@@ -564,3 +567,14 @@
     removed_text = true
   end
 end
+
+accent_squashing = false
+Before("@accent_squashing") do
+  unless accent_squashing
+    steps %(
+      Given a page named Áccent Sorting exists
+        And a page named Accent Sorting exists
+        )
+    accent_squashing = true
+  end
+end
diff --git a/tests/browser/features/support/pages/search_page.rb 
b/tests/browser/features/support/pages/search_page.rb
index 429124d..1615e99 100644
--- a/tests/browser/features/support/pages/search_page.rb
+++ b/tests/browser/features/support/pages/search_page.rb
@@ -6,6 +6,7 @@
   text_field(:search_input, id: "searchInput")
   div(:search_results, class: "suggestions-results")
   div(:search_special, class: "suggestions-special")
-  div(:one_result, class: "suggestions-result")
+  div(:first_result, class: "suggestions-result", index: 0)
+  div(:second_result, class: "suggestions-result", index: 1)
   links(:all_results, class: "suggestions-result") { |page| 
page.search_results_element.link_elements }
 end

-- 
To view, visit https://gerrit.wikimedia.org/r/168071
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I1e9dc357b861edf1ff3c959e3ac3b287292c2bea
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to