Tjones has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/315837

Change subject: Improve processing of the apostrophe by the search engine in 
Ukrainian
......................................................................

Improve processing of the apostrophe by the search engine in Ukrainian

Refactor "russian" config (also used by Ukrainian) to allow language-
specific config. Refactor Russian Ё/Е mapping to be Russian-specific.

Map right single quote and modifier letter apostrophe to apostrophe
for Ukrainian search. For the Ukrainian completion suggester, override
the existing mapping of right single quote and modifier letter
apostrophe to space (as word breaks), mapping them to apostrophe instead.

Bug: T146358
Change-Id: Ibbbe54b30c8765b9b70cd60993aac6ea9a8aabd6
---
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
2 files changed, 41 insertions(+), 11 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/37/315837/1

diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 4e80b9a..a17d321 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -500,15 +500,27 @@
                                'type' => 'mapping',
                                'mappings' => [
                                        '\u0301=>',             // combining 
acute accent, only used to show stress T102298
-                                       // T124592 fold ё=>е and Ё=>Е, 
precomposed or with combining diacritic
-                                       '\u0435\u0308=>\u0435',
-                                       '\u0415\u0308=>\u0415',
-                                       '\u0451=>\u0435',
-                                       '\u0401=>\u0415',
                                        '\u0130=>I',    // dotted I (fix 
regression caused by unpacking)
                                ],
                        ];
 
+                       // The Russian analyzer is also used for Ukrainian and 
Rusyn for now, so processing that's
+                       // very specific to Russian should be separated out
+                       if ($this->language == 'ru') {
+                               // T124592 fold ё=>е and Ё=>Е, precomposed or 
with combining diacritic
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u0435\u0308=>\u0435';
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u0415\u0308=>\u0415';
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u0451=>\u0435';
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u0401=>\u0415';
+                               }
+
+                       // Ukrainian uses the Russian analyzer for now, but we 
want some Ukrainian-specific processing
+                       if ($this->language == 'uk') {
+                               // T146358 map right quote and modifier letter 
apostrophe to apostrophe
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u02BC=>\u0027';
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u2019=>\u0027';
+                               }
+
                        // Drop acute stress marks and fold ё=>е everywhere
                        $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 
'russian_charfilter';
                        $config[ 'analyzer' ][ 'plain_search' ][ 'char_filter' 
][] = 'russian_charfilter';
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php 
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 0b58676..275be0a 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -180,13 +180,31 @@
                switch ( $this->getDefaultTextAnalyzerType() ) {
                // Please add languages in alphabetical order.
                case 'russian':
+                       $wbhMappings = &$config[ 'char_filter' ][ 
'word_break_helper' ][ 'mappings' ];
+
                        // T102298 ignore combining acute / stress accents
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0301=>';
-                       // T124592 fold ё=>е and Ё=>Е, precomposed or with 
combining diacritic
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0451=>\u0435';
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0401=>\u0415';
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0435\u0308=>\u0435';
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0415\u0308=>\u0415';
+                       $wbhMappings[] = '\u0301=>';
+
+                       // The Russian analyzer is also used for Ukrainian and 
Rusyn for now, so processing that's
+                       // very specific to Russian should be separated out
+                       if ($this->getLanguage() == 'ru') {
+                               // T124592 fold ё=>е and Ё=>Е, precomposed or 
with combining diacritic
+                               $wbhMappings[] = '\u0451=>\u0435';
+                               $wbhMappings[] = '\u0401=>\u0415';
+                               $wbhMappings[] = '\u0435\u0308=>\u0435';
+                               $wbhMappings[] = '\u0415\u0308=>\u0415';
+                               }
+
+                       // Ukrainian uses the Russian analyzer for now, but we 
want some Ukrainian-specific processing
+                       if ($this->getLanguage() == 'uk') {
+                               // T146358 map right quote and modifier letter 
apostrophe to apostrophe, not space
+                               // remove existing mappings first
+                               $wbhMappings = array_diff($wbhMappings, 
['\u2019=>\u0020', '\u02BC=>\u0020']);
+                               $wbhMappings = array_values($wbhMappings);
+                               $wbhMappings[] = '\u02BC=>\u0027';
+                               $wbhMappings[] = '\u2019=>\u0027';
+                               }
+
                        break;
                }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/315837
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ibbbe54b30c8765b9b70cd60993aac6ea9a8aabd6
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Tjones <tjo...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to