jenkins-bot has submitted this change and it was merged.

Change subject: Improve processing of the apostrophe by the search engine in 
Ukrainian
......................................................................


Improve processing of the apostrophe by the search engine in Ukrainian

Refactor "russian" config (also used by Ukrainian) to allow language-
specific config. Refactor Russian Ё/Е mapping to be Russian-specific.

Map right single quote and modifier letter apostrophe to apostrophe
for Ukrainian search.

Bug: T146358
Change-Id: Ibbbe54b30c8765b9b70cd60993aac6ea9a8aabd6
---
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
2 files changed, 34 insertions(+), 16 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  DCausse: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 4e80b9a..5b887d4 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -500,14 +500,33 @@
                                'type' => 'mapping',
                                'mappings' => [
                                        '\u0301=>',             // combining 
acute accent, only used to show stress T102298
-                                       // T124592 fold ё=>е and Ё=>Е, 
precomposed or with combining diacritic
-                                       '\u0435\u0308=>\u0435',
-                                       '\u0415\u0308=>\u0415',
-                                       '\u0451=>\u0435',
-                                       '\u0401=>\u0415',
                                        '\u0130=>I',    // dotted I (fix 
regression caused by unpacking)
                                ],
                        ];
+
+                       $config[ 'char_filter' ][ 'near_space_flattener' ][ 
'mappings' ][] = '\u0301=>'; // T102298
+
+                       // The Russian analyzer is also used for Ukrainian and 
Rusyn for now, so processing that's
+                       // very specific to Russian should be separated out
+                       if ($this->language == 'ru') {
+                               // T124592 fold ё=>е and Ё=>Е, precomposed or 
with combining diacritic
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u0435\u0308=>\u0435';
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u0415\u0308=>\u0415';
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u0451=>\u0435';
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u0401=>\u0415';
+
+                               $config[ 'char_filter' ][ 
'near_space_flattener' ][ 'mappings' ][] = '\u0451=>\u0435';
+                               $config[ 'char_filter' ][ 
'near_space_flattener' ][ 'mappings' ][] = '\u0401=>\u0415';
+                               $config[ 'char_filter' ][ 
'near_space_flattener' ][ 'mappings' ][] = '\u0435\u0308=>\u0435';
+                               $config[ 'char_filter' ][ 
'near_space_flattener' ][ 'mappings' ][] = '\u0415\u0308=>\u0415';
+                       }
+
+                       // Ukrainian uses the Russian analyzer for now, but we 
want some Ukrainian-specific processing
+                       if ($this->language == 'uk') {
+                               // T146358 map right quote and modifier letter 
apostrophe to apostrophe
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u02BC=>\u0027';
+                               $config[ 'char_filter' ][ 'russian_charfilter' 
][ 'mappings' ][] = '\u2019=>\u0027';
+                       }
 
                        // Drop acute stress marks and fold ё=>е everywhere
                        $config[ 'analyzer' ][ 'plain' ][ 'char_filter' ][] = 
'russian_charfilter';
@@ -516,11 +535,6 @@
                        $config[ 'analyzer' ][ 'suggest' ][ 'char_filter' ][] = 
'russian_charfilter';
                        $config[ 'analyzer' ][ 'suggest_reverse' ][ 
'char_filter' ][] = 'russian_charfilter';
 
-                       $config[ 'char_filter' ][ 'near_space_flattener' ][ 
'mappings' ][] = '\u0301=>';
-                       $config[ 'char_filter' ][ 'near_space_flattener' ][ 
'mappings' ][] = '\u0451=>\u0435';
-                       $config[ 'char_filter' ][ 'near_space_flattener' ][ 
'mappings' ][] = '\u0401=>\u0415';
-                       $config[ 'char_filter' ][ 'near_space_flattener' ][ 
'mappings' ][] = '\u0435\u0308=>\u0435';
-                       $config[ 'char_filter' ][ 'near_space_flattener' ][ 
'mappings' ][] = '\u0415\u0308=>\u0415';
 
 
                        // unpack built-in Russian analyzer and add character 
filter T102298 / T124592
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php 
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 0b58676..ff3ba1d 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -182,14 +182,18 @@
                case 'russian':
                        // T102298 ignore combining acute / stress accents
                        $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0301=>';
-                       // T124592 fold ё=>е and Ё=>Е, precomposed or with 
combining diacritic
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0451=>\u0435';
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0401=>\u0415';
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0435\u0308=>\u0435';
-                       $config[ 'char_filter' ][ 'word_break_helper' ][ 
'mappings' ][] = '\u0415\u0308=>\u0415';
+
+                       // The Russian analyzer is also used for Ukrainian and 
Rusyn for now, so processing that's
+                       // very specific to Russian should be separated out
+                       if ($this->getLanguage() == 'ru') {
+                               // T124592 fold ё=>е and Ё=>Е, precomposed or 
with combining diacritic
+                               $config[ 'char_filter' ][ 'word_break_helper' 
][ 'mappings' ][] = '\u0451=>\u0435';
+                               $config[ 'char_filter' ][ 'word_break_helper' 
][ 'mappings' ][] = '\u0401=>\u0415';
+                               $config[ 'char_filter' ][ 'word_break_helper' 
][ 'mappings' ][] = '\u0435\u0308=>\u0435';
+                               $config[ 'char_filter' ][ 'word_break_helper' 
][ 'mappings' ][] = '\u0415\u0308=>\u0415';
+                       }
                        break;
                }
-
 
                if ( $this->isIcuAvailable() ) {
                        foreach ( $config[ 'analyzer' ] as $k => &$analyzer ) {

-- 
To view, visit https://gerrit.wikimedia.org/r/315837
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Ibbbe54b30c8765b9b70cd60993aac6ea9a8aabd6
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Tjones <tjo...@wikimedia.org>
Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com>
Gerrit-Reviewer: DCausse <dcau...@wikimedia.org>
Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org>
Gerrit-Reviewer: Gehel <gleder...@wikimedia.org>
Gerrit-Reviewer: Manybubbles <never...@wikimedia.org>
Gerrit-Reviewer: Smalyshev <smalys...@wikimedia.org>
Gerrit-Reviewer: Tjones <tjo...@wikimedia.org>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to