jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/365251 )

Change subject: Configure Japanese Language Analysis with Kuromoji
......................................................................


Configure Japanese Language Analysis with Kuromoji

With community input, it was decided that the Kuromoji language analyzer
should not be deployed. However, if it ever were deployed, this is the
baseline configuration that I would recommend.

It works around two problems with Kuromoji's default analysis:
 - inconsistent treatment of fullwidth numbers
 - many non-Japanese, non-Latin words are not indexed

Incidentally, reformat italian_elision so that it takes up less vertical
space.

Bug: T166731
Change-Id: I133cdc9affa3ed308a46a87892e069cd7461848e
---
M includes/Maintenance/AnalysisConfigBuilder.php
M tests/unit/fixtures/languageAnalysis/ja.expected
2 files changed, 67 insertions(+), 26 deletions(-)

Approvals:
  jenkins-bot: Verified
  DCausse: Looks good to me, approved



diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php
index 86bbf83..43519bb 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -716,27 +716,9 @@
                        $config[ 'filter' ][ 'italian_elision' ] = [
                                'type' => 'elision',
                                'articles' => [
-                                       'c',
-                                       'l',
-                                       'all',
-                                       'dall',
-                                       'dell',
-                                       'nell',
-                                       'sull',
-                                       'coll',
-                                       'pell',
-                                       'gl',
-                                       'agl',
-                                       'dagl',
-                                       'degl',
-                                       'negl',
-                                       'sugl',
-                                       'un',
-                                       'm',
-                                       't',
-                                       's',
-                                       'v',
-                                       'd'
+                                       'c', 'l', 'all', 'dall', 'dell', 'nell', 'sull',
+                                       'coll', 'pell', 'gl', 'agl', 'dagl', 'degl', 'negl',
+                                       'sugl', 'un', 'm', 't', 's', 'v', 'd'
                                ],
                        ];
                        $config[ 'filter' ][ 'italian_stop' ] = [
@@ -768,6 +750,34 @@
                        $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve';
 
                        // In Italian text_search is just a copy of text
+                       $config[ 'analyzer' ][ 'text_search' ] = $config[ 'analyzer' ][ 'text' ];
+                       break;
+               case 'japanese':
+                       // See https://www.mediawiki.org/wiki/User:TJones_(WMF)/T166731
+                       $config[ 'char_filter' ][ 'fullwidthnumfix' ] = [
+                               // pre-convert fullwidth numbers because Kuromoji tokenizer treats them weirdly
+                               'type' => 'mapping',
+                               'mappings' => [
+                                       "\uff10=>0", "\uff11=>1", "\uff12=>2", "\uff13=>3",
+                                       "\uff14=>4", "\uff15=>5", "\uff16=>6", "\uff17=>7",
+                                       "\uff18=>8", "\uff19=>9",
+                               ],
+                       ];
+
+                       $config[ 'analyzer' ][ 'text' ] = [
+                               'type' => 'custom',
+                               'char_filter' => [ 'fullwidthnumfix' ],
+                               'tokenizer' => 'kuromoji_tokenizer',
+                       ];
+
+                       $filters = [];
+                       $filters[] = 'kuromoji_baseform';
+                       $filters[] = 'cjk_width';
+                       $filters[] = 'ja_stop';
+                       $filters[] = 'kuromoji_stemmer';
+                       $filters[] = 'lowercase';
+                       $config[ 'analyzer' ][ 'text' ][ 'filter' ] = $filters;
+
                        $config[ 'analyzer' ][ 'text_search' ] = $config[ 'analyzer' ][ 'text' ];
                        break;
                case 'russian':
@@ -1050,7 +1060,7 @@
                // For Hebrew, see https://www.mediawiki.org/wiki/User:TJones_(WMF)/T162741
 
                'analysis-stempel' => [ 'pl' => 'polish' ],
-               'analysis-kuromoji' => [ 'ja' => 'kuromoji' ],
+               'analysis-kuromoji' => [ 'ja' => 'japanese' ],
                'analysis-stconvert,analysis-smartcn' => [ 'zh' => 'chinese' ],
                'analysis-hebrew' => [ 'he' => 'hebrew' ],
                'analysis-ukrainian' => [ 'uk' => 'ukrainian' ],
diff --git a/tests/unit/fixtures/languageAnalysis/ja.expected b/tests/unit/fixtures/languageAnalysis/ja.expected
index 9c2d409..03c8a8d 100644
--- a/tests/unit/fixtures/languageAnalysis/ja.expected
+++ b/tests/unit/fixtures/languageAnalysis/ja.expected
@@ -1,15 +1,31 @@
 {
     "analyzer": {
         "text": {
-            "type": "kuromoji",
+            "type": "custom",
             "char_filter": [
-                "word_break_helper"
+                "fullwidthnumfix"
+            ],
+            "tokenizer": "kuromoji_tokenizer",
+            "filter": [
+                "kuromoji_baseform",
+                "cjk_width",
+                "ja_stop",
+                "kuromoji_stemmer",
+                "lowercase"
             ]
         },
         "text_search": {
-            "type": "kuromoji",
+            "type": "custom",
             "char_filter": [
-                "word_break_helper"
+                "fullwidthnumfix"
+            ],
+            "tokenizer": "kuromoji_tokenizer",
+            "filter": [
+                "kuromoji_baseform",
+                "cjk_width",
+                "ja_stop",
+                "kuromoji_stemmer",
+                "lowercase"
             ]
         },
         "plain": {
@@ -247,6 +263,21 @@
                 ")=>\\u0020",
                 ":=>\\u0020"
             ]
+        },
+        "fullwidthnumfix": {
+            "type": "mapping",
+            "mappings": [
+                "\\uff10=>0",
+                "\\uff11=>1",
+                "\\uff12=>2",
+                "\\uff13=>3",
+                "\\uff14=>4",
+                "\\uff15=>5",
+                "\\uff16=>6",
+                "\\uff17=>7",
+                "\\uff18=>8",
+                "\\uff19=>9"
+            ]
         }
     }
 }
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/365251
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I133cdc9affa3ed308a46a87892e069cd7461848e
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Tjones <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to