Tjones has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/357299 )

Change subject: Enable Hebrew Analysis
......................................................................

Enable Hebrew Analysis

Update config to enable HebMorph ("analysis-hebrew") if it is
available, and configure analysis elements for text and text_search.

Update AnalysisConfigBuilder tests.

Bug: T162741
Change-Id: Ice9ffc7a35d879d857659311aae8dd9d01576189
---
M includes/Maintenance/AnalysisConfigBuilder.php
M tests/unit/Maintenance/AnalysisConfigBuilderTest.php
M tests/unit/fixtures/languageAnalysis/he.config
M tests/unit/fixtures/languageAnalysis/he.expected
4 files changed, 23 insertions(+), 16 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/99/357299/1

diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php
index 19c816e..7818250 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -695,11 +695,12 @@
                        $config[ 'filter' ][ 'lowercase' ][ 'language' ] = 
'greek';
                        break;
                case 'hebrew':
-                       // If the hebrew plugin kicked us over to the hebrew 
analyzer use its companion
-                       // analyzer for queries.
-                       if ( $config[ 'analyzer' ][ 'text_search' ][ 'type' ] 
=== 'hebrew' ) {
-                               $config[ 'analyzer' ][ 'text_search' ][ 'type' 
] = 'hebrew_exact';
-                       }
+                       $config[ 'analyzer' ][ 'text' ] = [
+                               'type' => 'custom',
+                               'tokenizer' => 'hebrew',
+                               'filter' => [ 'niqqud', 'hebrew_lemmatizer', 
'lowercase', 'asciifolding' ],
+                       ];
+                       $config[ 'analyzer' ][ 'text_search' ] = $config[ 
'analyzer' ][ 'text' ];
                        break;
                case 'italian':
                        $config[ 'filter' ][ 'italian_elision' ] = [
@@ -1038,8 +1039,7 @@
                'analysis-kuromoji' => [ 'ja' => 'kuromoji' ],
                'analysis-smartcn' => [ 'zh-hans' => 'smartcn' ],
                'analysis-stconvert,analysis-smartcn' => [ 'zh' => 'chinese' ],
-               'elasticsearch-analysis-hebrew' => [ 'he' => 'hebrew' ],
-               // TODO Hebrew requires some special query handling....
+               'analysis-hebrew' => [ 'he' => 'hebrew' ],
                'analysis-ukrainian' => [ 'uk' => 'ukrainian' ],
        ];
 
diff --git a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
index aed0e82..921ce51 100644
--- a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
+++ b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
@@ -394,7 +394,7 @@
                ]);
                $plugins = [
                        'analysis-stempel', 'analysis-kuromoji',
-                       'analysis-smartcn', 'elasticsearch-analysis-hebrew',
+                       'analysis-smartcn', 'analysis-hebrew',
                        'analysis-ukrainian', 'analysis-stconvert'
                ];
                $builder = new AnalysisConfigBuilder( $langCode, $plugins, 
$config );
diff --git a/tests/unit/fixtures/languageAnalysis/he.config b/tests/unit/fixtures/languageAnalysis/he.config
index 2c63c08..0967ef4 100644
--- a/tests/unit/fixtures/languageAnalysis/he.config
+++ b/tests/unit/fixtures/languageAnalysis/he.config
@@ -1,2 +1 @@
-{
-}
+{}
diff --git a/tests/unit/fixtures/languageAnalysis/he.expected b/tests/unit/fixtures/languageAnalysis/he.expected
index 745240c..f374709 100644
--- a/tests/unit/fixtures/languageAnalysis/he.expected
+++ b/tests/unit/fixtures/languageAnalysis/he.expected
@@ -1,15 +1,23 @@
 {
     "analyzer": {
         "text": {
-            "type": "hebrew",
-            "char_filter": [
-                "word_break_helper"
+            "type": "custom",
+            "tokenizer": "hebrew",
+            "filter": [
+                "niqqud",
+                "hebrew_lemmatizer",
+                "lowercase",
+                "asciifolding"
             ]
         },
         "text_search": {
-            "type": "hebrew_exact",
-            "char_filter": [
-                "word_break_helper"
+            "type": "custom",
+            "tokenizer": "hebrew",
+            "filter": [
+                "niqqud",
+                "hebrew_lemmatizer",
+                "lowercase",
+                "asciifolding"
             ]
         },
         "plain": {

-- 
To view, visit https://gerrit.wikimedia.org/r/357299
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ice9ffc7a35d879d857659311aae8dd9d01576189
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Tjones <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to