jenkins-bot has submitted this change and it was merged. ( 
https://gerrit.wikimedia.org/r/393689 )

Change subject: Port per-language indexing code from Wikibase to CirrusSearch
......................................................................


Port per-language indexing code from Wikibase to CirrusSearch

Bug: T176903
Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa
---
M includes/Maintenance/AnalysisConfigBuilder.php
M tests/unit/Maintenance/AnalysisConfigBuilderTest.php
A tests/unit/fixtures/analyzer/all_defaults.expected
A tests/unit/fixtures/analyzer/all_defaults.plain.expected
A tests/unit/fixtures/analyzer/en-ru-es-de-zh.expected
A tests/unit/fixtures/analyzer/en-ru-es-de-zh.plain.expected
A tests/unit/fixtures/analyzer/en-zh-sv.expected
A tests/unit/fixtures/analyzer/en-zh-sv.plain.expected
A tests/unit/fixtures/analyzer/he-uk-nolang.expected
A tests/unit/fixtures/analyzer/he-uk-nolang.plain.expected
A tests/unit/fixtures/analyzer/he-uk-noplug.expected
A tests/unit/fixtures/analyzer/he-uk-noplug.plain.expected
A tests/unit/fixtures/analyzer/he-uk.expected
A tests/unit/fixtures/analyzer/he-uk.plain.expected
14 files changed, 1,967 insertions(+), 0 deletions(-)

Approvals:
  Tjones: Looks good to me, approved
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  jenkins-bot: Verified



diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 0ee472b..ae57a5c 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -987,6 +987,150 @@
        }
 
        /**
+        * Get list of filters that are mentioned in analyzers but not defined
+        * explicitly.
+        * @param array[] $config Full configuration array
+        * @param string[] $analyzers List of analyzers to consider.
+        * @return array List of default filters, each containing only filter 
type
+        */
+       private function getDefaultFilters( array &$config,  array $analyzers ) {
+               // Name => definition for every filter that an analyzer uses
+               // without the config declaring it in its 'filter' section.
+               $builtins = [];
+               foreach ( $analyzers as $analyzerName ) {
+                       if ( !empty( $config['analyzer'][$analyzerName]['filter'] ) ) {
+                               foreach ( $config['analyzer'][$analyzerName]['filter'] as $candidate ) {
+                                       if ( isset( $config['filter'][$candidate] ) ) {
+                                               // Declared explicitly, so it is not a default filter
+                                               continue;
+                                       }
+                                       // An undeclared filter is recorded with its own name as the type
+                                       $builtins[$candidate] = [ 'type' => $candidate ];
+                               }
+                       }
+               }
+               return $builtins;
+       }
+
+       /**
+        * Check every filter in the config - if it's the same as in old config,
+        * ignore it. If it has the same name, but different content - create 
new filter
+        * with different name by prefixing it with language code.
+        *
+        * @param array[] $config Configuration being processed
+        * @param array[] $standardFilters Existing filters list
+        * @param array[] $defaultFilters List of default filters already 
mentioned in the config
+        * @param string $prefix Prefix for disambiguation
+        * @return array[] The list of filters not in the old config.
+        */
+       private function resolveFilters( array &$config, array $standardFilters, array $defaultFilters, $prefix ) {
+               $resultFilters = [];
+               if ( empty( $config['filter'] ) ) {
+                       // No filters declared in this config - nothing to resolve
+                       return $resultFilters;
+               }
+               foreach ( $config['filter'] as $name => $filter ) {
+                       // Look the name up in the existing filters first, then in the defaults
+                       $existingFilter = null;
+                       if ( isset( $standardFilters[$name] ) ) {
+                               $existingFilter = $standardFilters[$name];
+                       } elseif ( isset( $defaultFilters[$name] ) ) {
+                               $existingFilter = $defaultFilters[$name];
+                       }
+
+                       // Compare against null rather than relying on truthiness: an
+                       // existing definition that happens to be an empty array is
+                       // still an existing definition and must take part in the
+                       // clash check below.
+                       if ( $existingFilter === null ) {
+                               // Name is not taken yet - keep the filter under its own name
+                               $resultFilters[$name] = $filter;
+                               continue;
+                       }
+
+                       // Loose comparison is intentional: two arrays are equal when
+                       // they hold the same key/value pairs, regardless of key order.
+                       if ( $existingFilter != $filter ) {
+                               // filter with the same name but different config - need to
+                               // rename by adding prefix
+                               $newName = $prefix . '_' . $name;
+                               $this->replaceFilter( $config, $name, $newName );
+                               $resultFilters[$newName] = $filter;
+                       }
+                       // Identical definition: already known, so it is left out of the result
+               }
+               return $resultFilters;
+       }
+
+       /**
+        * Replace a given filter name with a different name in every analyzer of the config.
+        * @param array[] $config Configuration being processed
+        * @param string $oldName
+        * @param string $newName
+        */
+       private function replaceFilter( array &$config, $oldName, $newName ) {
+               // Maps one filter name: an exact match becomes the new name,
+               // anything else is passed through unchanged.
+               $rename = function ( $candidate ) use ( $oldName, $newName ) {
+                       return $candidate === $oldName ? $newName : $candidate;
+               };
+               foreach ( $config['analyzer'] as $key => $definition ) {
+                       if ( isset( $definition['filter'] ) ) {
+                               // Write back through the key so the caller's config is updated
+                               $config['analyzer'][$key]['filter'] = array_map( $rename, $definition['filter'] );
+                       }
+               }
+       }
+
+       /**
+        * Merge per-language config into the main config.
+        * It will copy the specific analyzer and all dependent filters and
+        * char_filters.
+        * @param array $config Main config
+        * @param array $langConfig Per-language config
+        * @param string $name Name for analyzer whose config we're merging
+        * @param string $prefix Prefix for this configuration
+        */
+       private function mergeConfig( array &$config, array $langConfig, $name, $prefix ) {
+               if ( !isset( $langConfig['analyzer'][$name] ) ) {
+                       // The language config does not define this analyzer. Skip it
+                       // instead of writing a null analyzer into the main config.
+                       return;
+               }
+               $analyzer = $langConfig['analyzer'][$name];
+               // The analyzer is stored under a prefixed name
+               $config['analyzer'][$prefix . '_' . $name] = $analyzer;
+               if ( !empty( $analyzer['filter'] ) ) {
+                       // Add private filters for this analyzer
+                       foreach ( $analyzer['filter'] as $filter ) {
+                               // Copy filters that are in language config but not in the main config.
+                               // We would not copy the same filter into the main config since due to
+                               // the resolution step we know they are the same (otherwise we would have
+                               // renamed it).
+                               if ( isset( $langConfig['filter'][$filter] ) &&
+                                       !isset( $config['filter'][$filter] ) ) {
+                                       $config['filter'][$filter] = $langConfig['filter'][$filter];
+                               }
+                       }
+               }
+               if ( !empty( $analyzer['char_filter'] ) ) {
+                       // Add private char_filters for this analyzer
+                       foreach ( $analyzer['char_filter'] as $filter ) {
+                               // Char filters are expected to be namespaced per language, so no
+                               // renaming happens here. A char filter that the language config
+                               // does not define is skipped, so that no null definition is
+                               // written into the main config.
+                               if ( isset( $langConfig['char_filter'][$filter] ) &&
+                                       !isset( $config['char_filter'][$filter] ) ) {
+                                       $config['char_filter'][$filter] = $langConfig['char_filter'][$filter];
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Create per-language configs for specific analyzers, separating and
+        * namespacing filters that differ between languages.
+        * @param array &$config Existing config, will be modified
+        * @param string[] $languages List of languages to process
+        * @param string[] $analyzers List of analyzers to process
+        */
+       public function buildLanguageConfigs( array &$config, array $languages, array $analyzers ) {
+               $defaultFilters = $this->getDefaultFilters( $config, $analyzers );
+               // First pass: build every language config once and collect the
+               // default filters across all languages, so the second pass can
+               // detect name clashes with any of them.
+               // NOTE(review): reusing the result assumes buildConfig( $lang )
+               // returns the same config on repeated calls - confirm.
+               $langConfigs = [];
+               foreach ( $languages as $lang ) {
+                       $langConfigs[$lang] = $this->buildConfig( $lang );
+                       $defaultFilters += $this->getDefaultFilters( $langConfigs[$lang], $analyzers );
+               }
+               // Second pass iterates $languages (not the cache keys) so $lang is
+               // used as the prefix exactly as the caller passed it.
+               foreach ( $languages as $lang ) {
+                       $langConfig = $langConfigs[$lang];
+                       // The incoming config may have no 'filter' section yet
+                       $standardFilters = isset( $config['filter'] ) ? $config['filter'] : [];
+                       // Analyzer is: tokenizer + filter + char_filter
+                       // Tokenizers don't seem to be subject to customization now
+                       // Char filters are nicely namespaced
+                       // Filters are NOT - e.g. lowercase & icu_folding filters are different for different
+                       // languages! So we need to do some disambiguation here.
+                       $langConfig['filter'] = $this->resolveFilters( $langConfig, $standardFilters, $defaultFilters, $lang );
+                       // Merge configs
+                       foreach ( $analyzers as $analyzer ) {
+                               $this->mergeConfig( $config, $langConfig, $analyzer, $lang );
+                       }
+               }
+       }
+
+       /**
         * @return bool true if the icu analyzer is available.
         */
        public function isIcuAvailable() {
diff --git a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php 
b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
index e188ed6..9f11ee7 100644
--- a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
+++ b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
@@ -418,4 +418,101 @@
                        $this->assertEquals( $expected, $builder->buildConfig() 
);
                }
        }
+
+       public function languageConfigDataProvider() {
+               // Starting config shared by all cases: three empty sections
+               $blankConfig = [ 'analyzer' => [], 'filter' => [], 'char_filter' => [] ];
+               // Plugin list without any analysis-<language> plugin
+               $genericPlugins = [ 'extra', 'analysis-icu' ];
+               $everyPlugin = array_merge( $genericPlugins, [
+                       'analysis-stempel',
+                       'analysis-kuromoji',
+                       'analysis-smartcn',
+                       'analysis-hebrew',
+                       'analysis-ukrainian',
+                       'analysis-stconvert',
+               ] );
+
+               // Each case: languages, starting config, plugins, fixture base name
+               $cases = [];
+               $cases['some languages'] = [ [ 'en', 'ru', 'es', 'de', 'zh' ], $blankConfig, $everyPlugin, 'en-ru-es-de-zh' ];
+               // sv has custom icu_folding filter
+               $cases['sv'] = [ [ 'en', 'zh', 'sv' ], $blankConfig, $everyPlugin, 'en-zh-sv' ];
+               $cases['with plugins'] = [ [ 'he', 'uk' ], $blankConfig, $everyPlugin, 'he-uk' ];
+               $cases['without language plugins'] = [ [ 'he', 'uk' ], $blankConfig, $genericPlugins, 'he-uk-nolang' ];
+               $cases['without any plugins'] = [ [ 'he', 'uk' ], $blankConfig, [], 'he-uk-noplug' ];
+               $cases['all default languages'] = [
+                       [ 'ch', 'fy', 'kab', 'ti', 'xmf' ],
+                       $blankConfig,
+                       $genericPlugins,
+                       'all_defaults',
+               ];
+               return $cases;
+       }
+
+       /**
+        * @param string[] $languages
+        * @param array $oldConfig
+        * @param string[] $plugins
+        * @param string $expectedConfig Filename with expected config
+        * @dataProvider languageConfigDataProvider
+        */
+       public function testAnalysisConfig( $languages, $oldConfig, $plugins, $expectedConfig ) {
+               // We use these static settings because we rely on tests in main
+               // AnalysisConfigBuilderTest to handle variations
+               $config = new HashSearchConfig( [ 'CirrusSearchUseIcuFolding' => 'default' ] );
+               $builder = new AnalysisConfigBuilder( 'en', $plugins, $config );
+
+               // Fixture name suffix => analyzers to build for that fixture
+               $variants = [
+                       '' => [ 'plain', 'plain_search', 'text', 'text_search' ],
+                       '.plain' => [ 'plain', 'plain_search' ],
+               ];
+               $generated = [];
+               foreach ( $variants as $suffix => $analyzers ) {
+                       // Each variant starts from a fresh copy of the provided config,
+                       // since buildLanguageConfigs() modifies its argument in place.
+                       $actual = $oldConfig;
+                       $builder->buildLanguageConfigs( $actual, $languages, $analyzers );
+                       $expectedFile = __DIR__ . "/../fixtures/analyzer/{$expectedConfig}{$suffix}.expected";
+                       if ( !is_file( $expectedFile ) ) {
+                               // Missing fixture: write it out and carry on, so that every
+                               // missing fixture is generated in a single run.
+                               file_put_contents( $expectedFile, json_encode( $actual, JSON_PRETTY_PRINT ) );
+                               $generated[] = basename( $expectedFile );
+                               continue;
+                       }
+                       $expected = json_decode( file_get_contents( $expectedFile ), true );
+                       // json_decode() returns null when the fixture cannot be decoded
+                       $this->assertNotNull( $expected, "Fixture $expectedFile is not valid JSON" );
+                       $this->assertEquals( $expected, $actual );
+               }
+               if ( $generated ) {
+                       // Skip only after every variant has been handled
+                       $this->markTestSkipped( 'Generated new fixture: ' . implode( ', ', $generated ) );
+               }
+       }
+
 }
diff --git a/tests/unit/fixtures/analyzer/all_defaults.expected 
b/tests/unit/fixtures/analyzer/all_defaults.expected
new file mode 100644
index 0000000..1f14fb4
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/all_defaults.expected
@@ -0,0 +1,191 @@
+{
+    "analyzer": {
+        "ch_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "ch_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "ch_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "ch_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "fy_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "fy_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "fy_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "fy_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "kab_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "kab_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "kab_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "kab_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "ti_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "ti_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "ti_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "ti_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "xmf_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "xmf_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "xmf_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "xmf_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/all_defaults.plain.expected 
b/tests/unit/fixtures/analyzer/all_defaults.plain.expected
new file mode 100644
index 0000000..77c7142
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/all_defaults.plain.expected
@@ -0,0 +1,121 @@
+{
+    "analyzer": {
+        "ch_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "ch_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "fy_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "fy_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "kab_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "kab_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "ti_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "ti_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "xmf_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "xmf_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/en-ru-es-de-zh.expected 
b/tests/unit/fixtures/analyzer/en-ru-es-de-zh.expected
new file mode 100644
index 0000000..5df50b9
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/en-ru-es-de-zh.expected
@@ -0,0 +1,379 @@
+{
+    "analyzer": {
+        "en_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "en_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "en_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "char_filter": [
+                "word_break_helper",
+                "kana_map"
+            ],
+            "filter": [
+                "aggressive_splitting",
+                "possessive_english",
+                "icu_normalizer",
+                "stop",
+                "icu_folding",
+                "kstem",
+                "custom_stem"
+            ]
+        },
+        "en_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "char_filter": [
+                "word_break_helper",
+                "kana_map"
+            ],
+            "filter": [
+                "aggressive_splitting",
+                "possessive_english",
+                "icu_normalizer",
+                "stop",
+                "icu_folding",
+                "kstem",
+                "custom_stem"
+            ]
+        },
+        "ru_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper",
+                "russian_charfilter"
+            ]
+        },
+        "ru_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper",
+                "russian_charfilter"
+            ]
+        },
+        "ru_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "char_filter": [
+                "russian_charfilter"
+            ],
+            "filter": [
+                "icu_normalizer",
+                "russian_stop",
+                "russian_stemmer"
+            ]
+        },
+        "ru_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "char_filter": [
+                "russian_charfilter"
+            ],
+            "filter": [
+                "icu_normalizer",
+                "russian_stop",
+                "russian_stemmer"
+            ]
+        },
+        "es_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "es_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "es_text": {
+            "type": "spanish",
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "es_text_search": {
+            "type": "spanish",
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "de_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "de_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "de_text": {
+            "type": "german",
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "de_text_search": {
+            "type": "german",
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_text": {
+            "type": "custom",
+            "tokenizer": "smartcn_tokenizer",
+            "char_filter": [
+                "stconvertfix",
+                "tsconvert"
+            ],
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ]
+        },
+        "zh_text_search": {
+            "type": "custom",
+            "tokenizer": "smartcn_tokenizer",
+            "char_filter": [
+                "stconvertfix",
+                "tsconvert"
+            ],
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        },
+        "icu_folding": {
+            "type": "icu_folding"
+        },
+        "aggressive_splitting": {
+            "type": "word_delimiter",
+            "stem_english_possessive": false,
+            "preserve_original": false
+        },
+        "possessive_english": {
+            "type": "stemmer",
+            "language": "possessive_english"
+        },
+        "custom_stem": {
+            "type": "stemmer_override",
+            "rules": "guidelines => guideline"
+        },
+        "russian_stop": {
+            "type": "stop",
+            "stopwords": "_russian_"
+        },
+        "russian_stemmer": {
+            "type": "stemmer",
+            "language": "russian"
+        },
+        "smartcn_stop": {
+            "type": "stop",
+            "stopwords": [
+                ","
+            ]
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        },
+        "kana_map": {
+            "type": "mapping",
+            "mappings": [
+                "\\u3041=>\\u30a1",
+                "\\u3042=>\\u30a2",
+                "\\u3043=>\\u30a3",
+                "\\u3044=>\\u30a4",
+                "\\u3045=>\\u30a5",
+                "\\u3046=>\\u30a6",
+                "\\u3094=>\\u30f4",
+                "\\u3047=>\\u30a7",
+                "\\u3048=>\\u30a8",
+                "\\u3049=>\\u30a9",
+                "\\u304a=>\\u30aa",
+                "\\u3095=>\\u30f5",
+                "\\u304b=>\\u30ab",
+                "\\u304c=>\\u30ac",
+                "\\u304d=>\\u30ad",
+                "\\u304e=>\\u30ae",
+                "\\u304f=>\\u30af",
+                "\\u3050=>\\u30b0",
+                "\\u3096=>\\u30f6",
+                "\\u3051=>\\u30b1",
+                "\\u3052=>\\u30b2",
+                "\\u3053=>\\u30b3",
+                "\\u3054=>\\u30b4",
+                "\\u3055=>\\u30b5",
+                "\\u3056=>\\u30b6",
+                "\\u3057=>\\u30b7",
+                "\\u3058=>\\u30b8",
+                "\\u3059=>\\u30b9",
+                "\\u305a=>\\u30ba",
+                "\\u305b=>\\u30bb",
+                "\\u305c=>\\u30bc",
+                "\\u305d=>\\u30bd",
+                "\\u305e=>\\u30be",
+                "\\u305f=>\\u30bf",
+                "\\u3060=>\\u30c0",
+                "\\u3061=>\\u30c1",
+                "\\u3062=>\\u30c2",
+                "\\u3063=>\\u30c3",
+                "\\u3064=>\\u30c4",
+                "\\u3065=>\\u30c5",
+                "\\u3066=>\\u30c6",
+                "\\u3067=>\\u30c7",
+                "\\u3068=>\\u30c8",
+                "\\u3069=>\\u30c9",
+                "\\u306a=>\\u30ca",
+                "\\u306b=>\\u30cb",
+                "\\u306c=>\\u30cc",
+                "\\u306d=>\\u30cd",
+                "\\u306e=>\\u30ce",
+                "\\u306f=>\\u30cf",
+                "\\u3070=>\\u30d0",
+                "\\u3071=>\\u30d1",
+                "\\u3072=>\\u30d2",
+                "\\u3073=>\\u30d3",
+                "\\u3074=>\\u30d4",
+                "\\u3075=>\\u30d5",
+                "\\u3076=>\\u30d6",
+                "\\u3077=>\\u30d7",
+                "\\u3078=>\\u30d8",
+                "\\u3079=>\\u30d9",
+                "\\u307a=>\\u30da",
+                "\\u307b=>\\u30db",
+                "\\u307c=>\\u30dc",
+                "\\u307d=>\\u30dd",
+                "\\u307e=>\\u30de",
+                "\\u307f=>\\u30df",
+                "\\u3080=>\\u30e0",
+                "\\u3081=>\\u30e1",
+                "\\u3082=>\\u30e2",
+                "\\u3083=>\\u30e3",
+                "\\u3084=>\\u30e4",
+                "\\u3085=>\\u30e5",
+                "\\u3086=>\\u30e6",
+                "\\u3087=>\\u30e7",
+                "\\u3088=>\\u30e8",
+                "\\u3089=>\\u30e9",
+                "\\u308a=>\\u30ea",
+                "\\u308b=>\\u30eb",
+                "\\u308c=>\\u30ec",
+                "\\u308d=>\\u30ed",
+                "\\u308e=>\\u30ee",
+                "\\u308f=>\\u30ef",
+                "\\u3090=>\\u30f0",
+                "\\u3091=>\\u30f1",
+                "\\u3092=>\\u30f2",
+                "\\u3093=>\\u30f3"
+            ]
+        },
+        "russian_charfilter": {
+            "type": "mapping",
+            "mappings": [
+                "\\u0301=>",
+                "\\u0130=>I",
+                "\\u0435\\u0308=>\\u0435",
+                "\\u0415\\u0308=>\\u0415",
+                "\\u0451=>\\u0435",
+                "\\u0401=>\\u0415"
+            ]
+        },
+        "stconvertfix": {
+            "type": "mapping",
+            "mappings": [
+                "\\u606d\\u5f18=>\\u606d \\u5f18",
+                "\\u5138=>\\u3469"
+            ]
+        },
+        "tsconvert": {
+            "type": "stconvert",
+            "delimiter": "#",
+            "keep_both": false,
+            "convert_type": "t2s"
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/en-ru-es-de-zh.plain.expected 
b/tests/unit/fixtures/analyzer/en-ru-es-de-zh.plain.expected
new file mode 100644
index 0000000..2ddbb56
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/en-ru-es-de-zh.plain.expected
@@ -0,0 +1,148 @@
+{
+    "analyzer": {
+        "en_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "en_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "ru_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper",
+                "russian_charfilter"
+            ]
+        },
+        "ru_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper",
+                "russian_charfilter"
+            ]
+        },
+        "es_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "es_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "de_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "de_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        },
+        "icu_folding": {
+            "type": "icu_folding"
+        },
+        "smartcn_stop": {
+            "type": "stop",
+            "stopwords": [
+                ","
+            ]
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        },
+        "russian_charfilter": {
+            "type": "mapping",
+            "mappings": [
+                "\\u0301=>",
+                "\\u0130=>I",
+                "\\u0435\\u0308=>\\u0435",
+                "\\u0415\\u0308=>\\u0415",
+                "\\u0451=>\\u0435",
+                "\\u0401=>\\u0415"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/en-zh-sv.expected 
b/tests/unit/fixtures/analyzer/en-zh-sv.expected
new file mode 100644
index 0000000..3990158
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/en-zh-sv.expected
@@ -0,0 +1,309 @@
+{
+    "analyzer": {
+        "en_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "en_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "en_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "char_filter": [
+                "word_break_helper",
+                "kana_map"
+            ],
+            "filter": [
+                "aggressive_splitting",
+                "possessive_english",
+                "icu_normalizer",
+                "stop",
+                "icu_folding",
+                "kstem",
+                "custom_stem"
+            ]
+        },
+        "en_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "char_filter": [
+                "word_break_helper",
+                "kana_map"
+            ],
+            "filter": [
+                "aggressive_splitting",
+                "possessive_english",
+                "icu_normalizer",
+                "stop",
+                "icu_folding",
+                "kstem",
+                "custom_stem"
+            ]
+        },
+        "zh_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_text": {
+            "type": "custom",
+            "tokenizer": "smartcn_tokenizer",
+            "char_filter": [
+                "stconvertfix",
+                "tsconvert"
+            ],
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ]
+        },
+        "zh_text_search": {
+            "type": "custom",
+            "tokenizer": "smartcn_tokenizer",
+            "char_filter": [
+                "stconvertfix",
+                "tsconvert"
+            ],
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ]
+        },
+        "sv_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "sv_icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "sv_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "sv_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "swedish_stop",
+                "swedish_stemmer",
+                "preserve_original_recorder",
+                "sv_icu_folding",
+                "preserve_original"
+            ]
+        },
+        "sv_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "swedish_stop",
+                "swedish_stemmer",
+                "preserve_original_recorder",
+                "sv_icu_folding",
+                "preserve_original"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        },
+        "icu_folding": {
+            "type": "icu_folding"
+        },
+        "aggressive_splitting": {
+            "type": "word_delimiter",
+            "stem_english_possessive": false,
+            "preserve_original": false
+        },
+        "possessive_english": {
+            "type": "stemmer",
+            "language": "possessive_english"
+        },
+        "custom_stem": {
+            "type": "stemmer_override",
+            "rules": "guidelines => guideline"
+        },
+        "smartcn_stop": {
+            "type": "stop",
+            "stopwords": [
+                ","
+            ]
+        },
+        "sv_icu_folding": {
+            "type": "icu_folding",
+            "unicodeSetFilter": "[^\u00e5\u00e4\u00f6\u00c5\u00c4\u00d6]"
+        },
+        "swedish_stop": {
+            "type": "stop",
+            "stopwords": "_swedish_"
+        },
+        "swedish_stemmer": {
+            "type": "stemmer",
+            "language": "swedish"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        },
+        "kana_map": {
+            "type": "mapping",
+            "mappings": [
+                "\\u3041=>\\u30a1",
+                "\\u3042=>\\u30a2",
+                "\\u3043=>\\u30a3",
+                "\\u3044=>\\u30a4",
+                "\\u3045=>\\u30a5",
+                "\\u3046=>\\u30a6",
+                "\\u3094=>\\u30f4",
+                "\\u3047=>\\u30a7",
+                "\\u3048=>\\u30a8",
+                "\\u3049=>\\u30a9",
+                "\\u304a=>\\u30aa",
+                "\\u3095=>\\u30f5",
+                "\\u304b=>\\u30ab",
+                "\\u304c=>\\u30ac",
+                "\\u304d=>\\u30ad",
+                "\\u304e=>\\u30ae",
+                "\\u304f=>\\u30af",
+                "\\u3050=>\\u30b0",
+                "\\u3096=>\\u30f6",
+                "\\u3051=>\\u30b1",
+                "\\u3052=>\\u30b2",
+                "\\u3053=>\\u30b3",
+                "\\u3054=>\\u30b4",
+                "\\u3055=>\\u30b5",
+                "\\u3056=>\\u30b6",
+                "\\u3057=>\\u30b7",
+                "\\u3058=>\\u30b8",
+                "\\u3059=>\\u30b9",
+                "\\u305a=>\\u30ba",
+                "\\u305b=>\\u30bb",
+                "\\u305c=>\\u30bc",
+                "\\u305d=>\\u30bd",
+                "\\u305e=>\\u30be",
+                "\\u305f=>\\u30bf",
+                "\\u3060=>\\u30c0",
+                "\\u3061=>\\u30c1",
+                "\\u3062=>\\u30c2",
+                "\\u3063=>\\u30c3",
+                "\\u3064=>\\u30c4",
+                "\\u3065=>\\u30c5",
+                "\\u3066=>\\u30c6",
+                "\\u3067=>\\u30c7",
+                "\\u3068=>\\u30c8",
+                "\\u3069=>\\u30c9",
+                "\\u306a=>\\u30ca",
+                "\\u306b=>\\u30cb",
+                "\\u306c=>\\u30cc",
+                "\\u306d=>\\u30cd",
+                "\\u306e=>\\u30ce",
+                "\\u306f=>\\u30cf",
+                "\\u3070=>\\u30d0",
+                "\\u3071=>\\u30d1",
+                "\\u3072=>\\u30d2",
+                "\\u3073=>\\u30d3",
+                "\\u3074=>\\u30d4",
+                "\\u3075=>\\u30d5",
+                "\\u3076=>\\u30d6",
+                "\\u3077=>\\u30d7",
+                "\\u3078=>\\u30d8",
+                "\\u3079=>\\u30d9",
+                "\\u307a=>\\u30da",
+                "\\u307b=>\\u30db",
+                "\\u307c=>\\u30dc",
+                "\\u307d=>\\u30dd",
+                "\\u307e=>\\u30de",
+                "\\u307f=>\\u30df",
+                "\\u3080=>\\u30e0",
+                "\\u3081=>\\u30e1",
+                "\\u3082=>\\u30e2",
+                "\\u3083=>\\u30e3",
+                "\\u3084=>\\u30e4",
+                "\\u3085=>\\u30e5",
+                "\\u3086=>\\u30e6",
+                "\\u3087=>\\u30e7",
+                "\\u3088=>\\u30e8",
+                "\\u3089=>\\u30e9",
+                "\\u308a=>\\u30ea",
+                "\\u308b=>\\u30eb",
+                "\\u308c=>\\u30ec",
+                "\\u308d=>\\u30ed",
+                "\\u308e=>\\u30ee",
+                "\\u308f=>\\u30ef",
+                "\\u3090=>\\u30f0",
+                "\\u3091=>\\u30f1",
+                "\\u3092=>\\u30f2",
+                "\\u3093=>\\u30f3"
+            ]
+        },
+        "stconvertfix": {
+            "type": "mapping",
+            "mappings": [
+                "\\u606d\\u5f18=>\\u606d \\u5f18",
+                "\\u5138=>\\u3469"
+            ]
+        },
+        "tsconvert": {
+            "type": "stconvert",
+            "delimiter": "#",
+            "keep_both": false,
+            "convert_type": "t2s"
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/en-zh-sv.plain.expected 
b/tests/unit/fixtures/analyzer/en-zh-sv.plain.expected
new file mode 100644
index 0000000..0309757
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/en-zh-sv.plain.expected
@@ -0,0 +1,102 @@
+{
+    "analyzer": {
+        "en_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "en_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "zh_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "smartcn_stop",
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "sv_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "sv_icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "sv_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        },
+        "icu_folding": {
+            "type": "icu_folding"
+        },
+        "smartcn_stop": {
+            "type": "stop",
+            "stopwords": [
+                ","
+            ]
+        },
+        "sv_icu_folding": {
+            "type": "icu_folding",
+            "unicodeSetFilter": "[^\u00e5\u00e4\u00f6\u00c5\u00c4\u00d6]"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk-nolang.expected 
b/tests/unit/fixtures/analyzer/he-uk-nolang.expected
new file mode 100644
index 0000000..0935ec7
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk-nolang.expected
@@ -0,0 +1,95 @@
+{
+    "analyzer": {
+        "he_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "he_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "uk_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        },
+        "uk_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        },
+        "icu_folding": {
+            "type": "icu_folding"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk-nolang.plain.expected 
b/tests/unit/fixtures/analyzer/he-uk-nolang.plain.expected
new file mode 100644
index 0000000..0abb4bd
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk-nolang.plain.expected
@@ -0,0 +1,67 @@
+{
+    "analyzer": {
+        "he_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        },
+        "icu_folding": {
+            "type": "icu_folding"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk-noplug.expected 
b/tests/unit/fixtures/analyzer/he-uk-noplug.expected
new file mode 100644
index 0000000..af0c051
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk-noplug.expected
@@ -0,0 +1,88 @@
+{
+    "analyzer": {
+        "he_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ]
+        },
+        "he_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ]
+        },
+        "uk_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_text": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ]
+        },
+        "uk_text_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ]
+        }
+    },
+    "filter": {
+        "lowercase": {
+            "type": "lowercase"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk-noplug.plain.expected 
b/tests/unit/fixtures/analyzer/he-uk-noplug.plain.expected
new file mode 100644
index 0000000..ac947b1
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk-noplug.plain.expected
@@ -0,0 +1,60 @@
+{
+    "analyzer": {
+        "he_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "lowercase"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        }
+    },
+    "filter": {
+        "lowercase": {
+            "type": "lowercase"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk.expected 
b/tests/unit/fixtures/analyzer/he-uk.expected
new file mode 100644
index 0000000..d419d4d
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk.expected
@@ -0,0 +1,99 @@
+{
+    "analyzer": {
+        "he_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_text": {
+            "type": "custom",
+            "tokenizer": "hebrew",
+            "filter": [
+                "niqqud",
+                "hebrew_lemmatizer",
+                "icu_normalizer",
+                "icu_folding"
+            ]
+        },
+        "he_text_search": {
+            "type": "custom",
+            "tokenizer": "hebrew",
+            "filter": [
+                "niqqud",
+                "hebrew_lemmatizer",
+                "icu_normalizer",
+                "icu_folding"
+            ]
+        },
+        "uk_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_text": {
+            "type": "ukrainian",
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_text_search": {
+            "type": "ukrainian",
+            "char_filter": [
+                "word_break_helper"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        },
+        "icu_folding": {
+            "type": "icu_folding"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk.plain.expected 
b/tests/unit/fixtures/analyzer/he-uk.plain.expected
new file mode 100644
index 0000000..0abb4bd
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk.plain.expected
@@ -0,0 +1,67 @@
+{
+    "analyzer": {
+        "he_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer",
+                "preserve_original_recorder",
+                "icu_folding",
+                "preserve_original"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "he_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        },
+        "uk_plain_search": {
+            "type": "custom",
+            "tokenizer": "standard",
+            "filter": [
+                "icu_normalizer"
+            ],
+            "char_filter": [
+                "word_break_helper"
+            ]
+        }
+    },
+    "filter": {
+        "icu_normalizer": {
+            "type": "icu_normalizer",
+            "name": "nfkc_cf"
+        },
+        "icu_folding": {
+            "type": "icu_folding"
+        }
+    },
+    "char_filter": {
+        "word_break_helper": {
+            "type": "mapping",
+            "mappings": [
+                "_=>\\u0020",
+                ".=>\\u0020",
+                "(=>\\u0020",
+                ")=>\\u0020"
+            ]
+        }
+    }
+}
\ No newline at end of file

-- 
To view, visit https://gerrit.wikimedia.org/r/393689
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to