jenkins-bot has submitted this change and it was merged. ( https://gerrit.wikimedia.org/r/393689 )
Change subject: Port per-language indexing code from Wikibase to CirrusSearch
......................................................................
Port per-language indexing code from Wikibase to CirrusSearch
Bug: T176903
Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa
---
M includes/Maintenance/AnalysisConfigBuilder.php
M tests/unit/Maintenance/AnalysisConfigBuilderTest.php
A tests/unit/fixtures/analyzer/all_defaults.expected
A tests/unit/fixtures/analyzer/all_defaults.plain.expected
A tests/unit/fixtures/analyzer/en-ru-es-de-zh.expected
A tests/unit/fixtures/analyzer/en-ru-es-de-zh.plain.expected
A tests/unit/fixtures/analyzer/en-zh-sv.expected
A tests/unit/fixtures/analyzer/en-zh-sv.plain.expected
A tests/unit/fixtures/analyzer/he-uk-nolang.expected
A tests/unit/fixtures/analyzer/he-uk-nolang.plain.expected
A tests/unit/fixtures/analyzer/he-uk-noplug.expected
A tests/unit/fixtures/analyzer/he-uk-noplug.plain.expected
A tests/unit/fixtures/analyzer/he-uk.expected
A tests/unit/fixtures/analyzer/he-uk.plain.expected
14 files changed, 1,967 insertions(+), 0 deletions(-)
Approvals:
Tjones: Looks good to me, approved
Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
jenkins-bot: Verified
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php
b/includes/Maintenance/AnalysisConfigBuilder.php
index 0ee472b..ae57a5c 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -987,6 +987,150 @@
}
/**
+ * Get list of filters that are mentioned in analyzers but not defined
+ * explicitly.
+ * @param array[] $config Full configuration array
+ * @param string[] $analyzers List of analyzers to consider.
+ * @return array List of default filters, each containing only the filter type
+ */
+ private function getDefaultFilters( array &$config, array $analyzers )
{
+ $defaultFilters = [];
+ foreach ( $analyzers as $analyzer ) {
+ if ( empty( $config['analyzer'][$analyzer]['filter'] )
) {
+ continue;
+ }
+ foreach ( $config['analyzer'][$analyzer]['filter'] as
$filterName ) {
+ if ( !isset( $config['filter'][$filterName] ) )
{
+ // This is the default definition for the built-in filter
+ $defaultFilters[$filterName] = [ 'type'
=> $filterName ];
+ }
+ }
+ }
+ return $defaultFilters;
+ }
+
+ /**
+ * Check every filter in the config - if it's the same as in the old config,
+ * ignore it. If it has the same name, but different content - create a new filter
+ * with a different name by prefixing it with the language code.
+ *
+ * @param array[] $config Configuration being processed
+ * @param array[] $standardFilters Existing filters list
+ * @param array[] $defaultFilters List of default filters already mentioned in the config
+ * @param string $prefix Prefix for disambiguation
+ * @return array[] The list of filters not in the old config.
+ */
+ private function resolveFilters( array &$config, array
$standardFilters, array $defaultFilters, $prefix ) {
+ $resultFilters = [];
+ foreach ( $config['filter'] as $name => $filter ) {
+ $existingFilter = null;
+ if ( isset( $standardFilters[$name] ) ) {
+ $existingFilter = $standardFilters[$name];
+ } elseif ( isset( $defaultFilters[$name] ) ) {
+ $existingFilter = $defaultFilters[$name];
+ }
+
+ if ( $existingFilter ) { // Filter with this name
already exists
+ if ( $existingFilter != $filter ) {
+ // filter with the same name but different config - need to
+ // rename by adding prefix
+ $newName = $prefix . '_' . $name;
+ $this->replaceFilter( $config, $name,
$newName );
+ $resultFilters[$newName] = $filter;
+ }
+ } else {
+ $resultFilters[$name] = $filter;
+ }
+ }
+ return $resultFilters;
+ }
+
+ /**
+ * Replace a certain filter name with a different name in every analyzer of the config.
+ * @param array[] $config Configuration being processed
+ * @param string $oldName
+ * @param string $newName
+ */
+ private function replaceFilter( array &$config, $oldName, $newName ) {
+ foreach ( $config['analyzer'] as &$analyzer ) {
+ if ( !isset( $analyzer['filter'] ) ) {
+ continue;
+ }
+ $analyzer['filter'] = array_map( function ( $filter )
use ( $oldName, $newName ) {
+ if ( $filter === $oldName ) {
+ return $newName;
+ }
+ return $filter;
+ }, $analyzer['filter'] );
+ }
+ }
+
+ /**
+ * Merge per-language config into the main config.
+ * It will copy the specific analyzer and all dependent filters and char_filters.
+ * @param array $config Main config
+ * @param array $langConfig Per-language config
+ * @param string $name Name for analyzer whose config we're merging
+ * @param string $prefix Prefix for this configuration
+ */
+ private function mergeConfig( array &$config, array $langConfig, $name,
$prefix ) {
+ $analyzer = $langConfig['analyzer'][$name];
+ $config['analyzer'][$prefix . '_' . $name] = $analyzer;
+ if ( !empty( $analyzer['filter'] ) ) {
+ // Add private filters for this analyzer
+ foreach ( $analyzer['filter'] as $filter ) {
+ // Copy filters that are in the language config but not in the main config.
+ // We would not copy the same filter into the main config since due to
+ // the resolution step we know they are the same (otherwise we would have
+ // renamed it).
+ if ( isset( $langConfig['filter'][$filter] ) &&
+ !isset( $config['filter'][$filter] ) ) {
+ $config['filter'][$filter] =
$langConfig['filter'][$filter];
+ }
+ }
+ }
+ if ( !empty( $analyzer['char_filter'] ) ) {
+ // Add private char_filters for this analyzer
+ foreach ( $analyzer['char_filter'] as $filter ) {
+ // Here, unlike above, we do not check for $langConfig since we assume
+ // the language config is not broken and all char filters are namespaced
+ // nicely, so if the filter is mentioned in the analyzer it is also defined.
+ if ( !isset( $config['char_filter'][$filter] )
) {
+ $config['char_filter'][$filter] =
$langConfig['char_filter'][$filter];
+ }
+ }
+ }
+ }
+
+ /**
+ * Create per-language configs for specific analyzers, separating and namespacing
+ * filters that are different between languages.
+ * @param array &$config Existing config, will be modified
+ * @param string[] $languages List of languages to process
+ * @param string[] $analyzers List of analyzers to process
+ */
+ public function buildLanguageConfigs( array &$config, array $languages,
array $analyzers ) {
+ $defaultFilters = $this->getDefaultFilters( $config, $analyzers
);
+ foreach ( $languages as $lang ) {
+ $langConfig = $this->buildConfig( $lang );
+ $defaultFilters += $this->getDefaultFilters(
$langConfig, $analyzers );
+ }
+ foreach ( $languages as $lang ) {
+ $langConfig = $this->buildConfig( $lang );
+ // Analyzer is: tokenizer + filter + char_filter
+ // Tokenizers don't seem to be subject to customization now
+ // Char filters are nicely namespaced
+ // Filters are NOT - e.g. lowercase & icu_folding filters are different for different
+ // languages! So we need to do some disambiguation here.
+ $langConfig['filter'] = $this->resolveFilters(
$langConfig, $config['filter'], $defaultFilters, $lang );
+ // Merge configs
+ foreach ( $analyzers as $analyzer ) {
+ $this->mergeConfig( $config, $langConfig,
$analyzer, $lang );
+ }
+ }
+ }
+
+ /**
* @return bool true if the icu analyzer is available.
*/
public function isIcuAvailable() {
diff --git a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
index e188ed6..9f11ee7 100644
--- a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
+++ b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
@@ -418,4 +418,101 @@
$this->assertEquals( $expected, $builder->buildConfig()
);
}
}
+
+ public function languageConfigDataProvider() {
+ $emptyConfig = [
+ 'analyzer' => [],
+ 'filter' => [],
+ 'char_filter' => []
+ ];
+ $allPlugins = [
+ 'extra',
+ 'analysis-icu',
+ 'analysis-stempel',
+ 'analysis-kuromoji',
+ 'analysis-smartcn',
+ 'analysis-hebrew',
+ 'analysis-ukrainian',
+ 'analysis-stconvert'
+ ];
+
+ return [
+ "some languages" => [
+ [ 'en', 'ru', 'es', 'de', 'zh' ],
+ $emptyConfig,
+ $allPlugins,
+ 'en-ru-es-de-zh',
+ ],
+ // sv has custom icu_folding filter
+ "sv" => [
+ [ 'en', 'zh', 'sv' ],
+ $emptyConfig,
+ $allPlugins,
+ 'en-zh-sv',
+ ],
+ "with plugins" => [
+ [ 'he', 'uk' ],
+ $emptyConfig,
+ $allPlugins,
+ 'he-uk',
+ ],
+ "without language plugins" => [
+ [ 'he', 'uk' ],
+ $emptyConfig,
+ [ 'extra', 'analysis-icu' ],
+ 'he-uk-nolang',
+ ],
+ "without any plugins" => [
+ [ 'he', 'uk' ],
+ $emptyConfig,
+ [],
+ 'he-uk-noplug',
+ ],
+ "all default languages" => [
+ [ 'ch', 'fy', 'kab', 'ti', 'xmf' ],
+ $emptyConfig,
+ [ 'extra', 'analysis-icu' ],
+ 'all_defaults',
+ ],
+ ];
+ }
+
+ /**
+ * @param string[] $languages
+ * @param array $oldConfig
+ * @param string[] $plugins
+ * @param string $expectedConfig Filename with expected config
+ * @dataProvider languageConfigDataProvider
+ */
+ public function testAnalysisConfig( $languages, $oldConfig, $plugins,
$expectedConfig ) {
+ // We use these static settings because we rely on tests in main
+ // AnalysisConfigBuilderTest to handle variations
+ $config = new HashSearchConfig( [ 'CirrusSearchUseIcuFolding'
=> 'default' ] );
+
+ $builder = new AnalysisConfigBuilder( 'en', $plugins, $config );
+ $prevConfig = $oldConfig;
+ $builder->buildLanguageConfigs( $oldConfig, $languages,
+ [ 'plain', 'plain_search', 'text', 'text_search' ] );
+ $expectedFile = __DIR__ .
"/../fixtures/analyzer/$expectedConfig.expected";
+ if ( is_file( $expectedFile ) ) {
+ $expected = json_decode( file_get_contents(
$expectedFile ), true );
+ $this->assertEquals( $expected, $oldConfig );
+ } else {
+ file_put_contents( $expectedFile, json_encode(
$oldConfig, JSON_PRETTY_PRINT ) );
+ $this->markTestSkipped( "Generated new fixture" );
+ }
+
+ $oldConfig = $prevConfig;
+ $builder->buildLanguageConfigs( $oldConfig, $languages,
+ [ 'plain', 'plain_search' ] );
+ $expectedFile = __DIR__ .
"/../fixtures/analyzer/$expectedConfig.plain.expected";
+ if ( is_file( $expectedFile ) ) {
+ $expected = json_decode( file_get_contents(
$expectedFile ), true );
+ $this->assertEquals( $expected, $oldConfig );
+ } else {
+ file_put_contents( $expectedFile, json_encode(
$oldConfig, JSON_PRETTY_PRINT ) );
+ $this->markTestSkipped( "Generated new fixture" );
+ }
+ }
+
}
diff --git a/tests/unit/fixtures/analyzer/all_defaults.expected
b/tests/unit/fixtures/analyzer/all_defaults.expected
new file mode 100644
index 0000000..1f14fb4
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/all_defaults.expected
@@ -0,0 +1,191 @@
+{
+ "analyzer": {
+ "ch_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "ch_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "ch_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "ch_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "fy_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "fy_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "fy_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "fy_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "kab_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "kab_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "kab_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "kab_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "ti_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "ti_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "ti_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "ti_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "xmf_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "xmf_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "xmf_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "xmf_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/all_defaults.plain.expected
b/tests/unit/fixtures/analyzer/all_defaults.plain.expected
new file mode 100644
index 0000000..77c7142
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/all_defaults.plain.expected
@@ -0,0 +1,121 @@
+{
+ "analyzer": {
+ "ch_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "ch_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "fy_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "fy_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "kab_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "kab_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "ti_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "ti_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "xmf_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "xmf_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/en-ru-es-de-zh.expected
b/tests/unit/fixtures/analyzer/en-ru-es-de-zh.expected
new file mode 100644
index 0000000..5df50b9
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/en-ru-es-de-zh.expected
@@ -0,0 +1,379 @@
+{
+ "analyzer": {
+ "en_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "en_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "en_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "char_filter": [
+ "word_break_helper",
+ "kana_map"
+ ],
+ "filter": [
+ "aggressive_splitting",
+ "possessive_english",
+ "icu_normalizer",
+ "stop",
+ "icu_folding",
+ "kstem",
+ "custom_stem"
+ ]
+ },
+ "en_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "char_filter": [
+ "word_break_helper",
+ "kana_map"
+ ],
+ "filter": [
+ "aggressive_splitting",
+ "possessive_english",
+ "icu_normalizer",
+ "stop",
+ "icu_folding",
+ "kstem",
+ "custom_stem"
+ ]
+ },
+ "ru_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper",
+ "russian_charfilter"
+ ]
+ },
+ "ru_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper",
+ "russian_charfilter"
+ ]
+ },
+ "ru_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "char_filter": [
+ "russian_charfilter"
+ ],
+ "filter": [
+ "icu_normalizer",
+ "russian_stop",
+ "russian_stemmer"
+ ]
+ },
+ "ru_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "char_filter": [
+ "russian_charfilter"
+ ],
+ "filter": [
+ "icu_normalizer",
+ "russian_stop",
+ "russian_stemmer"
+ ]
+ },
+ "es_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "es_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "es_text": {
+ "type": "spanish",
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "es_text_search": {
+ "type": "spanish",
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "de_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "de_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "de_text": {
+ "type": "german",
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "de_text_search": {
+ "type": "german",
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_text": {
+ "type": "custom",
+ "tokenizer": "smartcn_tokenizer",
+ "char_filter": [
+ "stconvertfix",
+ "tsconvert"
+ ],
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ]
+ },
+ "zh_text_search": {
+ "type": "custom",
+ "tokenizer": "smartcn_tokenizer",
+ "char_filter": [
+ "stconvertfix",
+ "tsconvert"
+ ],
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ },
+ "icu_folding": {
+ "type": "icu_folding"
+ },
+ "aggressive_splitting": {
+ "type": "word_delimiter",
+ "stem_english_possessive": false,
+ "preserve_original": false
+ },
+ "possessive_english": {
+ "type": "stemmer",
+ "language": "possessive_english"
+ },
+ "custom_stem": {
+ "type": "stemmer_override",
+ "rules": "guidelines => guideline"
+ },
+ "russian_stop": {
+ "type": "stop",
+ "stopwords": "_russian_"
+ },
+ "russian_stemmer": {
+ "type": "stemmer",
+ "language": "russian"
+ },
+ "smartcn_stop": {
+ "type": "stop",
+ "stopwords": [
+ ","
+ ]
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ },
+ "kana_map": {
+ "type": "mapping",
+ "mappings": [
+ "\\u3041=>\\u30a1",
+ "\\u3042=>\\u30a2",
+ "\\u3043=>\\u30a3",
+ "\\u3044=>\\u30a4",
+ "\\u3045=>\\u30a5",
+ "\\u3046=>\\u30a6",
+ "\\u3094=>\\u30f4",
+ "\\u3047=>\\u30a7",
+ "\\u3048=>\\u30a8",
+ "\\u3049=>\\u30a9",
+ "\\u304a=>\\u30aa",
+ "\\u3095=>\\u30f5",
+ "\\u304b=>\\u30ab",
+ "\\u304c=>\\u30ac",
+ "\\u304d=>\\u30ad",
+ "\\u304e=>\\u30ae",
+ "\\u304f=>\\u30af",
+ "\\u3050=>\\u30b0",
+ "\\u3096=>\\u30f6",
+ "\\u3051=>\\u30b1",
+ "\\u3052=>\\u30b2",
+ "\\u3053=>\\u30b3",
+ "\\u3054=>\\u30b4",
+ "\\u3055=>\\u30b5",
+ "\\u3056=>\\u30b6",
+ "\\u3057=>\\u30b7",
+ "\\u3058=>\\u30b8",
+ "\\u3059=>\\u30b9",
+ "\\u305a=>\\u30ba",
+ "\\u305b=>\\u30bb",
+ "\\u305c=>\\u30bc",
+ "\\u305d=>\\u30bd",
+ "\\u305e=>\\u30be",
+ "\\u305f=>\\u30bf",
+ "\\u3060=>\\u30c0",
+ "\\u3061=>\\u30c1",
+ "\\u3062=>\\u30c2",
+ "\\u3063=>\\u30c3",
+ "\\u3064=>\\u30c4",
+ "\\u3065=>\\u30c5",
+ "\\u3066=>\\u30c6",
+ "\\u3067=>\\u30c7",
+ "\\u3068=>\\u30c8",
+ "\\u3069=>\\u30c9",
+ "\\u306a=>\\u30ca",
+ "\\u306b=>\\u30cb",
+ "\\u306c=>\\u30cc",
+ "\\u306d=>\\u30cd",
+ "\\u306e=>\\u30ce",
+ "\\u306f=>\\u30cf",
+ "\\u3070=>\\u30d0",
+ "\\u3071=>\\u30d1",
+ "\\u3072=>\\u30d2",
+ "\\u3073=>\\u30d3",
+ "\\u3074=>\\u30d4",
+ "\\u3075=>\\u30d5",
+ "\\u3076=>\\u30d6",
+ "\\u3077=>\\u30d7",
+ "\\u3078=>\\u30d8",
+ "\\u3079=>\\u30d9",
+ "\\u307a=>\\u30da",
+ "\\u307b=>\\u30db",
+ "\\u307c=>\\u30dc",
+ "\\u307d=>\\u30dd",
+ "\\u307e=>\\u30de",
+ "\\u307f=>\\u30df",
+ "\\u3080=>\\u30e0",
+ "\\u3081=>\\u30e1",
+ "\\u3082=>\\u30e2",
+ "\\u3083=>\\u30e3",
+ "\\u3084=>\\u30e4",
+ "\\u3085=>\\u30e5",
+ "\\u3086=>\\u30e6",
+ "\\u3087=>\\u30e7",
+ "\\u3088=>\\u30e8",
+ "\\u3089=>\\u30e9",
+ "\\u308a=>\\u30ea",
+ "\\u308b=>\\u30eb",
+ "\\u308c=>\\u30ec",
+ "\\u308d=>\\u30ed",
+ "\\u308e=>\\u30ee",
+ "\\u308f=>\\u30ef",
+ "\\u3090=>\\u30f0",
+ "\\u3091=>\\u30f1",
+ "\\u3092=>\\u30f2",
+ "\\u3093=>\\u30f3"
+ ]
+ },
+ "russian_charfilter": {
+ "type": "mapping",
+ "mappings": [
+ "\\u0301=>",
+ "\\u0130=>I",
+ "\\u0435\\u0308=>\\u0435",
+ "\\u0415\\u0308=>\\u0415",
+ "\\u0451=>\\u0435",
+ "\\u0401=>\\u0415"
+ ]
+ },
+ "stconvertfix": {
+ "type": "mapping",
+ "mappings": [
+ "\\u606d\\u5f18=>\\u606d \\u5f18",
+ "\\u5138=>\\u3469"
+ ]
+ },
+ "tsconvert": {
+ "type": "stconvert",
+ "delimiter": "#",
+ "keep_both": false,
+ "convert_type": "t2s"
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/en-ru-es-de-zh.plain.expected
b/tests/unit/fixtures/analyzer/en-ru-es-de-zh.plain.expected
new file mode 100644
index 0000000..2ddbb56
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/en-ru-es-de-zh.plain.expected
@@ -0,0 +1,148 @@
+{
+ "analyzer": {
+ "en_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "en_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "ru_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper",
+ "russian_charfilter"
+ ]
+ },
+ "ru_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper",
+ "russian_charfilter"
+ ]
+ },
+ "es_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "es_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "de_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "de_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ },
+ "icu_folding": {
+ "type": "icu_folding"
+ },
+ "smartcn_stop": {
+ "type": "stop",
+ "stopwords": [
+ ","
+ ]
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ },
+ "russian_charfilter": {
+ "type": "mapping",
+ "mappings": [
+ "\\u0301=>",
+ "\\u0130=>I",
+ "\\u0435\\u0308=>\\u0435",
+ "\\u0415\\u0308=>\\u0415",
+ "\\u0451=>\\u0435",
+ "\\u0401=>\\u0415"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/en-zh-sv.expected
b/tests/unit/fixtures/analyzer/en-zh-sv.expected
new file mode 100644
index 0000000..3990158
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/en-zh-sv.expected
@@ -0,0 +1,309 @@
+{
+ "analyzer": {
+ "en_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "en_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "en_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "char_filter": [
+ "word_break_helper",
+ "kana_map"
+ ],
+ "filter": [
+ "aggressive_splitting",
+ "possessive_english",
+ "icu_normalizer",
+ "stop",
+ "icu_folding",
+ "kstem",
+ "custom_stem"
+ ]
+ },
+ "en_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "char_filter": [
+ "word_break_helper",
+ "kana_map"
+ ],
+ "filter": [
+ "aggressive_splitting",
+ "possessive_english",
+ "icu_normalizer",
+ "stop",
+ "icu_folding",
+ "kstem",
+ "custom_stem"
+ ]
+ },
+ "zh_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_text": {
+ "type": "custom",
+ "tokenizer": "smartcn_tokenizer",
+ "char_filter": [
+ "stconvertfix",
+ "tsconvert"
+ ],
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ]
+ },
+ "zh_text_search": {
+ "type": "custom",
+ "tokenizer": "smartcn_tokenizer",
+ "char_filter": [
+ "stconvertfix",
+ "tsconvert"
+ ],
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ]
+ },
+ "sv_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "sv_icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "sv_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "sv_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "swedish_stop",
+ "swedish_stemmer",
+ "preserve_original_recorder",
+ "sv_icu_folding",
+ "preserve_original"
+ ]
+ },
+ "sv_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "swedish_stop",
+ "swedish_stemmer",
+ "preserve_original_recorder",
+ "sv_icu_folding",
+ "preserve_original"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ },
+ "icu_folding": {
+ "type": "icu_folding"
+ },
+ "aggressive_splitting": {
+ "type": "word_delimiter",
+ "stem_english_possessive": false,
+ "preserve_original": false
+ },
+ "possessive_english": {
+ "type": "stemmer",
+ "language": "possessive_english"
+ },
+ "custom_stem": {
+ "type": "stemmer_override",
+ "rules": "guidelines => guideline"
+ },
+ "smartcn_stop": {
+ "type": "stop",
+ "stopwords": [
+ ","
+ ]
+ },
+ "sv_icu_folding": {
+ "type": "icu_folding",
+ "unicodeSetFilter": "[^\u00e5\u00e4\u00f6\u00c5\u00c4\u00d6]"
+ },
+ "swedish_stop": {
+ "type": "stop",
+ "stopwords": "_swedish_"
+ },
+ "swedish_stemmer": {
+ "type": "stemmer",
+ "language": "swedish"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ },
+ "kana_map": {
+ "type": "mapping",
+ "mappings": [
+ "\\u3041=>\\u30a1",
+ "\\u3042=>\\u30a2",
+ "\\u3043=>\\u30a3",
+ "\\u3044=>\\u30a4",
+ "\\u3045=>\\u30a5",
+ "\\u3046=>\\u30a6",
+ "\\u3094=>\\u30f4",
+ "\\u3047=>\\u30a7",
+ "\\u3048=>\\u30a8",
+ "\\u3049=>\\u30a9",
+ "\\u304a=>\\u30aa",
+ "\\u3095=>\\u30f5",
+ "\\u304b=>\\u30ab",
+ "\\u304c=>\\u30ac",
+ "\\u304d=>\\u30ad",
+ "\\u304e=>\\u30ae",
+ "\\u304f=>\\u30af",
+ "\\u3050=>\\u30b0",
+ "\\u3096=>\\u30f6",
+ "\\u3051=>\\u30b1",
+ "\\u3052=>\\u30b2",
+ "\\u3053=>\\u30b3",
+ "\\u3054=>\\u30b4",
+ "\\u3055=>\\u30b5",
+ "\\u3056=>\\u30b6",
+ "\\u3057=>\\u30b7",
+ "\\u3058=>\\u30b8",
+ "\\u3059=>\\u30b9",
+ "\\u305a=>\\u30ba",
+ "\\u305b=>\\u30bb",
+ "\\u305c=>\\u30bc",
+ "\\u305d=>\\u30bd",
+ "\\u305e=>\\u30be",
+ "\\u305f=>\\u30bf",
+ "\\u3060=>\\u30c0",
+ "\\u3061=>\\u30c1",
+ "\\u3062=>\\u30c2",
+ "\\u3063=>\\u30c3",
+ "\\u3064=>\\u30c4",
+ "\\u3065=>\\u30c5",
+ "\\u3066=>\\u30c6",
+ "\\u3067=>\\u30c7",
+ "\\u3068=>\\u30c8",
+ "\\u3069=>\\u30c9",
+ "\\u306a=>\\u30ca",
+ "\\u306b=>\\u30cb",
+ "\\u306c=>\\u30cc",
+ "\\u306d=>\\u30cd",
+ "\\u306e=>\\u30ce",
+ "\\u306f=>\\u30cf",
+ "\\u3070=>\\u30d0",
+ "\\u3071=>\\u30d1",
+ "\\u3072=>\\u30d2",
+ "\\u3073=>\\u30d3",
+ "\\u3074=>\\u30d4",
+ "\\u3075=>\\u30d5",
+ "\\u3076=>\\u30d6",
+ "\\u3077=>\\u30d7",
+ "\\u3078=>\\u30d8",
+ "\\u3079=>\\u30d9",
+ "\\u307a=>\\u30da",
+ "\\u307b=>\\u30db",
+ "\\u307c=>\\u30dc",
+ "\\u307d=>\\u30dd",
+ "\\u307e=>\\u30de",
+ "\\u307f=>\\u30df",
+ "\\u3080=>\\u30e0",
+ "\\u3081=>\\u30e1",
+ "\\u3082=>\\u30e2",
+ "\\u3083=>\\u30e3",
+ "\\u3084=>\\u30e4",
+ "\\u3085=>\\u30e5",
+ "\\u3086=>\\u30e6",
+ "\\u3087=>\\u30e7",
+ "\\u3088=>\\u30e8",
+ "\\u3089=>\\u30e9",
+ "\\u308a=>\\u30ea",
+ "\\u308b=>\\u30eb",
+ "\\u308c=>\\u30ec",
+ "\\u308d=>\\u30ed",
+ "\\u308e=>\\u30ee",
+ "\\u308f=>\\u30ef",
+ "\\u3090=>\\u30f0",
+ "\\u3091=>\\u30f1",
+ "\\u3092=>\\u30f2",
+ "\\u3093=>\\u30f3"
+ ]
+ },
+ "stconvertfix": {
+ "type": "mapping",
+ "mappings": [
+ "\\u606d\\u5f18=>\\u606d \\u5f18",
+ "\\u5138=>\\u3469"
+ ]
+ },
+ "tsconvert": {
+ "type": "stconvert",
+ "delimiter": "#",
+ "keep_both": false,
+ "convert_type": "t2s"
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/en-zh-sv.plain.expected
b/tests/unit/fixtures/analyzer/en-zh-sv.plain.expected
new file mode 100644
index 0000000..0309757
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/en-zh-sv.plain.expected
@@ -0,0 +1,102 @@
+{
+ "analyzer": {
+ "en_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "en_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "zh_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "smartcn_stop",
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "sv_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "sv_icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "sv_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ },
+ "icu_folding": {
+ "type": "icu_folding"
+ },
+ "smartcn_stop": {
+ "type": "stop",
+ "stopwords": [
+ ","
+ ]
+ },
+ "sv_icu_folding": {
+ "type": "icu_folding",
+ "unicodeSetFilter": "[^\u00e5\u00e4\u00f6\u00c5\u00c4\u00d6]"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk-nolang.expected
b/tests/unit/fixtures/analyzer/he-uk-nolang.expected
new file mode 100644
index 0000000..0935ec7
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk-nolang.expected
@@ -0,0 +1,95 @@
+{
+ "analyzer": {
+ "he_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "he_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "uk_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ },
+ "uk_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ },
+ "icu_folding": {
+ "type": "icu_folding"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk-nolang.plain.expected
b/tests/unit/fixtures/analyzer/he-uk-nolang.plain.expected
new file mode 100644
index 0000000..0abb4bd
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk-nolang.plain.expected
@@ -0,0 +1,67 @@
+{
+ "analyzer": {
+ "he_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ },
+ "icu_folding": {
+ "type": "icu_folding"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk-noplug.expected
b/tests/unit/fixtures/analyzer/he-uk-noplug.expected
new file mode 100644
index 0000000..af0c051
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk-noplug.expected
@@ -0,0 +1,88 @@
+{
+ "analyzer": {
+ "he_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ]
+ },
+ "he_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ]
+ },
+ "uk_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_text": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ]
+ },
+ "uk_text_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ]
+ }
+ },
+ "filter": {
+ "lowercase": {
+ "type": "lowercase"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk-noplug.plain.expected
b/tests/unit/fixtures/analyzer/he-uk-noplug.plain.expected
new file mode 100644
index 0000000..ac947b1
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk-noplug.plain.expected
@@ -0,0 +1,60 @@
+{
+ "analyzer": {
+ "he_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "lowercase"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ }
+ },
+ "filter": {
+ "lowercase": {
+ "type": "lowercase"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk.expected
b/tests/unit/fixtures/analyzer/he-uk.expected
new file mode 100644
index 0000000..d419d4d
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk.expected
@@ -0,0 +1,99 @@
+{
+ "analyzer": {
+ "he_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_text": {
+ "type": "custom",
+ "tokenizer": "hebrew",
+ "filter": [
+ "niqqud",
+ "hebrew_lemmatizer",
+ "icu_normalizer",
+ "icu_folding"
+ ]
+ },
+ "he_text_search": {
+ "type": "custom",
+ "tokenizer": "hebrew",
+ "filter": [
+ "niqqud",
+ "hebrew_lemmatizer",
+ "icu_normalizer",
+ "icu_folding"
+ ]
+ },
+ "uk_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_text": {
+ "type": "ukrainian",
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_text_search": {
+ "type": "ukrainian",
+ "char_filter": [
+ "word_break_helper"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ },
+ "icu_folding": {
+ "type": "icu_folding"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
diff --git a/tests/unit/fixtures/analyzer/he-uk.plain.expected
b/tests/unit/fixtures/analyzer/he-uk.plain.expected
new file mode 100644
index 0000000..0abb4bd
--- /dev/null
+++ b/tests/unit/fixtures/analyzer/he-uk.plain.expected
@@ -0,0 +1,67 @@
+{
+ "analyzer": {
+ "he_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer",
+ "preserve_original_recorder",
+ "icu_folding",
+ "preserve_original"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "he_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ },
+ "uk_plain_search": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [
+ "icu_normalizer"
+ ],
+ "char_filter": [
+ "word_break_helper"
+ ]
+ }
+ },
+ "filter": {
+ "icu_normalizer": {
+ "type": "icu_normalizer",
+ "name": "nfkc_cf"
+ },
+ "icu_folding": {
+ "type": "icu_folding"
+ }
+ },
+ "char_filter": {
+ "word_break_helper": {
+ "type": "mapping",
+ "mappings": [
+ "_=>\\u0020",
+ ".=>\\u0020",
+ "(=>\\u0020",
+ ")=>\\u0020"
+ ]
+ }
+ }
+}
\ No newline at end of file
--
To view, visit https://gerrit.wikimedia.org/r/393689
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa
Gerrit-PatchSet: 6
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits