jenkins-bot has submitted this change and it was merged.
Change subject: Add support for ICU tokenization
......................................................................
Add support for ICU tokenization
The icu tokenizer uses an approach based on dictionaries to break
words.
For Chinese: 灯笼 is properly tokenized as a single token while the
standard tokenizer would emit two separate tokens.
Change-Id: I930e34b24db825b21c1a7eca5bf28cc09a76c152
---
M CirrusSearch.php
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M tests/unit/Maintenance/AnalysisConfigBuilderTest.php
4 files changed, 135 insertions(+), 7 deletions(-)
Approvals:
Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
Tjones: Looks good to me, but someone else must approve
EBernhardson: Looks good to me, approved
jenkins-bot: Verified
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 258c3ca..5e60b7a 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -795,6 +795,21 @@
$wgCirrusSearchICUFoldingUnicodeSetFilter = null;
/**
+ * Enable the ICU Tokenizer instead of the standard tokenizer
+ * for plain fields.
+ * It may be more suited for languages that do not use spaces
+ * to break words.
+ * Requires the ICU plugin installed
+ * Set to:
+ * - default: let cirrus decide if the ICU tokenizer can be enabled according to wiki language
+ * - yes: force the use of ICU tokenizer
+ * - no: disable the ICU tokenizer even if cirrus thinks it can be enabled
+ * NOTE: Experimental
+ */
+$wgCirrusSearchUseIcuTokenizer = 'default';
+
+
+/**
* Set the default scoring function to be used by
maintenance/updateSuggesterIndex.php
* @see includes/BuildDocument/SuggestScoring.php for more details about
scoring functions
* NOTE: if you change the scoring method you'll have to rebuild the suggester
index.
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php
b/includes/Maintenance/AnalysisConfigBuilder.php
index f489f05..831d34a 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -48,7 +48,12 @@
/**
* @var boolean true if icu folding is requested and available
*/
- private $icuFolding;
+ protected $icuFolding;
+
+ /**
+ * @var boolean true if the icu tokenizer is requested and available
+ */
+ protected $icuTokenizer;
/**
* @var array Similarity algo (tf/idf, bm25, etc) configuration
@@ -85,6 +90,7 @@
$this->config = $config;
$this->icuFolding = $this->shouldActivateIcuFolding( $plugins );
+ $this->icuTokenizer = $this->shouldActivateIcuTokenization();
}
/**
@@ -117,6 +123,27 @@
}
/**
+ * Determine if the icu tokenizer can be enabled
+ * @return bool
+ */
+ private function shouldActivateIcuTokenization() {
+ if ( !$this->icu ) {
+ // requires the icu plugin
+ return false;
+ }
+ // Expected values: 'yes', 'no' or 'default'; anything else ends up in
+ // the final default branch and disables the icu tokenizer.
+ $in_config = $this->config->get( 'CirrusSearchUseIcuTokenizer'
);
+ switch( $in_config ) {
+ // 'yes' and 'no' are explicit overrides, the language is not consulted
+ case 'yes': return true;
+ case 'no': return false;
+ case 'default':
+ // use the per-language flag when one is registered for this language
+ if ( isset(
$this->languagesWithIcuTokenization[$this->language] ) ) {
+ return
$this->languagesWithIcuTokenization[$this->language];
+ }
+ // no flag registered for this language: intentional fall through
+ default: return false;
+ }
+ }
+
+ /**
* Build the analysis config.
*
* @return array the analysis config
@@ -124,6 +151,9 @@
public function buildConfig() {
$config = $this->customize( $this->defaults() );
Hooks::run( 'CirrusSearchAnalysisConfig', [ &$config ] );
+ if ( $this->icuTokenizer ) {
+ $config = $this->enableICUTokenizer( $config );
+ }
if ( $this->icuFolding ) {
$config = $this->enableICUFolding( $config );
}
@@ -141,6 +171,22 @@
return $this->similarity['similarity'];
}
return null;
+ }
+ /**
+ * Replace the standard tokenizer with icu_tokenizer in the analyzers
+ * of an analysis config.
+ * Only custom analyzers are considered: an analyzer declaring a type
+ * other than 'custom' is left untouched, an analyzer without a type is
+ * treated as custom. Analyzers using another tokenizer than 'standard'
+ * are not modified.
+ * @param mixed[] $config analysis config, analyzers are read from the
+ * 'analyzer' key
+ * @return mixed[] updated config
+ */
+ public function enableICUTokenizer( array $config ) {
+ if ( !isset( $config['analyzer'] ) || !is_array( $config['analyzer'] ) ) {
+ // Nothing to update; iterating by reference over a missing key
+ // would create it in the returned config as a side effect.
+ return $config;
+ }
+ foreach ( $config['analyzer'] as &$analyzer ) {
+ if ( isset( $analyzer['type'] ) && $analyzer['type'] !== 'custom' ) {
+ continue;
+ }
+ if ( isset( $analyzer['tokenizer'] ) && $analyzer['tokenizer'] === 'standard' ) {
+ $analyzer['tokenizer'] = 'icu_tokenizer';
+ }
+ }
+ // break the reference left behind by the by-reference foreach
+ unset( $analyzer );
+ return $config;
}
/**
@@ -866,6 +912,12 @@
private $languagesWithIcuFolding = [];
/**
+ * @var bool[] indexed by language code, languages where ICU
tokenization
+ * can be enabled by default
+ */
+ private $languagesWithIcuTokenization = [];
+
+ /**
* @var array[]
*/
private $elasticsearchLanguageAnalyzersFromPlugins = [
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 621a6bc..8d1d0a0 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -60,6 +60,15 @@
$folding_type['unicodeSetFilter'] =
$unicodeSetFilter;
}
}
+ $textTokenizer = 'standard';
+ $plainTokenizer = 'whitespace';
+ if ( $this->icuTokenizer ) {
+ $textTokenizer = 'icu_tokenizer';
+ // We cannot use the icu_tokenizer for plain here
+ // even if icu tokenization is mostly needed for
languages
+ // where space is not used to break words. We don't want
+ // to break some punctuation chars like ':'
+ }
$defaults = [
'char_filter' => [
'word_break_helper' => [
@@ -115,7 +124,7 @@
"accentfolding",
"token_limit"
],
- "tokenizer" => "standard"
+ "tokenizer" => $textTokenizer,
],
// We do not remove stop words when searching,
// this leads to extremely weird behaviors while
@@ -127,7 +136,7 @@
"accentfolding",
"token_limit"
],
- "tokenizer" => "standard"
+ "tokenizer" => $textTokenizer,
],
"plain" => [
"type" => "custom",
@@ -136,7 +145,7 @@
"token_limit",
"lowercase"
],
- "tokenizer" => "whitespace"
+ "tokenizer" => $plainTokenizer,
],
"plain_search" => [
"type" => "custom",
@@ -145,7 +154,7 @@
"token_limit",
"lowercase"
],
- "tokenizer" => "whitespace"
+ "tokenizer" => $plainTokenizer,
],
],
];
@@ -157,7 +166,7 @@
"accentfolding",
"token_limit"
],
- "tokenizer" => "standard"
+ "tokenizer" => $textTokenizer,
];
$defaults['analyzer']['subphrases_search'] = [
"type" => "custom",
@@ -166,7 +175,7 @@
"accentfolding",
"token_limit"
],
- "tokenizer" => "standard"
+ "tokenizer" => $textTokenizer,
];
}
return $defaults;
diff --git a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
index 6a5d46c..f464a45 100644
--- a/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
+++ b/tests/unit/Maintenance/AnalysisConfigBuilderTest.php
@@ -54,6 +54,15 @@
$this->assertFalse( $builder->isIcuFolding() );
}
+ /**
+ * Run enableICUTokenizer() on $input and compare the resulting
+ * 'analyzer' section with the one of $expected.
+ * The builder is created for 'en' with the 'extra' and 'analysis-icu'
+ * plugins and CirrusSearchUseIcuTokenizer set to 'yes'.
+ * NOTE(review): the method name is misspelled ("Tokinizer").
+ * @dataProvider provideICUTokenizer
+ * @param array $input analysis config handed to enableICUTokenizer()
+ * @param array $expected config whose 'analyzer' section is expected
+ */
+ public function testICUTokinizer( array $input, array $expected ) {
+ $config = new HashSearchConfig( ['CirrusSearchUseIcuTokenizer'
=> 'yes'] );
+ $plugins = ['extra', 'analysis-icu'];
+ $builder = new AnalysisConfigBuilder( 'en', $plugins, $config );
+ $result = $builder->enableICUTokenizer( $input );
+ $this->assertEquals( $expected['analyzer'], $result['analyzer']
);
+ }
+
public static function provideASCIIFoldingFilters() {
return [
'only custom is updated' => [
@@ -295,4 +304,47 @@
],
];
}
+
+ /**
+ * Data sets for testICUTokinizer(), each one being
+ * [ input config, expected config ].
+ * Data set names must be unique: a repeated array key silently
+ * replaces the earlier data set, which would then never be run.
+ * @return array[]
+ */
+ public static function provideICUTokenizer() {
+ return [
+ 'non custom analyzers are left untouched' => [
+ [
+ 'analyzer' => [
+ 'french' => [
+ 'type' => 'french',
+ 'filter' => ['random']
+ ]
+ ],
+ ],
+ [
+ 'analyzer' => [
+ 'french' => [
+ 'type' => 'french',
+ 'filter' => ['random']
+ ]
+ ],
+ ],
+ ],
+ 'custom analyzers using the standard tokenizer are updated' => [
+ [
+ 'analyzer' => [
+ 'chinese' => [
+ 'type' => 'custom',
+ 'tokenizer' => 'standard',
+ 'filter' => ['random']
+ ]
+ ],
+ ],
+ [
+ 'analyzer' => [
+ 'chinese' => [
+ 'type' => 'custom',
+ 'tokenizer' => 'icu_tokenizer',
+ 'filter' => ['random']
+ ]
+ ],
+ ],
+ ],
+ ];
+ }
}
--
To view, visit https://gerrit.wikimedia.org/r/313577
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I930e34b24db825b21c1a7eca5bf28cc09a76c152
Gerrit-PatchSet: 15
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: DCausse <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: Tjones <[email protected]>
Gerrit-Reviewer: jenkins-bot <>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits