[MediaWiki-commits] [Gerrit] CompletionSuggester: add support for ICU Folding - change (mediawiki...CirrusSearch)
jenkins-bot has submitted this change and it was merged. Change subject: CompletionSuggester: add support for ICU Folding .. CompletionSuggester: add support for ICU Folding Users can set $wgCirrusSearchUseIcuFolding to true to enable this filter in place of the default ASCII Folding (requires ICU plugin). It supports a wider range of Unicode characters for accent squashing. Bug: T129502 Change-Id: Id715a6670e2c2f8eb9dd4a3ed1a89bea4064d6dd --- M CirrusSearch.php M includes/Maintenance/AnalysisConfigBuilder.php M includes/Maintenance/SuggesterAnalysisConfigBuilder.php M maintenance/updateSuggesterIndex.php 4 files changed, 40 insertions(+), 10 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve EBernhardson: Looks good to me, approved jenkins-bot: Verified diff --git a/CirrusSearch.php b/CirrusSearch.php index 1a0a9d5..07151a0 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -760,6 +760,17 @@ $wgCirrusSearchCompletionSettings = $wgCirrusSearchCompletionProfiles['default']; /** + * Enable ICU Folding instead of the default ASCII Folding. + * It covers a wider range of characters when squashing diacritics. + * see https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-folding.html + * Currently this setting is only used by the CompletionSuggester. + * Requires the ICU plugin to be installed. + * Set to true to enable, false to use the default ASCII Folding + * NOTE: Experimental + */ +$wgCirrusSearchUseIcuFolding = false; + +/** * Set the default scoring function to be used by maintenance/updateSuggesterIndex.php * @see includes/BuildDocument/SuggestScoring.php for more details about scoring functions * NOTE: if you change the scoring method you'll have to rebuild the suggester index. 
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php index 5ba02d1..c652bb3 100644 --- a/includes/Maintenance/AnalysisConfigBuilder.php +++ b/includes/Maintenance/AnalysisConfigBuilder.php @@ -49,6 +49,11 @@ private $similarity; /** +* @var SearchConfig cirrus config +*/ + protected $config; + + /** * Constructor * @param string $langCode The language code to build config for * @param array(string) $plugins list of plugins installed in Elasticsearch @@ -66,6 +71,7 @@ $config = ConfigFactory::getDefaultInstance()->makeConfig( 'CirrusSearch' ); } $this->similarity = $config->get( 'CirrusSearchSimilarityProfile' ); + $this->config = $config; } /** diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php index 936ae98..7abde94 100644 --- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php +++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php @@ -34,14 +34,20 @@ * @param string $langCode The language code to build config for * @param array(string) $plugins list of plugins installed in Elasticsearch */ - public function __construct( $langCode, $plugins ) { - parent::__construct( $langCode, $plugins ); + public function __construct( $langCode, $plugins, $config = null ) { + parent::__construct( $langCode, $plugins, $config ); } /** * Build and analysis config with sane defaults */ protected function defaults() { + // Use the default Lucene ASCII filter + $folding_type = 'asciifolding'; + if ( $this->isIcuAvailable() && $this->config->get( 'CirrusSearchUseIcuFolding' ) === true ) { + // Use ICU Folding if the plugin is available and activated in the config + $folding_type = 'icu_folding'; + } $defaults = array( 'char_filter' => array( 'word_break_helper' => array( @@ -81,9 +87,8 @@ "stopwords" => "_none_", "remove_trailing" => "true" ), - "asciifolding_preserve" => array( - "type" => "asciifolding", - "preserve_original" => "false", + 
"asciifolding" => array( + "type" => $folding_type, ), "icu_normalizer" => array( "type" => "icu_normalizer", @@ -101,7 +106,7 @@ "standard", "lowercase",
[MediaWiki-commits] [Gerrit] CompletionSuggester: add support for ICU Folding - change (mediawiki...CirrusSearch)
DCausse has uploaded a new change for review. https://gerrit.wikimedia.org/r/277249 Change subject: CompletionSuggester: add support for ICU Folding .. CompletionSuggester: add support for ICU Folding Users can set $wgCirrusSearchUseIcuFolding to true to enable this filter in place of the default ASCII Folding (requires ICU plugin). It supports a wider range of Unicode characters for accent squashing. Bug: T129502 Change-Id: Id715a6670e2c2f8eb9dd4a3ed1a89bea4064d6dd --- M CirrusSearch.php M includes/Maintenance/AnalysisConfigBuilder.php M includes/Maintenance/SuggesterAnalysisConfigBuilder.php M maintenance/updateSuggesterIndex.php 4 files changed, 39 insertions(+), 10 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/49/277249/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index 1a0a9d5..07151a0 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -760,6 +760,17 @@ $wgCirrusSearchCompletionSettings = $wgCirrusSearchCompletionProfiles['default']; /** + * Enable ICU Folding instead of the default ASCII Folding. + * It covers a wider range of characters when squashing diacritics. + * see https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-folding.html + * Currently this setting is only used by the CompletionSuggester. + * Requires the ICU plugin to be installed. + * Set to true to enable, false to use the default ASCII Folding + * NOTE: Experimental + */ +$wgCirrusSearchUseIcuFolding = false; + +/** * Set the default scoring function to be used by maintenance/updateSuggesterIndex.php * @see includes/BuildDocument/SuggestScoring.php for more details about scoring functions * NOTE: if you change the scoring method you'll have to rebuild the suggester index. 
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php index 5ba02d1..c652bb3 100644 --- a/includes/Maintenance/AnalysisConfigBuilder.php +++ b/includes/Maintenance/AnalysisConfigBuilder.php @@ -49,6 +49,11 @@ private $similarity; /** +* @var SearchConfig cirrus config +*/ + protected $config; + + /** * Constructor * @param string $langCode The language code to build config for * @param array(string) $plugins list of plugins installed in Elasticsearch @@ -66,6 +71,7 @@ $config = ConfigFactory::getDefaultInstance()->makeConfig( 'CirrusSearch' ); } $this->similarity = $config->get( 'CirrusSearchSimilarityProfile' ); + $this->config = $config; } /** diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php index 936ae98..5f3f7bf 100644 --- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php +++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php @@ -34,14 +34,19 @@ * @param string $langCode The language code to build config for * @param array(string) $plugins list of plugins installed in Elasticsearch */ - public function __construct( $langCode, $plugins ) { - parent::__construct( $langCode, $plugins ); + public function __construct( $langCode, $plugins, $config = null ) { + parent::__construct( $langCode, $plugins, $config ); } /** * Build and analysis config with sane defaults */ protected function defaults() { + $folding_type = 'asciifolding'; + if ( $this->isIcuAvailable() && $this->config->get( 'CirrusSearchUseIcuFolding' ) === true ) { + $this->customizations[] = 'icu_folding'; + $folding_type = 'icu_folding'; + } $defaults = array( 'char_filter' => array( 'word_break_helper' => array( @@ -81,9 +86,8 @@ "stopwords" => "_none_", "remove_trailing" => "true" ), - "asciifolding_preserve" => array( - "type" => "asciifolding", - "preserve_original" => "false", + "asciifolding" => array( + "type" => $folding_type, ), "icu_normalizer" => 
array( "type" => "icu_normalizer", @@ -101,7 +105,7 @@ "standard", "lowercase", "stop_filter", -