jenkins-bot has submitted this change and it was merged. Change subject: CompletionSuggester: add support for ICU Folding ......................................................................
CompletionSuggester: add support for ICU Folding Users can set $wgCirrusSearchUseIcuFolding to true to enable this filter in place of the default ASCII Folding (requires ICU plugin). It supports a wider range of Unicode characters for accent squashing. Bug: T129502 Change-Id: Id715a6670e2c2f8eb9dd4a3ed1a89bea4064d6dd --- M CirrusSearch.php M includes/Maintenance/AnalysisConfigBuilder.php M includes/Maintenance/SuggesterAnalysisConfigBuilder.php M maintenance/updateSuggesterIndex.php 4 files changed, 40 insertions(+), 10 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve EBernhardson: Looks good to me, approved jenkins-bot: Verified diff --git a/CirrusSearch.php b/CirrusSearch.php index 1a0a9d5..07151a0 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -760,6 +760,17 @@ $wgCirrusSearchCompletionSettings = $wgCirrusSearchCompletionProfiles['default']; /** + * Enable ICU Folding instead of the default ASCII Folding. + * It covers a wider range of characters when squashing diacritics. + * see https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-folding.html + * Currently this setting is only used by the CompletionSuggester. + * Requires the ICU plugin to be installed. + * Set to true to enable, false to use the default ASCII Folding + * NOTE: Experimental + */ +$wgCirrusSearchUseIcuFolding = false; + +/** * Set the default scoring function to be used by maintenance/updateSuggesterIndex.php * @see includes/BuildDocument/SuggestScoring.php for more details about scoring functions * NOTE: if you change the scoring method you'll have to rebuild the suggester index. 
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php index 5ba02d1..c652bb3 100644 --- a/includes/Maintenance/AnalysisConfigBuilder.php +++ b/includes/Maintenance/AnalysisConfigBuilder.php @@ -49,6 +49,11 @@ private $similarity; /** + * @var SearchConfig cirrus config + */ + protected $config; + + /** * Constructor * @param string $langCode The language code to build config for * @param array(string) $plugins list of plugins installed in Elasticsearch @@ -66,6 +71,7 @@ $config = ConfigFactory::getDefaultInstance()->makeConfig( 'CirrusSearch' ); } $this->similarity = $config->get( 'CirrusSearchSimilarityProfile' ); + $this->config = $config; } /** diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php index 936ae98..7abde94 100644 --- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php +++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php @@ -34,14 +34,20 @@ * @param string $langCode The language code to build config for * @param array(string) $plugins list of plugins installed in Elasticsearch */ - public function __construct( $langCode, $plugins ) { - parent::__construct( $langCode, $plugins ); + public function __construct( $langCode, $plugins, $config = null ) { + parent::__construct( $langCode, $plugins, $config ); } /** * Build and analysis config with sane defaults */ protected function defaults() { + // Use the default Lucene ASCII filter + $folding_type = 'asciifolding'; + if ( $this->isIcuAvailable() && $this->config->get( 'CirrusSearchUseIcuFolding' ) === true ) { + // Use ICU Folding if the plugin is available and activated in the config + $folding_type = 'icu_folding'; + } $defaults = array( 'char_filter' => array( 'word_break_helper' => array( @@ -81,9 +87,8 @@ "stopwords" => "_none_", "remove_trailing" => "true" ), - "asciifolding_preserve" => array( - "type" => "asciifolding", - "preserve_original" => "false", 
+ "asciifolding" => array( + "type" => $folding_type, ), "icu_normalizer" => array( "type" => "icu_normalizer", @@ -101,7 +106,7 @@ "standard", "lowercase", "stop_filter", - "asciifolding_preserve", + "asciifolding", "token_limit" ), "tokenizer" => "standard" @@ -114,7 +119,7 @@ "filter" => array( "standard", "lowercase", - "asciifolding_preserve", + "asciifolding", "token_limit" ), "tokenizer" => "standard" diff --git a/maintenance/updateSuggesterIndex.php b/maintenance/updateSuggesterIndex.php index e893368..59936ca 100644 --- a/maintenance/updateSuggesterIndex.php +++ b/maintenance/updateSuggesterIndex.php @@ -8,6 +8,7 @@ use CirrusSearch\Util; use CirrusSearch\BuildDocument\SuggestBuilder; use CirrusSearch\BuildDocument\SuggestScoringMethodFactory; +use CirrusSearch\Maintenance\Validators\AnalyzersValidator; use Elastica; use Elastica\Query; use Elastica\Request; @@ -187,6 +188,9 @@ $this->langCode = $wgLanguageCode; $this->bannedPlugins = $wgCirrusSearchBannedPlugins; + $this->availablePlugins = $this->utils->scanAvailablePlugins( $this->bannedPlugins ); + $this->analysisConfigBuilder = $this->pickAnalyzer( $this->langCode, $this->availablePlugins ); + $this->utils->checkElasticsearchVersion(); $this->maxShardsPerNode = isset( $wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] ) ? 
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] : 'unlimited'; @@ -276,9 +280,6 @@ $this->oldIndex = $this->getConnection()->getIndex( $this->indexBaseName, $this->indexTypeName, $oldIndexIdentifier ); $this->indexIdentifier = $this->utils->pickIndexIdentifierFromOption( 'now', $this->getIndexTypeName() ); - $this->availablePlugins = $this->utils->scanAvailablePlugins( $this->bannedPlugins ); - $this->analysisConfigBuilder = $this->pickAnalyzer( $this->langCode, $this->availablePlugins ); - $this->createIndex(); $this->indexData(); $this->indexData( Connection::GENERAL_INDEX_TYPE ); @@ -337,6 +338,13 @@ return false; } + $validator = new AnalyzersValidator( $oldIndex, $this->analysisConfigBuilder, $this ); + $status = $validator->validate(); + if ( !$status->isOK() ) { + $this->error( 'Analysis config differs, cannot recycle.' ); + return false; + } + return true; } -- To view, visit https://gerrit.wikimedia.org/r/277249 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Id715a6670e2c2f8eb9dd4a3ed1a89bea4064d6dd Gerrit-PatchSet: 2 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: DCausse <[email protected]> Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]> Gerrit-Reviewer: EBernhardson <[email protected]> Gerrit-Reviewer: Gehel <[email protected]> Gerrit-Reviewer: Manybubbles <[email protected]> Gerrit-Reviewer: Smalyshev <[email protected]> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
