jenkins-bot has submitted this change and it was merged.

Change subject: CompletionSuggester: add support for ICU Folding
......................................................................


CompletionSuggester: add support for ICU Folding

Users can set $wgCirrusSearchUseIcuFolding to true to enable this filter
in place of the default ASCII Folding (requires ICU plugin).
It supports a wider range of Unicode characters for accent squashing.

Bug: T129502
Change-Id: Id715a6670e2c2f8eb9dd4a3ed1a89bea4064d6dd
---
M CirrusSearch.php
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M maintenance/updateSuggesterIndex.php
4 files changed, 40 insertions(+), 10 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  EBernhardson: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/CirrusSearch.php b/CirrusSearch.php
index 1a0a9d5..07151a0 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -760,6 +760,17 @@
 $wgCirrusSearchCompletionSettings = 
$wgCirrusSearchCompletionProfiles['default'];
 
 /**
+ * Enable ICU Folding instead of the default ASCII Folding.
+ * It covers a wider range of characters when squashing diacritics.
+ * see 
https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-folding.html
+ * Currently this setting is only used by the CompletionSuggester.
+ * Requires the ICU plugin installed.
+ * Set to true to enable, false to use the default ASCII Folding
+ * NOTE: Experimental
+ */
+$wgCirrusSearchUseIcuFolding = false;
+
+/**
  * Set the default scoring function to be used by 
maintenance/updateSuggesterIndex.php
  * @see includes/BuildDocument/SuggestScoring.php for more details about 
scoring functions
  * NOTE: if you change the scoring method you'll have to rebuild the suggester 
index.
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 5ba02d1..c652bb3 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -49,6 +49,11 @@
        private $similarity;
 
        /**
+        * @var SearchConfig cirrus config
+        */
+       protected $config;
+
+       /**
         * Constructor
         * @param string $langCode The language code to build config for
         * @param array(string) $plugins list of plugins installed in 
Elasticsearch
@@ -66,6 +71,7 @@
                        $config = 
ConfigFactory::getDefaultInstance()->makeConfig( 'CirrusSearch' );
                }
                $this->similarity = $config->get( 
'CirrusSearchSimilarityProfile' );
+               $this->config = $config;
        }
 
        /**
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php 
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 936ae98..7abde94 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -34,14 +34,20 @@
         * @param string $langCode The language code to build config for
         * @param array(string) $plugins list of plugins installed in 
Elasticsearch
         */
-       public function __construct( $langCode, $plugins ) {
-               parent::__construct( $langCode, $plugins );
+       public function __construct( $langCode, $plugins, $config = null ) {
+               parent::__construct( $langCode, $plugins, $config );
        }
 
        /**
         * Build and analysis config with sane defaults
         */
        protected function defaults() {
+               // Use the default Lucene ASCII filter
+               $folding_type = 'asciifolding';
+               if ( $this->isIcuAvailable() && $this->config->get( 
'CirrusSearchUseIcuFolding' ) === true ) {
+                       // Use ICU Folding if the plugin is available and 
activated in the config
+                       $folding_type = 'icu_folding';
+               }
                $defaults = array(
                        'char_filter' => array(
                                'word_break_helper' => array(
@@ -81,9 +87,8 @@
                                        "stopwords" => "_none_",
                                        "remove_trailing" => "true"
                                ),
-                               "asciifolding_preserve" => array(
-                                       "type" => "asciifolding",
-                                       "preserve_original" => "false",
+                               "asciifolding" => array(
+                                       "type" => $folding_type,
                                ),
                                "icu_normalizer" => array(
                                        "type" => "icu_normalizer",
@@ -101,7 +106,7 @@
                                                "standard",
                                                "lowercase",
                                                "stop_filter",
-                                               "asciifolding_preserve",
+                                               "asciifolding",
                                                "token_limit"
                                        ),
                                        "tokenizer" => "standard"
@@ -114,7 +119,7 @@
                                        "filter" => array(
                                                "standard",
                                                "lowercase",
-                                               "asciifolding_preserve",
+                                               "asciifolding",
                                                "token_limit"
                                        ),
                                        "tokenizer" => "standard"
diff --git a/maintenance/updateSuggesterIndex.php 
b/maintenance/updateSuggesterIndex.php
index e893368..59936ca 100644
--- a/maintenance/updateSuggesterIndex.php
+++ b/maintenance/updateSuggesterIndex.php
@@ -8,6 +8,7 @@
 use CirrusSearch\Util;
 use CirrusSearch\BuildDocument\SuggestBuilder;
 use CirrusSearch\BuildDocument\SuggestScoringMethodFactory;
+use CirrusSearch\Maintenance\Validators\AnalyzersValidator;
 use Elastica;
 use Elastica\Query;
 use Elastica\Request;
@@ -187,6 +188,9 @@
                $this->langCode = $wgLanguageCode;
                $this->bannedPlugins = $wgCirrusSearchBannedPlugins;
 
+               $this->availablePlugins = $this->utils->scanAvailablePlugins( 
$this->bannedPlugins );
+               $this->analysisConfigBuilder = $this->pickAnalyzer( 
$this->langCode, $this->availablePlugins );
+
                $this->utils->checkElasticsearchVersion();
 
                $this->maxShardsPerNode = isset( 
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] ) ? 
$wgCirrusSearchMaxShardsPerNode[ $this->indexTypeName ] : 'unlimited';
@@ -276,9 +280,6 @@
                $this->oldIndex = $this->getConnection()->getIndex( 
$this->indexBaseName, $this->indexTypeName, $oldIndexIdentifier );
                $this->indexIdentifier = 
$this->utils->pickIndexIdentifierFromOption( 'now', $this->getIndexTypeName() );
 
-               $this->availablePlugins = $this->utils->scanAvailablePlugins( 
$this->bannedPlugins );
-               $this->analysisConfigBuilder = $this->pickAnalyzer( 
$this->langCode, $this->availablePlugins );
-
                $this->createIndex();
                $this->indexData();
                $this->indexData( Connection::GENERAL_INDEX_TYPE );
@@ -337,6 +338,13 @@
                        return false;
                }
 
+               $validator = new AnalyzersValidator( $oldIndex, 
$this->analysisConfigBuilder, $this );
+               $status = $validator->validate();
+               if ( !$status->isOK() ) {
+                       $this->error( 'Analysis config differs, cannot 
recycle.' );
+                       return false;
+               }
+
                return true;
        }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/277249
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: Id715a6670e2c2f8eb9dd4a3ed1a89bea4064d6dd
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: DCausse <[email protected]>
Gerrit-Reviewer: Cindy-the-browser-test-bot <[email protected]>
Gerrit-Reviewer: EBernhardson <[email protected]>
Gerrit-Reviewer: Gehel <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: Smalyshev <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to