[MediaWiki-commits] [Gerrit] CompletionSuggester: add support for ICU Folding - change (mediawiki...CirrusSearch)

2016-03-14 Thread jenkins-bot (Code Review)
jenkins-bot has submitted this change and it was merged.

Change subject: CompletionSuggester: add support for ICU Folding
..


CompletionSuggester: add support for ICU Folding

Users can set $wgCirrusSearchUseIcuFolding to true to enable this filter
in place of the default ASCII Folding (requires ICU plugin).
It supports a wider range of Unicode characters for accent squashing.

Bug: T129502
Change-Id: Id715a6670e2c2f8eb9dd4a3ed1a89bea4064d6dd
---
M CirrusSearch.php
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M maintenance/updateSuggesterIndex.php
4 files changed, 40 insertions(+), 10 deletions(-)

Approvals:
  Cindy-the-browser-test-bot: Looks good to me, but someone else must approve
  EBernhardson: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/CirrusSearch.php b/CirrusSearch.php
index 1a0a9d5..07151a0 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -760,6 +760,17 @@
 $wgCirrusSearchCompletionSettings = 
$wgCirrusSearchCompletionProfiles['default'];
 
 /**
+ * Enable ICU Folding instead of the default ASCII Folding.
+ * It allows covering a wider range of characters when squashing diacritics.
+ * see 
https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-folding.html
+ * Currently this setting is only used by the CompletionSuggester.
+ * Requires the ICU plugin to be installed.
+ * Set to true to enable, false to use the default ASCII Folding
+ * NOTE: Experimental
+ */
+$wgCirrusSearchUseIcuFolding = false;
+
+/**
  * Set the default scoring function to be used by 
maintenance/updateSuggesterIndex.php
  * @see includes/BuildDocument/SuggestScoring.php for more details about 
scoring functions
  * NOTE: if you change the scoring method you'll have to rebuild the suggester 
index.
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 5ba02d1..c652bb3 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -49,6 +49,11 @@
private $similarity;
 
/**
+* @var SearchConfig cirrus config
+*/
+   protected $config;
+
+   /**
 * Constructor
 * @param string $langCode The language code to build config for
 * @param array(string) $plugins list of plugins installed in 
Elasticsearch
@@ -66,6 +71,7 @@
$config = 
ConfigFactory::getDefaultInstance()->makeConfig( 'CirrusSearch' );
}
$this->similarity = $config->get( 
'CirrusSearchSimilarityProfile' );
+   $this->config = $config;
}
 
/**
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php 
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 936ae98..7abde94 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -34,14 +34,20 @@
 * @param string $langCode The language code to build config for
 * @param array(string) $plugins list of plugins installed in 
Elasticsearch
 */
-   public function __construct( $langCode, $plugins ) {
-   parent::__construct( $langCode, $plugins );
+   public function __construct( $langCode, $plugins, $config = null ) {
+   parent::__construct( $langCode, $plugins, $config );
}
 
/**
 * Build and analysis config with sane defaults
 */
protected function defaults() {
+   // Use the default Lucene ASCII filter
+   $folding_type = 'asciifolding';
+   if ( $this->isIcuAvailable() && $this->config->get( 
'CirrusSearchUseIcuFolding' ) === true ) {
+   // Use ICU Folding if the plugin is available and 
activated in the config
+   $folding_type = 'icu_folding';
+   }
$defaults = array(
'char_filter' => array(
'word_break_helper' => array(
@@ -81,9 +87,8 @@
"stopwords" => "_none_",
"remove_trailing" => "true"
),
-   "asciifolding_preserve" => array(
-   "type" => "asciifolding",
-   "preserve_original" => "false",
+   "asciifolding" => array(
+   "type" => $folding_type,
),
"icu_normalizer" => array(
"type" => "icu_normalizer",
@@ -101,7 +106,7 @@
"standard",
"lowercase",
  

[MediaWiki-commits] [Gerrit] CompletionSuggester: add support for ICU Folding - change (mediawiki...CirrusSearch)

2016-03-14 Thread DCausse (Code Review)
DCausse has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/277249

Change subject: CompletionSuggester: add support for ICU Folding
..

CompletionSuggester: add support for ICU Folding

Users can set $wgCirrusSearchUseIcuFolding to true to enable this filter
in place of the default ASCII Folding (requires ICU plugin).
It supports a wider range of Unicode characters for accent squashing.

Bug: T129502
Change-Id: Id715a6670e2c2f8eb9dd4a3ed1a89bea4064d6dd
---
M CirrusSearch.php
M includes/Maintenance/AnalysisConfigBuilder.php
M includes/Maintenance/SuggesterAnalysisConfigBuilder.php
M maintenance/updateSuggesterIndex.php
4 files changed, 39 insertions(+), 10 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/49/277249/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 1a0a9d5..07151a0 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -760,6 +760,17 @@
 $wgCirrusSearchCompletionSettings = 
$wgCirrusSearchCompletionProfiles['default'];
 
 /**
+ * Enable ICU Folding instead of the default ASCII Folding.
+ * It allows covering a wider range of characters when squashing diacritics.
+ * see 
https://www.elastic.co/guide/en/elasticsearch/plugins/current/analysis-icu-folding.html
+ * Currently this setting is only used by the CompletionSuggester.
+ * Requires the ICU plugin to be installed.
+ * Set to true to enable, false to use the default ASCII Folding
+ * NOTE: Experimental
+ */
+$wgCirrusSearchUseIcuFolding = false;
+
+/**
  * Set the default scoring function to be used by 
maintenance/updateSuggesterIndex.php
  * @see includes/BuildDocument/SuggestScoring.php for more details about 
scoring functions
  * NOTE: if you change the scoring method you'll have to rebuild the suggester 
index.
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 5ba02d1..c652bb3 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -49,6 +49,11 @@
private $similarity;
 
/**
+* @var SearchConfig cirrus config
+*/
+   protected $config;
+
+   /**
 * Constructor
 * @param string $langCode The language code to build config for
 * @param array(string) $plugins list of plugins installed in 
Elasticsearch
@@ -66,6 +71,7 @@
$config = 
ConfigFactory::getDefaultInstance()->makeConfig( 'CirrusSearch' );
}
$this->similarity = $config->get( 
'CirrusSearchSimilarityProfile' );
+   $this->config = $config;
}
 
/**
diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php 
b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
index 936ae98..5f3f7bf 100644
--- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
+++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php
@@ -34,14 +34,19 @@
 * @param string $langCode The language code to build config for
 * @param array(string) $plugins list of plugins installed in 
Elasticsearch
 */
-   public function __construct( $langCode, $plugins ) {
-   parent::__construct( $langCode, $plugins );
+   public function __construct( $langCode, $plugins, $config = null ) {
+   parent::__construct( $langCode, $plugins, $config );
}
 
/**
 * Build and analysis config with sane defaults
 */
protected function defaults() {
+   $folding_type = 'asciifolding';
+   if ( $this->isIcuAvailable() && $this->config->get( 
'CirrusSearchUseIcuFolding' ) === true ) {
+   $this->customizations[] = 'icu_folding';
+   $folding_type = 'icu_folding';
+   }
$defaults = array(
'char_filter' => array(
'word_break_helper' => array(
@@ -81,9 +86,8 @@
"stopwords" => "_none_",
"remove_trailing" => "true"
),
-   "asciifolding_preserve" => array(
-   "type" => "asciifolding",
-   "preserve_original" => "false",
+   "asciifolding" => array(
+   "type" => $folding_type,
),
"icu_normalizer" => array(
"type" => "icu_normalizer",
@@ -101,7 +105,7 @@
"standard",
"lowercase",
"stop_filter",
-