Smalyshev has uploaded a new change for review. ( 
https://gerrit.wikimedia.org/r/393689 )

Change subject: [WIP] Port per-language indexing code from Wikibase to 
CirrusSearch
......................................................................

[WIP] Port per-language indexing code from Wikibase to CirrusSearch

Bug: T176903
Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa
---
M includes/Maintenance/AnalysisConfigBuilder.php
1 file changed, 144 insertions(+), 0 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/89/393689/1

diff --git a/includes/Maintenance/AnalysisConfigBuilder.php 
b/includes/Maintenance/AnalysisConfigBuilder.php
index 0ee472b..9615687 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -987,6 +987,150 @@
        }
 
        /**
+        * Collect the filters that the given analyzers reference by name but
+        * that have no explicit definition under $config['filter'].
+        * Every such name is treated as a built-in filter and is mapped to a
+        * minimal definition whose 'type' is the filter name itself.
+        *
+        * @param array[] &$config Full configuration array (only read here)
+        * @param string[] $analyzers List of analyzers to consider.
+        * @return array[] Map of filter name => [ 'type' => filter name ]
+        */
+       private function getDefaultFilters( array &$config,  array $analyzers ) {
+               $builtIn = [];
+               foreach ( $analyzers as $analyzerName ) {
+                       // Analyzers that are missing or have no filter chain contribute nothing
+                       if ( !empty( $config['analyzer'][$analyzerName]['filter'] ) ) {
+                               $filterList = $config['analyzer'][$analyzerName]['filter'];
+                               foreach ( $filterList as $filterName ) {
+                                       if ( isset( $config['filter'][$filterName] ) ) {
+                                               // Explicitly defined, so not a default filter
+                                               continue;
+                                       }
+                                       // This is the default definition for the built-in filter
+                                       $builtIn[$filterName] = [ 'type' => $filterName ];
+                               }
+                       }
+               }
+               return $builtIn;
+       }
+
+       /**
+        * Compare every filter of the config being processed against the filters
+        * that are already known (explicitly defined ones and built-in defaults):
+        * - same name, same definition: the filter is left out of the result;
+        * - same name, different definition: the filter is returned under the
+        *   name "<prefix>_<name>" and every analyzer in $config that referenced
+        *   the old name is switched to the new one;
+        * - unknown name: the filter is returned unchanged.
+        *
+        * @param array[] &$config Configuration being processed; its analyzers are
+        *  modified in place when a filter has to be renamed
+        * @param array[] $standardFilters Existing filters list
+        * @param array[] $defaultFilters List of default filters already mentioned in the config
+        * @param string $prefix Prefix for disambiguation
+        * @return array[] The list of filters not in the old config.
+        */
+       private function resolveFilters( array &$config, array $standardFilters, array $defaultFilters, $prefix ) {
+               $resultFilters = [];
+               if ( empty( $config['filter'] ) ) {
+                       // No filters defined at all - nothing to resolve
+                       return $resultFilters;
+               }
+               foreach ( $config['filter'] as $name => $filter ) {
+                       // Explicit definitions take precedence over built-in defaults
+                       $existingFilter = null;
+                       if ( isset( $standardFilters[$name] ) ) {
+                               $existingFilter = $standardFilters[$name];
+                       } elseif ( isset( $defaultFilters[$name] ) ) {
+                               $existingFilter = $defaultFilters[$name];
+                       }
+
+                       // Compare against null explicitly: an existing definition that
+                       // happens to be empty must still count as "already exists".
+                       if ( $existingFilter === null ) {
+                               $resultFilters[$name] = $filter;
+                               continue;
+                       }
+
+                       // Filter with this name already exists.
+                       // != compares arrays by key/value pairs regardless of key order,
+                       // so definitions differing only in key order count as the same.
+                       if ( $existingFilter != $filter ) {
+                               // filter with the same name but different config - need to
+                               // rename by adding prefix
+                               $newName = $prefix . '_' . $name;
+                               $this->replaceFilter( $config, $name, $newName );
+                               $resultFilters[$newName] = $filter;
+                       }
+               }
+               return $resultFilters;
+       }
+
+       /**
+        * Rename a filter in the filter list of every analyzer of the config.
+        * Analyzers without a 'filter' entry are left untouched.
+        * @param array[] &$config Configuration being processed, modified in place
+        * @param string $oldName
+        * @param string $newName
+        */
+       private function replaceFilter( array &$config, $oldName, $newName ) {
+               $rename = function ( $filterName ) use ( $oldName, $newName ) {
+                       return $filterName === $oldName ? $newName : $filterName;
+               };
+               foreach ( $config['analyzer'] as $analyzerName => $definition ) {
+                       if ( isset( $definition['filter'] ) ) {
+                               // Write back through the key instead of iterating by reference
+                               $config['analyzer'][$analyzerName]['filter'] =
+                                       array_map( $rename, $definition['filter'] );
+                       }
+               }
+       }
+
+       /**
+        * Merge per-language config into the main config.
+        * Copies the analyzer under the name "<prefix>_<name>" together with those
+        * of its filters and char_filters that the language config defines and
+        * the main config does not have yet.
+        * Does nothing if the language config does not define the analyzer.
+        *
+        * @param array &$config Main config, modified in place
+        * @param array $langConfig Per-language config
+        * @param string $name Name for analyzer whose config we're merging
+        * @param string $prefix Prefix for this configuration
+        */
+       private function mergeConfig( array &$config, array $langConfig, $name, $prefix ) {
+               if ( !isset( $langConfig['analyzer'][$name] ) ) {
+                       // The language does not define this analyzer; copying it anyway
+                       // would store a null definition under the prefixed name.
+                       return;
+               }
+               $analyzer = $langConfig['analyzer'][$name];
+               $config['analyzer'][$prefix . '_' . $name] = $analyzer;
+
+               // Add private filters and char_filters for this analyzer
+               foreach ( [ 'filter', 'char_filter' ] as $section ) {
+                       if ( empty( $analyzer[$section] ) ) {
+                               continue;
+                       }
+                       foreach ( $analyzer[$section] as $filter ) {
+                               // Copy definitions that are in language config but not in the
+                               // main config. We would not copy the same filter into the main
+                               // config since due to the resolution step we know they are the
+                               // same (otherwise we would have renamed it).
+                               // Names the language config does not define (e.g. ones only
+                               // referenced by name) are skipped rather than copied as null.
+                               if ( isset( $langConfig[$section][$filter] ) &&
+                                       !isset( $config[$section][$filter] )
+                               ) {
+                                       $config[$section][$filter] = $langConfig[$section][$filter];
+                               }
+                       }
+               }
+       }
+
+       /**
+        * Create per-language configs for specific analyzers which separates and
+        * namespaces filters that are different between languages.
+        * Each requested analyzer of each language is merged into $config under
+        * the name "<language>_<analyzer>".
+        *
+        * @param array[] &$config Existing config, will be modified
+        * @param string[] $languages List of languages to process
+        * @param string[] $analyzers List of analyzers to process
+        */
+       public function buildLanguageConfigs( array &$config, array $languages, array $analyzers ) {
+               // Build every language config once and reuse it in both passes.
+               // NOTE(review): relies on buildConfig() returning the same result for
+               // the same language on repeated calls - confirm.
+               $langConfigs = [];
+
+               // First pass: collect built-in filters referenced anywhere, so that
+               // a language redefining one of them can be detected in the second pass.
+               $defaultFilters = $this->getDefaultFilters( $config, $analyzers );
+               foreach ( $languages as $lang ) {
+                       if ( !isset( $langConfigs[$lang] ) ) {
+                               $langConfigs[$lang] = $this->buildConfig( $lang );
+                       }
+                       $defaultFilters += $this->getDefaultFilters( $langConfigs[$lang], $analyzers );
+               }
+
+               foreach ( $languages as $lang ) {
+                       // Work on a copy: resolveFilters() renames filters in place
+                       $langConfig = $langConfigs[$lang];
+                       // The main config may not have any filters yet
+                       $standardFilters = isset( $config['filter'] ) ? $config['filter'] : [];
+                       // Analyzer is: tokenizer + filter + char_filter
+                       // Tokenizers don't seem to be subject to customization now
+                       // Char filters are nicely namespaced
+                       // Filters are NOT - e.g. lowercase & icu_folding filters are
+                       // different for different languages! So we need to do some
+                       // disambiguation here.
+                       $langConfig['filter'] = $this->resolveFilters(
+                               $langConfig, $standardFilters, $defaultFilters, $lang
+                       );
+                       // Merge configs
+                       foreach ( $analyzers as $analyzer ) {
+                               $this->mergeConfig( $config, $langConfig, $analyzer, $lang );
+                       }
+               }
+       }
+
+       /**
         * @return bool true if the icu analyzer is available.
         */
        public function isIcuAvailable() {

-- 
To view, visit https://gerrit.wikimedia.org/r/393689
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <smalys...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to