Smalyshev has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/393689 )
Change subject: [WIP] Port per-language indexing code from Wikibase to CirrusSearch ...................................................................... [WIP] Port per-language indexing code from Wikibase to CirrusSearch Bug: T176903 Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa --- M includes/Maintenance/AnalysisConfigBuilder.php 1 file changed, 144 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/89/393689/1 diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php index 0ee472b..9615687 100644 --- a/includes/Maintenance/AnalysisConfigBuilder.php +++ b/includes/Maintenance/AnalysisConfigBuilder.php @@ -987,6 +987,150 @@ } /** + * Get list of filters that are mentioned in analyzers but not defined + * explicitly. + * @param array[] $config Full configuration array + * @param string[] $analyzers List of analyzers to consider. + * @return array List of default filters, each containing only filter type + */ + private function getDefaultFilters( array &$config, array $analyzers ) { + $defaultFilters = []; + foreach ( $analyzers as $analyzer ) { + if ( empty( $config['analyzer'][$analyzer]['filter'] ) ) { + continue; + } + foreach ( $config['analyzer'][$analyzer]['filter'] as $filterName ) { + if ( !isset( $config['filter'][$filterName] ) ) { + // This is default definition for the built-in filter + $defaultFilters[$filterName] = [ 'type' => $filterName ]; + } + } + } + return $defaultFilters; + } + + /** + * Check every filter in the config - if it's the same as in old config, + * ignore it. If it has the same name, but different content - create new filter + * with different name by prefixing it with language name. + * + * @param array[] $config Configuration being processed + * @param array[] $standardFilters Existing filters list + * @param array[] $defaultFilters List of default filters already mentioned in the config + * @param string $prefix Prefix for disambiguation + * @return array[] The list of filters not in the old config. + */ + private function resolveFilters( array &$config, array $standardFilters, array $defaultFilters, $prefix ) { + $resultFilters = []; + foreach ( $config['filter'] as $name => $filter ) { + $existingFilter = null; + if ( isset( $standardFilters[$name] ) ) { + $existingFilter = $standardFilters[$name]; + } elseif ( isset( $defaultFilters[$name] ) ) { + $existingFilter = $defaultFilters[$name]; + } + + if ( $existingFilter ) { // Filter with this name already exists + if ( $existingFilter != $filter ) { + // filter with the same name but different config - need to + // rename by adding prefix + $newName = $prefix . '_' . $name; + $this->replaceFilter( $config, $name, $newName ); + $resultFilters[$newName] = $filter; + } + } else { + $resultFilters[$name] = $filter; + } + } + return $resultFilters; + } + + /** + * Replace certain filter name in all configs with different name. + * @param array[] $config Configuration being processed + * @param string $oldName + * @param string $newName + */ + private function replaceFilter( array &$config, $oldName, $newName ) { + foreach ( $config['analyzer'] as &$analyzer ) { + if ( !isset( $analyzer['filter'] ) ) { + continue; + } + $analyzer['filter'] = array_map( function ( $filter ) use ( $oldName, $newName ) { + if ( $filter === $oldName ) { + return $newName; + } + return $filter; + }, $analyzer['filter'] ); + } + } + + /** + * Merge per-language config into the main config. + * It will copy specific analyzer and all dependant filters and char_filters. + * @param array $config Main config + * @param array $langConfig Per-language config + * @param string $name Name for analyzer whose config we're merging + * @param string $prefix Prefix for this configuration + */ + private function mergeConfig( array &$config, array $langConfig, $name, $prefix ) { + $analyzer = $langConfig['analyzer'][$name]; + $config['analyzer'][$prefix . '_' . $name] = $analyzer; + if ( !empty( $analyzer['filter'] ) ) { + // Add private filters for this analyzer + foreach ( $analyzer['filter'] as $filter ) { + // Copy filters that are in language config but not in the main config. + // We would not copy the same filter into the main config since due to + // the resolution step we know they are the same (otherwise we would have + // renamed it). + if ( isset( $langConfig['filter'][$filter] ) && + !isset( $config['filter'][$filter] ) ) { + $config['filter'][$filter] = $langConfig['filter'][$filter]; + } + } + } + if ( !empty( $analyzer['char_filter'] ) ) { + // Add private char_filters for this analyzer + foreach ( $analyzer['char_filter'] as $filter ) { + // Here unlike above we do not check for $langConfig since we assume + // language config is not broken and all char filters are namespaced + // nicely, so if the filter is mentioned in analyzer it is also defined. + if ( !isset( $config['char_filter'][$filter] ) ) { + $config['char_filter'][$filter] = $langConfig['char_filter'][$filter]; + } + } + } + } + + /** + * Create per-language configs for specific analyzers which separates and namespaces + * filters that are different between languages. + * @param array[] $config Existing config, will be modified + * @param string[] $languages List of languages to process + * @param string[] $analyzers List of analyzers to process + */ + public function buildLanguageConfigs( array &$config, array $languages, array $analyzers ) { + $defaultFilters = $this->getDefaultFilters( $config, $analyzers ); + foreach ( $languages as $lang ) { + $langConfig = $this->buildConfig( $lang ); + $defaultFilters += $this->getDefaultFilters( $langConfig, $analyzers ); + } + foreach ( $languages as $lang ) { + $langConfig = $this->buildConfig( $lang ); + // Analyzer is: tokenizer + filter + char_filter + // Tokenizers don't seem to be subject to customization now + // Char filters are nicely namespaced + // Filters are NOT - e.g. lowercase & icu_folding filters are different for different + // languages! So we need to do some disambiguation here. + $langConfig['filter'] = $this->resolveFilters( $langConfig, $config['filter'], $defaultFilters, $lang ); + // Merge configs + foreach ( $analyzers as $analyzer ) { + $this->mergeConfig( $config, $langConfig, $analyzer, $lang ); + } + } + } + + /** * @return bool true if the icu analyzer is available. */ public function isIcuAvailable() { -- To view, visit https://gerrit.wikimedia.org/r/393689 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Smalyshev <smalys...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits