Smalyshev has uploaded a new change for review. (
https://gerrit.wikimedia.org/r/393689 )
Change subject: [WIP] Port per-language indexing code from Wikibase to
CirrusSearch
......................................................................
[WIP] Port per-language indexing code from Wikibase to CirrusSearch
Bug: T176903
Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa
---
M includes/Maintenance/AnalysisConfigBuilder.php
1 file changed, 144 insertions(+), 0 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/89/393689/1
diff --git a/includes/Maintenance/AnalysisConfigBuilder.php
b/includes/Maintenance/AnalysisConfigBuilder.php
index 0ee472b..9615687 100644
--- a/includes/Maintenance/AnalysisConfigBuilder.php
+++ b/includes/Maintenance/AnalysisConfigBuilder.php
@@ -987,6 +987,150 @@
}
/**
+ * Get list of filters that are mentioned in analyzers but not defined
+ * explicitly.
+ * @param array[] $config Full configuration array
+ * @param string[] $analyzers List of analyzers to consider.
+ * @return array List of default filters, each containing only the filter
type
+ */
+ private function getDefaultFilters( array &$config, array $analyzers )
{
+ $defaultFilters = [];
+ foreach ( $analyzers as $analyzer ) {
+ if ( empty( $config['analyzer'][$analyzer]['filter'] )
) {
+ continue;
+ }
+ foreach ( $config['analyzer'][$analyzer]['filter'] as
$filterName ) {
+ if ( !isset( $config['filter'][$filterName] ) )
{
+ // This is the default definition for the
built-in filter
+ $defaultFilters[$filterName] = [ 'type'
=> $filterName ];
+ }
+ }
+ }
+ return $defaultFilters;
+ }
+
+ /**
+ * Check every filter in the config - if it's the same as in the old config,
+ * ignore it. If it has the same name but different content, create a
new filter
+ * with a different name by prefixing it with the language name.
+ *
+ * @param array[] $config Configuration being processed
+ * @param array[] $standardFilters Existing filters list
+ * @param array[] $defaultFilters List of default filters already
mentioned in the config
+ * @param string $prefix Prefix for disambiguation
+ * @return array[] The list of filters not in the old config.
+ */
+ private function resolveFilters( array &$config, array
$standardFilters, array $defaultFilters, $prefix ) {
+ $resultFilters = [];
+ foreach ( $config['filter'] as $name => $filter ) {
+ $existingFilter = null;
+ if ( isset( $standardFilters[$name] ) ) {
+ $existingFilter = $standardFilters[$name];
+ } elseif ( isset( $defaultFilters[$name] ) ) {
+ $existingFilter = $defaultFilters[$name];
+ }
+
+ if ( $existingFilter ) { // Filter with this name
already exists
+ if ( $existingFilter != $filter ) {
+ // filter with the same name but
different config - need to
+ // rename by adding prefix
+ $newName = $prefix . '_' . $name;
+ $this->replaceFilter( $config, $name,
$newName );
+ $resultFilters[$newName] = $filter;
+ }
+ } else {
+ $resultFilters[$name] = $filter;
+ }
+ }
+ return $resultFilters;
+ }
+
+ /**
+ * Replace a given filter name in all analyzer configs with a different name.
+ * @param array[] $config Configuration being processed
+ * @param string $oldName
+ * @param string $newName
+ */
+ private function replaceFilter( array &$config, $oldName, $newName ) {
+ foreach ( $config['analyzer'] as &$analyzer ) {
+ if ( !isset( $analyzer['filter'] ) ) {
+ continue;
+ }
+ $analyzer['filter'] = array_map( function ( $filter )
use ( $oldName, $newName ) {
+ if ( $filter === $oldName ) {
+ return $newName;
+ }
+ return $filter;
+ }, $analyzer['filter'] );
+ }
+ }
+
+ /**
+ * Merge per-language config into the main config.
+ * It will copy the specific analyzer and all dependent filters and
char_filters.
+ * @param array $config Main config
+ * @param array $langConfig Per-language config
+ * @param string $name Name for analyzer whose config we're merging
+ * @param string $prefix Prefix for this configuration
+ */
+ private function mergeConfig( array &$config, array $langConfig, $name,
$prefix ) {
+ $analyzer = $langConfig['analyzer'][$name];
+ $config['analyzer'][$prefix . '_' . $name] = $analyzer;
+ if ( !empty( $analyzer['filter'] ) ) {
+ // Add private filters for this analyzer
+ foreach ( $analyzer['filter'] as $filter ) {
+ // Copy filters that are in the language config but
not in the main config.
+ // We do not copy a filter that is already in the
main config because, thanks to
+ // the resolution step, we know the two are the
same (otherwise we would have
+ // renamed it).
+ if ( isset( $langConfig['filter'][$filter] ) &&
+ !isset( $config['filter'][$filter] ) ) {
+ $config['filter'][$filter] =
$langConfig['filter'][$filter];
+ }
+ }
+ }
+ if ( !empty( $analyzer['char_filter'] ) ) {
+ // Add private char_filters for this analyzer
+ foreach ( $analyzer['char_filter'] as $filter ) {
+ // Here, unlike above, we do not check
$langConfig since we assume
+ // the language config is not broken and all char
filters are namespaced
+ // nicely, so if the filter is mentioned in the
analyzer it is also defined.
+ if ( !isset( $config['char_filter'][$filter] )
) {
+ $config['char_filter'][$filter] =
$langConfig['char_filter'][$filter];
+ }
+ }
+ }
+ }
+
+ /**
+ * Create per-language configs for specific analyzers, separating
and namespacing
+ * filters that differ between languages.
+ * @param array[] $config Existing config, will be modified
+ * @param string[] $languages List of languages to process
+ * @param string[] $analyzers List of analyzers to process
+ */
+ public function buildLanguageConfigs( array &$config, array $languages,
array $analyzers ) {
+ $defaultFilters = $this->getDefaultFilters( $config, $analyzers
);
+ foreach ( $languages as $lang ) {
+ $langConfig = $this->buildConfig( $lang );
+ $defaultFilters += $this->getDefaultFilters(
$langConfig, $analyzers );
+ }
+ foreach ( $languages as $lang ) {
+ $langConfig = $this->buildConfig( $lang );
+ // Analyzer is: tokenizer + filter + char_filter
+ // Tokenizers don't seem to be subject to customization
now
+ // Char filters are nicely namespaced
+ // Filters are NOT - e.g. lowercase & icu_folding
filters are different for different
+ // languages! So we need to do some disambiguation here.
+ $langConfig['filter'] = $this->resolveFilters(
$langConfig, $config['filter'], $defaultFilters, $lang );
+ // Merge configs
+ foreach ( $analyzers as $analyzer ) {
+ $this->mergeConfig( $config, $langConfig,
$analyzer, $lang );
+ }
+ }
+ }
+
+ /**
* @return bool true if the icu analyzer is available.
*/
public function isIcuAvailable() {
--
To view, visit https://gerrit.wikimedia.org/r/393689
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I6abaf6b75aac86b39d416372c612e4099bfddfaa
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits