EBernhardson has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/342141 )
Change subject: [WIP] Alternate fix for swedish folding ...................................................................... [WIP] Alternate fix for swedish folding Reworks the swedish analysis chain to introduce asciifolding. Depending on Cirrus config this might be switched from asciifolding to icu normalization. The problem with this approach is that it is a much larger change, in terms of changes to result sets. A different approach might be to add a char filter only for specific changes we want to make, like ä => a. After we downgrade relforge to 5.1.2 we will load some indices and see how big of a change this actually is, and if we should consider the more limited approach. Bug: T155822 Change-Id: I47fb0377b0090e096a657d6d9a8d3029bea9d6af --- M includes/Maintenance/AnalysisConfigBuilder.php 1 file changed, 27 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/41/342141/1 diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php index a42f87e..4b4eb82 100644 --- a/includes/Maintenance/AnalysisConfigBuilder.php +++ b/includes/Maintenance/AnalysisConfigBuilder.php @@ -803,6 +803,33 @@ case 'turkish': $config[ 'filter' ][ 'lowercase' ][ 'language' ] = 'turkish'; break; + case 'swedish': + // Add asciifolding_preserve to filters + $config[ 'analyzer' ][ 'lowercase_keyword' ][ 'filter' ][] = 'asciifolding_preserve'; + + // Unpack built-in swedish analyzer to add asciifolding_preserve + $config['filter']['swedish_stop'] = [ + 'type' => 'stop', + 'stopwords' => '_swedish_', + ]; + $config['filter']['swedish_stemmer'] = [ + 'type' => 'stemmer', + 'language' => 'swedish', + ]; + + $config['analyzer']['text'] = [ + 'tokenizer' => 'standard', + 'filter' => [ + 'lowercase', + 'swedish_stop', + 'swedish_stemmer', + 'asciifolding_preserve', + ], + ]; + + // In Swedish text_search is just a copy of text + $config['analyzer']['text_search'] = 
$config['analyzer']['text']; + break; } if ( $this->icu ) { foreach ( $config[ 'analyzer' ] as &$analyzer ) { -- To view, visit https://gerrit.wikimedia.org/r/342141 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I47fb0377b0090e096a657d6d9a8d3029bea9d6af Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits