Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/123885
Change subject: WIP: Add support for the experimental highlighter ...................................................................... WIP: Add support for the experimental highlighter This highlighter must be installed as an Elasticsearch plugin before you can use it. Once you have, you can turn it on and use it without making any changes to the index. You can also instruct Cirrus to optimize the index for the experimental highlighter. If you do, then the next time you reindex, highlighting will speed up and the index should shrink. If you turn off the experimental highlighter after you've optimized the index for it, searches will crash. To turn it off you'll have to turn off the optimization, reindex, and then turn off the highlighter. Also fixed bug 54411 in the process because it was in the way. Bug: 60141 Bug: 54411 Bug: 54526 Change-Id: Ie546c1b50e6394b8f100766d3fb7ee1b5a0aaf8e --- M CirrusSearch.php M includes/MappingConfigBuilder.php M includes/ResultsType.php M maintenance/updateOneSearchIndexConfig.php 4 files changed, 145 insertions(+), 47 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/85/123885/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index 0b223ac..c718839 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -57,6 +57,23 @@ // don't contain user information. $wgCirrusSearchSlowSearch = 10.0; +// Should CirrusSearch attempt to use the "experimental" highlighter. It is an +// Elasticsearch plugin that should produce better snippets for search results. +// Installation instructions are here: +// https://github.com/wikimedia/search-highlighter +// If you have the highlighter installed you can switch this on and off so long +// as you don't rebuild the index while +// $wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true. Setting it +// to true without the highlighter installed will break search. 
+$wgCirrusSearchUseExperimentalHighlighter = false; + +// Should CirrusSearch optimize the index for the experimental highlighter. +// This will speed up indexing, save a ton of space, and speed up highlighting +// slightly. This only takes effect if you rebuild the index. The downside is +// that you can no longer switch $wgCirrusSearchUseExperimentalHighlighter on +// and off - it has to stay on. +$wgCirrusSearchOptimizeIndexForExperimentalHighlighter = false; + // By default, Cirrus will organize pages into one of two indexes (general or // content) based on whether a page is in a content namespace. This should // suffice for most wikis. This setting allows individual namespaces to be diff --git a/includes/MappingConfigBuilder.php b/includes/MappingConfigBuilder.php index ee9f0d9..7a57a8c 100644 --- a/includes/MappingConfigBuilder.php +++ b/includes/MappingConfigBuilder.php @@ -25,6 +25,7 @@ const MINIMAL = 0; const ENABLE_NORMS = 1; const COPY_TO_SUGGEST = 2; + const SPEED_UP_HIGHLIGHTING = 4; /** * Version number for the core analysis. Increment the major @@ -47,13 +48,20 @@ private $phraseUseText; /** + * @var bool should the index be optimized for the experimental highlighter? + */ + private $optimizeForExperimentalHighlighter; + + /** * Constructor * @param bool $anyWord Prefix search on any word * @param bool $useText Text uses suggestion analyzer + * @param bool $optimizeForExperimentalHighlighter should the index be optimized for the experimental highlighter? 
*/ - public function __construct( $anyWord, $useText ) { + public function __construct( $anyWord, $useText, $optimizeForExperimentalHighlighter ) { $this->prefixSearchStartsWithAnyWord = $anyWord; $this->phraseUseText = $useText; + $this->optimizeForExperimentalHighlighter = $optimizeForExperimentalHighlighter; } /** @@ -79,7 +87,7 @@ } $textExtraAnalyzers = array(); - $textOptions = MappingConfigBuilder::ENABLE_NORMS; + $textOptions = MappingConfigBuilder::ENABLE_NORMS | MappingConfigBuilder::SPEED_UP_HIGHLIGHTING; if ( $this->phraseUseText ) { $textExtraAnalyzers[] = $suggestExtra; $textOptions |= MappingConfigBuilder::COPY_TO_SUGGEST; @@ -111,13 +119,14 @@ 'template' => $this->buildLowercaseKeywordField(), 'outgoing_link' => $this->buildKeywordField(), 'external_link' => $this->buildKeywordField(), - 'heading' => $this->buildStringField( MappingConfigBuilder::MINIMAL ), + 'heading' => $this->buildStringField( MappingConfigBuilder::SPEED_UP_HIGHLIGHTING ), 'text_bytes' => $this->buildLongField( false ), 'redirect' => array( 'dynamic' => false, 'properties' => array( 'namespace' => $this->buildLongField(), - 'title' => $this->buildStringField( MappingConfigBuilder::COPY_TO_SUGGEST, + 'title' => $this->buildStringField( + MappingConfigBuilder::COPY_TO_SUGGEST | MappingConfigBuilder::SPEED_UP_HIGHLIGHTING, $titleExtraAnalyzers ), ) ), @@ -140,6 +149,8 @@ * ENABLE_NORMS: Gnable norms on the field. Good for text you search against but bad for array fields and useless * for fields that don't get involved in the score. * COPY_TO_SUGGEST: Copy the contents of this field to the suggest field for "Did you mean". + * SPEED_UP_HIGHLIGHTING: Store extra data in the field to speed up highlighting. This is important for long + * strings or fields with many values. 
* @return array definition of the field */ public function buildStringField( $options, $extra = array() ) { @@ -147,15 +158,23 @@ $field = array( 'type' => 'string', 'analyzer' => 'text', - 'term_vector' => 'with_positions_offsets', 'fields' => array( 'plain' => array( 'type' => 'string', 'analyzer' => 'plain', - 'term_vector' => 'with_positions_offsets', ), ) ); + if ( $this->optimizeForExperimentalHighlighter ) { + if ( $options & MappingConfigBuilder::SPEED_UP_HIGHLIGHTING ) { + $field[ 'index_options' ] = 'offsets'; + $field[ 'fields' ][ 'plain' ][ 'index_options' ] = 'offsets'; + } + } else { + // We use the FVH on all fields so turn on term vectors + $field[ 'term_vector' ] = 'with_positions_offsets'; + $field[ 'fields' ][ 'plain' ][ 'term_vector' ] = 'with_positions_offsets'; + } $disableNorms = ( $options & MappingConfigBuilder::ENABLE_NORMS ) === 0; if ( $disableNorms ) { $disableNorms = array( 'norms' => array( 'enabled' => false ) ); diff --git a/includes/ResultsType.php b/includes/ResultsType.php index e3cbecd..ef76e70 100644 --- a/includes/ResultsType.php +++ b/includes/ResultsType.php @@ -62,21 +62,40 @@ } public function getHighlightingConfiguration() { - // This is similar to the FullTextResults type but against the near_match and - // with the plain highlighter. Near match because that is how the field is - // queried. Plain highlighter because we don't want to add the FVH's space - // overhead for storing extra stuff and we don't need it for combining fields. 
- $entireValue = array( - 'number_of_fragments' => 0, - 'type' => 'plain', - ); - $entireValueInListField = array( - 'number_of_fragments' => 1, // Just one of the values in the list - 'fragment_size' => 10000, // We want the whole value but more than this is crazy - 'type' => 'plain', - ); + global $wgCirrusSearchUseExperimentalHighlighter; + + if ( $wgCirrusSearchUseExperimentalHighlighter ) { + // This is much less esoteric than the plain highlighter based + // invocation but does the same thing. The magic is that the 'none' + // fragmenter still fragments on multi-valued fields. + $entireValue = array( + 'type' => 'experimental', + 'fragmenter' => 'none', + 'number_of_fragments' => 1, + ); + $entireValueInListField = array( + 'type' => 'experimental', + 'fragmenter' => 'none', + 'order' => 'score', + 'number_of_fragments' => 1, + ); + } else { + // This is similar to the FullTextResults type but against the near_match and + // with the plain highlighter. Near match because that is how the field is + // queried. Plain highlighter because we don't want to add the FVH's space + // overhead for storing extra stuff and we don't need it for combining fields. 
+ $entireValue = array( + 'type' => 'plain', + 'number_of_fragments' => 0, + ); + $entireValueInListField = array( + 'type' => 'plain', + 'fragment_size' => 10000, // We want the whole value but more than this is crazy + 'order' => 'score', + 'number_of_fragments' => 1, // Just one of the values in the list + ); + } return array( - 'order' => 'score', 'pre_tags' => array( Searcher::HIGHLIGHT_PRE ), 'post_tags' => array( Searcher::HIGHLIGHT_POST ), 'fields' => array( @@ -139,27 +158,63 @@ * @return array of highlighting configuration */ public function getHighlightingConfiguration() { - $entireValue = array( - 'number_of_fragments' => 0, - 'type' => 'fvh', - ); - $entireValueInListField = array( - 'number_of_fragments' => 1, // Just one of the values in the list - 'fragment_size' => 10000, // We want the whole value but more than this is crazy - 'type' => 'plain', // TODO switch to fvh when Elasticserach issue 3757 is fixed - ); - $singleFragment = array( - 'number_of_fragments' => 1, // Just one fragment - 'fragment_size' => 100, - 'type' => 'fvh', - ); + global $wgCirrusSearchUseExperimentalHighlighter; - // If there isn't a match just return a match sized chunk from the beginning of the page. - $text = $singleFragment; - $text[ 'no_match_size' ] = $text[ 'fragment_size' ]; + if ( $wgCirrusSearchUseExperimentalHighlighter ) { + $entireValue = array( + 'type' => 'experimental', + 'fragmenter' => 'none', + 'number_of_fragments' => 1, + ); + $entireValueInListField = array( + 'type' => 'experimental', + 'fragmenter' => 'none', + 'order' => 'score', + 'number_of_fragments' => 1, + ); + $singleFragment = array( + 'type' => 'experimental', + 'number_of_fragments' => 1, + 'fragmenter' => 'sentence', + 'options' => array( + 'locale' => wfGetLangObj()->getCode(), + 'top_scoring' => true, + 'boost_before' => array( + // Note these values are super arbitrary right now. 
+ '20' => 8, + '50' => 7, + '200' => 4, + '1000' => 2, + ), + ), + ); + // If there isn't a match just return some of the first few sentences. + $text = $singleFragment; + $text[ 'no_match_size' ] = 100; + } else { + $entireValue = array( + 'number_of_fragments' => 0, + 'type' => 'fvh', + 'order' => 'score', + ); + $entireValueInListField = array( + 'number_of_fragments' => 1, // Just one of the values in the list + 'fragment_size' => 10000, // We want the whole value but more than this is crazy + 'type' => 'fvh', + 'order' => 'score', + ); + $singleFragment = array( + 'number_of_fragments' => 1, // Just one fragment + 'fragment_size' => 100, + 'type' => 'fvh', + 'order' => 'score', + ); + // If there isn't a match just return a match sized chunk from the beginning of the page. + $text = $singleFragment; + $text[ 'no_match_size' ] = $text[ 'fragment_size' ]; + } return array( - 'order' => 'score', 'pre_tags' => array( Searcher::HIGHLIGHT_PRE ), 'post_tags' => array( Searcher::HIGHLIGHT_POST ), 'fields' => $this->addMatchedFields( array( @@ -168,9 +223,6 @@ 'file_text' => $singleFragment, 'redirect.title' => $entireValueInListField, 'heading' => $entireValueInListField, - // TODO remove when Elasticsearch issue 3757 is fixed - 'redirect.title.plain' => $entireValueInListField, - 'heading.plain' => $entireValueInListField, ) ), ); } @@ -181,11 +233,8 @@ } private function addMatchedFields( $fields ) { + $newFields = array(); foreach ( $fields as $name => $config ) { - // TODO remove when Elasticsearch issue 3757 is fixed - if ( $config[ 'type' ] !== 'fvh' ) { - continue; - } $config[ 'matched_fields' ] = array( $name, "$name.plain" ); $fields[ $name ] = $config; } diff --git a/maintenance/updateOneSearchIndexConfig.php b/maintenance/updateOneSearchIndexConfig.php index f8492fe..fba83f9 100644 --- a/maintenance/updateOneSearchIndexConfig.php +++ b/maintenance/updateOneSearchIndexConfig.php @@ -350,9 +350,19 @@ } private function validateMapping() { + global
$wgCirrusSearchOptimizeIndexForExperimentalHighlighter; + $this->output( $this->indent . "Validating mappings..." ); + if ( $wgCirrusSearchOptimizeIndexForExperimentalHighlighter && + !in_array( 'experimental highlighter', $this->availablePlugins ) ) { + $this->output( "impossible!\n" ); + $this->error( "wgCirrusSearchOptimizeIndexForExperimentalHighlighter is set to true but the " . + "'experimental highlighter' plugin is not installed on all hosts.", 1 ); + } + $requiredPageMappings = new MappingConfigBuilder( - $this->prefixSearchStartsWithAny, $this->phraseUseText ); + $this->prefixSearchStartsWithAny, $this->phraseUseText, + $wgCirrusSearchOptimizeIndexForExperimentalHighlighter ); $requiredPageMappings = $requiredPageMappings->buildConfig(); if ( !$this->checkMapping( $requiredPageMappings ) ) { @@ -676,6 +686,8 @@ } private function reindexInternal( $children, $childNumber ) { + global $wgCirrusSearchOptimizeIndexForExperimentalHighlighter; + $filter = null; $messagePrefix = ""; if ( $childNumber === 1 && $children === 1 ) { @@ -691,7 +703,8 @@ "(doc['_uid'].value.hashCode() & Integer.MAX_VALUE) % $children == $childNumber" ); } $pageProperties = new MappingConfigBuilder( - $this->prefixSearchStartsWithAny, $this->phraseUseText ); + $this->prefixSearchStartsWithAny, $this->phraseUseText, + $wgCirrusSearchOptimizeIndexForExperimentalHighlighter ); $pageProperties = $pageProperties->buildConfig(); $pageProperties = $pageProperties[ 'properties' ]; try { -- To view, visit https://gerrit.wikimedia.org/r/123885 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ie546c1b50e6394b8f100766d3fb7ee1b5a0aaf8e Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
