Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/123885

Change subject: WIP:  Add support for the experimental highlighter
......................................................................

WIP:  Add support for the experimental highlighter

This highlighter must be installed as an Elasticsearch plugin before you
can use it.  Once it is installed, you can turn it on and use it without
making any changes to the index.  You can also instruct Cirrus to optimize
the index for the experimental highlighter.  If you do, then the next time
you reindex, highlighting will speed up and the index should shrink.  If you
turn off the experimental highlighter after you've optimized the index for
it, searches will crash.  To turn it off, you'll have to turn off the
optimization, reindex, and then turn off the highlighter.

Fixed bug 54411 in the process because it was in the way.

Bug: 60141
Bug: 54411
Bug: 54526

Change-Id: Ie546c1b50e6394b8f100766d3fb7ee1b5a0aaf8e
---
M CirrusSearch.php
M includes/MappingConfigBuilder.php
M includes/ResultsType.php
M maintenance/updateOneSearchIndexConfig.php
4 files changed, 145 insertions(+), 47 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/85/123885/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 0b223ac..c718839 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -57,6 +57,23 @@
 // don't contain user information.
 $wgCirrusSearchSlowSearch = 10.0;
 
+// Should CirrusSearch attempt to use the "experimental" highlighter.  It is an
+// Elasticsearch plugin that should produce better snippets for search results.
+// Installation instructions are here:
+// https://github.com/wikimedia/search-highlighter
+// If you have the highlighter installed you can switch this on and off so long
+// as you don't rebuild the index while
+// $wgCirrusSearchOptimizeIndexForExperimentalHighlighter is true.  Setting it
+// to true without the highlighter installed will break search.
+$wgCirrusSearchUseExperimentalHighlighter = false;
+
+// Should CirrusSearch optimize the index for the experimental highlighter.
+// This will speed up indexing, save a ton of space, and speed up highlighting
+// slightly.  This only takes effect if you rebuild the index. The downside is
+// that you can no longer switch $wgCirrusSearchUseExperimentalHighlighter on
+// and off - it has to stay on.
+$wgCirrusSearchOptimizeIndexForExperimentalHighlighter = false;
+
 // By default, Cirrus will organize pages into one of two indexes (general or
 // content) based on whether a page is in a content namespace. This should
 // suffice for most wikis. This setting allows individual namespaces to be
diff --git a/includes/MappingConfigBuilder.php 
b/includes/MappingConfigBuilder.php
index ee9f0d9..7a57a8c 100644
--- a/includes/MappingConfigBuilder.php
+++ b/includes/MappingConfigBuilder.php
@@ -25,6 +25,7 @@
        const MINIMAL = 0;
        const ENABLE_NORMS = 1;
        const COPY_TO_SUGGEST = 2;
+       const SPEED_UP_HIGHLIGHTING = 4;
 
        /**
         * Version number for the core analysis. Increment the major
@@ -47,13 +48,20 @@
        private $phraseUseText;
 
        /**
+        * @var bool should the index be optimized for the experimental 
highlighter?
+        */
+       private $optimizeForExperimentalHighlighter;
+
+       /**
         * Constructor
         * @param bool $anyWord Prefix search on any word
         * @param bool $useText Text uses suggestion analyzer
+        * @param bool $optimizeForExperimentalHighlighter should the index be 
optimized for the experimental highlighter?
         */
-       public function __construct( $anyWord, $useText ) {
+       public function __construct( $anyWord, $useText, 
$optimizeForExperimentalHighlighter ) {
                $this->prefixSearchStartsWithAnyWord = $anyWord;
                $this->phraseUseText = $useText;
+               $this->optimizeForExperimentalHighlighter = 
$optimizeForExperimentalHighlighter;
        }
 
        /**
@@ -79,7 +87,7 @@
                }
 
                $textExtraAnalyzers = array();
-               $textOptions = MappingConfigBuilder::ENABLE_NORMS;
+               $textOptions = MappingConfigBuilder::ENABLE_NORMS | 
MappingConfigBuilder::SPEED_UP_HIGHLIGHTING;
                if ( $this->phraseUseText ) {
                        $textExtraAnalyzers[] = $suggestExtra;
                        $textOptions |= MappingConfigBuilder::COPY_TO_SUGGEST;
@@ -111,13 +119,14 @@
                                'template' => 
$this->buildLowercaseKeywordField(),
                                'outgoing_link' => $this->buildKeywordField(),
                                'external_link' => $this->buildKeywordField(),
-                               'heading' => $this->buildStringField( 
MappingConfigBuilder::MINIMAL ),
+                               'heading' => $this->buildStringField( 
MappingConfigBuilder::SPEED_UP_HIGHLIGHTING ),
                                'text_bytes' => $this->buildLongField( false ),
                                'redirect' => array(
                                        'dynamic' => false,
                                        'properties' => array(
                                                'namespace' =>  
$this->buildLongField(),
-                                               'title' => 
$this->buildStringField( MappingConfigBuilder::COPY_TO_SUGGEST,
+                                               'title' => 
$this->buildStringField(
+                                                       
MappingConfigBuilder::COPY_TO_SUGGEST | 
MappingConfigBuilder::SPEED_UP_HIGHLIGHTING,
                                                        $titleExtraAnalyzers ),
                                        )
                                ),
@@ -140,6 +149,8 @@
         *   ENABLE_NORMS: Gnable norms on the field.  Good for text you search 
against but bad for array fields and useless
         *     for fields that don't get involved in the score.
         *   COPY_TO_SUGGEST: Copy the contents of this field to the suggest 
field for "Did you mean".
+        *   SPEED_UP_HIGHLIGHTING: Store extra data in the field to speed up 
highlighting.  This is important for long
+        *     strings or fields with many values.
         * @return array definition of the field
         */
        public function buildStringField( $options, $extra = array() ) {
@@ -147,15 +158,23 @@
                $field = array(
                        'type' => 'string',
                        'analyzer' => 'text',
-                       'term_vector' => 'with_positions_offsets',
                        'fields' => array(
                                'plain' => array(
                                        'type' => 'string',
                                        'analyzer' => 'plain',
-                                       'term_vector' => 
'with_positions_offsets',
                                ),
                        )
                );
+               if ( $this->optimizeForExperimentalHighlighter ) {
+                       if ( $options & 
MappingConfigBuilder::SPEED_UP_HIGHLIGHTING ) {
+                               $field[ 'index_options' ] = 'offsets';
+                               $field[ 'fields' ][ 'plain' ][ 'index_options' 
] = 'offsets';
+                       }
+               } else {
+                       // We use the FVH on all fields so turn on term vectors
+                       $field[ 'term_vector' ] = 'with_positions_offsets';
+                       $field[ 'fields' ][ 'plain' ][ 'term_vector' ] = 
'with_positions_offsets';
+               }
                $disableNorms = ( $options & MappingConfigBuilder::ENABLE_NORMS 
) === 0;
                if ( $disableNorms ) {
                        $disableNorms = array( 'norms' => array( 'enabled' => 
false ) );
diff --git a/includes/ResultsType.php b/includes/ResultsType.php
index e3cbecd..ef76e70 100644
--- a/includes/ResultsType.php
+++ b/includes/ResultsType.php
@@ -62,21 +62,40 @@
        }
 
        public function getHighlightingConfiguration() {
-               // This is similar to the FullTextResults type but against the 
near_match and
-               // with the plain highlighter.  Near match because that is how 
the field is
-               // queried.  Plain highlighter because we don't want to add the 
FVH's space
-               // overhead for storing extra stuff and we don't need it for 
combining fields.
-               $entireValue = array(
-                       'number_of_fragments' => 0,
-                       'type' => 'plain',
-               );
-               $entireValueInListField = array(
-                       'number_of_fragments' => 1, // Just one of the values 
in the list
-                       'fragment_size' => 10000,   // We want the whole value 
but more than this is crazy
-                       'type' => 'plain',
-               );
+               global $wgCirrusSearchUseExperimentalHighlighter;
+
+               if ( $wgCirrusSearchUseExperimentalHighlighter ) {
+                       // This is much less esoteric than the plain 
highlighter based
+                       // invocation but does the same thing.  The magic is 
that the none
+                       // fragmenter still fragments on multi valued fields.
+                       $entireValue = array(
+                               'type' => 'experimental',
+                               'fragmenter' => 'none',
+                               'number_of_fragments' => 1,
+                       );
+                       $entireValueInListField = array(
+                               'type' => 'experimental',
+                               'fragmenter' => 'none',
+                               'order' => 'score',
+                               'number_of_fragments' => 1,
+                       );
+               } else {
+                       // This is similar to the FullTextResults type but 
against the near_match and
+                       // with the plain highlighter.  Near match because that 
is how the field is
+                       // queried.  Plain highlighter because we don't want to 
add the FVH's space
+                       // overhead for storing extra stuff and we don't need 
it for combining fields.
+                       $entireValue = array(
+                               'type' => 'plain',
+                               'number_of_fragments' => 0,
+                       );
+                       $entireValueInListField = array(
+                               'type' => 'plain',
+                               'fragment_size' => 10000,   // We want the 
whole value but more than this is crazy
+                               'order' => 'score',
+                               'number_of_fragments' => 1, // Just one of the 
values in the list
+                       );
+               }
                return array(
-                       'order' => 'score',
                        'pre_tags' => array( Searcher::HIGHLIGHT_PRE ),
                        'post_tags' => array( Searcher::HIGHLIGHT_POST ),
                        'fields' => array(
@@ -139,27 +158,63 @@
         * @return array of highlighting configuration
         */
        public function getHighlightingConfiguration() {
-               $entireValue = array(
-                       'number_of_fragments' => 0,
-                       'type' => 'fvh',
-               );
-               $entireValueInListField = array(
-                       'number_of_fragments' => 1, // Just one of the values 
in the list
-                       'fragment_size' => 10000,   // We want the whole value 
but more than this is crazy
-                       'type' => 'plain',          // TODO switch to fvh when 
Elasticserach issue 3757 is fixed
-               );
-               $singleFragment = array(
-                       'number_of_fragments' => 1, // Just one fragment
-                       'fragment_size' => 100,
-                       'type' => 'fvh',
-               );
+               global $wgCirrusSearchUseExperimentalHighlighter;
 
-               // If there isn't a match just return a match sized chunk from 
the beginning of the page.
-               $text = $singleFragment;
-               $text[ 'no_match_size' ] = $text[ 'fragment_size' ];
+               if ( $wgCirrusSearchUseExperimentalHighlighter ) {
+                       $entireValue = array(
+                               'type' => 'experimental',
+                               'fragmenter' => 'none',
+                               'number_of_fragments' => 1,
+                       );
+                       $entireValueInListField = array(
+                               'type' => 'experimental',
+                               'fragmenter' => 'none',
+                               'order' => 'score',
+                               'number_of_fragments' => 1,
+                       );
+                       $singleFragment = array(
+                               'type' => 'experimental',
+                               'number_of_fragments' => 1,
+                               'fragmenter' => 'sentence',
+                               'options' => array(
+                                       'locale' => wfGetLangObj()->getCode(),
+                                       'top_scoring' => true,
+                                       'boost_before' => array(
+                                               // Note these values are super 
arbitrary right now.
+                                               '20' => 8,
+                                               '50' => 7,
+                                               '200' => 4,
+                                               '1000' => 2,
+                                       ),
+                               ),
+                       );
+                       // If there isn't a match just return some of the 
first few sentences.
+                       $text = $singleFragment;
+                       $text[ 'no_match_size' ] = 100;
+               } else {
+                       $entireValue = array(
+                               'number_of_fragments' => 0,
+                               'type' => 'fvh',
+                               'order' => 'score',
+                       );
+                       $entireValueInListField = array(
+                               'number_of_fragments' => 1, // Just one of the 
values in the list
+                               'fragment_size' => 10000,   // We want the 
whole value but more than this is crazy
+                               'type' => 'fvh',
+                               'order' => 'score',
+                       );
+                       $singleFragment = array(
+                               'number_of_fragments' => 1, // Just one fragment
+                               'fragment_size' => 100,
+                               'type' => 'fvh',
+                               'order' => 'score',
+                       );
+                       // If there isn't a match just return a match sized 
chunk from the beginning of the page.
+                       $text = $singleFragment;
+                       $text[ 'no_match_size' ] = $text[ 'fragment_size' ];
+               }
 
                return array(
-                       'order' => 'score',
                        'pre_tags' => array( Searcher::HIGHLIGHT_PRE ),
                        'post_tags' => array( Searcher::HIGHLIGHT_POST ),
                        'fields' => $this->addMatchedFields( array(
@@ -168,9 +223,6 @@
                                'file_text' => $singleFragment,
                                'redirect.title' => $entireValueInListField,
                                'heading' => $entireValueInListField,
-                               // TODO remove when Elasticsearch issue 3757 is 
fixed
-                               'redirect.title.plain' => 
$entireValueInListField,
-                               'heading.plain' => $entireValueInListField,
                        ) ),
                );
        }
@@ -181,11 +233,8 @@
        }
 
        private function addMatchedFields( $fields ) {
+               $newFields = array();
                foreach ( $fields as $name => $config ) {
-                       // TODO remove when Elasticsearch issue 3757 is fixed
-                       if ( $config[ 'type' ] !== 'fvh' ) {
-                               continue;
-                       }
                        $config[ 'matched_fields' ] = array( $name, 
"$name.plain" );
                        $fields[ $name ] = $config;
                }
diff --git a/maintenance/updateOneSearchIndexConfig.php 
b/maintenance/updateOneSearchIndexConfig.php
index f8492fe..fba83f9 100644
--- a/maintenance/updateOneSearchIndexConfig.php
+++ b/maintenance/updateOneSearchIndexConfig.php
@@ -350,9 +350,19 @@
        }
 
        private function validateMapping() {
+               global $wgCirrusSearchOptimizeIndexForExperimentalHighlighter;
+
                $this->output( $this->indent . "Validating mappings..." );
+               if ( $wgCirrusSearchOptimizeIndexForExperimentalHighlighter &&
+                               !in_array( 'experimental highlighter', 
$this->availablePlugins ) ) {
+                       $this->output( "impossible!\n" );
+                       $this->error( 
"wgCirrusSearchOptimizeIndexForExperimentalHighlighter is set to true but the " 
.
+                               "'experimental highlighter' plugin is not 
installed on all hosts.", 1 );
+               }
+
                $requiredPageMappings = new MappingConfigBuilder(
-                       $this->prefixSearchStartsWithAny, $this->phraseUseText 
);
+                       $this->prefixSearchStartsWithAny, $this->phraseUseText,
+                       $wgCirrusSearchOptimizeIndexForExperimentalHighlighter 
);
                $requiredPageMappings = $requiredPageMappings->buildConfig();
 
                if ( !$this->checkMapping( $requiredPageMappings ) ) {
@@ -676,6 +686,8 @@
        }
 
        private function reindexInternal( $children, $childNumber ) {
+               global $wgCirrusSearchOptimizeIndexForExperimentalHighlighter;
+
                $filter = null;
                $messagePrefix = "";
                if ( $childNumber === 1 && $children === 1 ) {
@@ -691,7 +703,8 @@
                                "(doc['_uid'].value.hashCode() & 
Integer.MAX_VALUE) % $children == $childNumber" );
                }
                $pageProperties = new MappingConfigBuilder(
-                       $this->prefixSearchStartsWithAny, $this->phraseUseText 
);
+                       $this->prefixSearchStartsWithAny, $this->phraseUseText,
+                       $wgCirrusSearchOptimizeIndexForExperimentalHighlighter 
);
                $pageProperties = $pageProperties->buildConfig();
                $pageProperties = $pageProperties[ 'properties' ];
                try {

-- 
To view, visit https://gerrit.wikimedia.org/r/123885
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie546c1b50e6394b8f100766d3fb7ee1b5a0aaf8e
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to