jenkins-bot has submitted this change and it was merged. Change subject: Add support for ngram accelerated regexes ......................................................................
Add support for ngram accelerated regexes Its disabled by default because it requires the wikimedia/extra elasticsearch plugin and casual installers won't have it. It must be enabled in two stages: 1. build 2. use In the build stage when the index is rebuilt the source text is analyzed with an additional analyzer that is required for regex acceleration. It does not require the wikimedia/extra plugin to be installed on Elasticsearch. In the use stage the newly analyzed field is used to accelerate regular expressions. If the field isn't yet built all regular expressions will return no return no results. If the wikimedia/extra plugin isn't installed in Elasticsearch this will cause all regular expression searches to fail. Change-Id: If9fcb6a03c23e8abaf59b5ddfadab051263dac33 DEPLOYMENT: Lots of notes. Read commit message. --- M CirrusSearch.php M includes/ElasticsearchIntermediary.php A includes/Extra/Filter/SourceRegex.php M includes/Maintenance/AnalysisConfigBuilder.php M includes/Maintenance/MappingConfigBuilder.php M includes/Searcher.php M tests/browser/features/insource.feature M tests/jenkins/Jenkins.php 8 files changed, 225 insertions(+), 27 deletions(-) Approvals: Chad: Looks good to me, approved jenkins-bot: Verified diff --git a/CirrusSearch.php b/CirrusSearch.php index f30e39b..8973c91 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -81,6 +81,21 @@ // and off - it has to stay on. $wgCirrusSearchOptimizeIndexForExperimentalHighlighter = false; +// Should CirrusSearch try to use the wikimedia/extra plugin? An empty array +// means don't use it at all. +$wgCirrusSearchWikimediaExtraPlugin = array(); +// Here is an example to enable faster regex matching: +// $wgCirrusSearchWikimediaExtraPlugin = array( +// 'regex' => array( 'build', 'use', 'max_inspect' => 10000 ), +// ); +// The 'build' value instructs Cirrus to build the index required to speed up +// regex queries. The 'use' value instructs Cirrus to use it to power regular +// expression queries. If 'use' is added before the index is rebuilt with +// 'build' in the array then regex will fail to find anything. The value of +// the 'max_inspect' key is the maximum number of pages to recheck the regex +// against. It defaults to 10000 which seems like a reasonable compromize +// to keep regexes fast while still producing good results. + // By default, Cirrus will organize pages into one of two indexes (general or // content) based on whether a page is in a content namespace. This should // suffice for most wikis. This setting allows individual namespaces to be @@ -502,6 +517,7 @@ $includes = __DIR__ . "/includes/"; $apiDir = $includes . 'Api/'; $buildDocument = $includes . 'BuildDocument/'; +$extraFilterDir = $includes . 'Extra/Filter/'; $jobsDir = $includes . 'Job/'; $maintenanceDir = $includes . 'Maintenance/'; $sanity = $includes . 'Sanity/'; @@ -523,6 +539,7 @@ $wgAutoloadClasses['CirrusSearch\Connection'] = $includes . 'Connection.php'; $wgAutoloadClasses['CirrusSearch\Dump'] = $includes . 'Dump.php'; $wgAutoloadClasses['CirrusSearch\ElasticsearchIntermediary'] = $includes . 'ElasticsearchIntermediary.php'; +$wgAutoloadClasses['CirrusSearch\Extra\Filter\SourceRegex'] = $extraFilterDir . 'SourceRegex.php'; $wgAutoloadClasses['CirrusSearch\ForceSearchIndex'] = __DIR__ . '/maintenance/forceSearchIndex.php'; $wgAutoloadClasses['CirrusSearch\Hooks'] = $includes . 'Hooks.php'; $wgAutoloadClasses['CirrusSearch\InterwikiSearcher'] = $includes . 'InterwikiSearcher.php'; diff --git a/includes/ElasticsearchIntermediary.php b/includes/ElasticsearchIntermediary.php index c9f1b3d..b08b51e 100644 --- a/includes/ElasticsearchIntermediary.php +++ b/includes/ElasticsearchIntermediary.php @@ -196,6 +196,8 @@ // what else would have automatons and illegal argument exceptions. Just looking // for the exception won't suffice because other weird things could cause it. $seemsToUseRegexes = strpos( $message, 'import org.apache.lucene.util.automaton.*' ) !== false; + $usesExtraRegex = strpos( $message, 'org.wikimedia.search.extra.regex.SourceRegexFilter' ) !== false; + $seemsToUseRegexes |= $usesExtraRegex; $marker = 'IllegalArgumentException['; $markerLocation = strpos( $message, $marker ); if ( $seemsToUseRegexes && $markerLocation !== false ) { @@ -207,8 +209,14 @@ $matches = array(); if ( preg_match( '/(.+) at position ([0-9]+)/', $syntaxError, $matches ) ) { $errorMessage = $matches[ 1 ]; - // The 3 below offsets the .*( in front of the user pattern to make it unanchored. - $position = $matches[ 2 ] - 3; + $position = $matches[ 2 ]; + if ( !$usesExtraRegex ) { + // The 3 below offsets the .*( in front of the user pattern + // to make it unanchored. + $position -= 3; + } + } else if ( $syntaxError === 'unexpected end-of-string' ) { + $errorMessage = 'regex too short to be correct'; } $status = Status::newFatal( 'cirrussearch-regex-syntax-error', $errorMessage, $position ); return array( $status, 'Regex syntax error: ' . $syntaxError ); diff --git a/includes/Extra/Filter/SourceRegex.php b/includes/Extra/Filter/SourceRegex.php new file mode 100644 index 0000000..e0344be --- /dev/null +++ b/includes/Extra/Filter/SourceRegex.php @@ -0,0 +1,133 @@ +<?php + +namespace CirrusSearch\Extra\Filter; + +use Elastica\Filter\AbstractFilter; + +/** + * Source regex filter for trigram accelerated regex matching. + * + * @link https://github.com/wikimedia/search-extra/blob/master/docs/source_regex.md + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * http://www.gnu.org/copyleft/gpl.html + */ + +class SourceRegex extends AbstractFilter { + /** + * @param null|string $regex optional regex to match against field + * @param null|string $field optional field who's source to check with the regex + * @param null|string $ngramField optional field that is indexed with ngrams to + * accelerate regex matching + */ + public function __construct( $regex = null, $field = null, $ngramField = null ) { + if ( $regex ) { + $this->setRegex( $regex ); + } + if ( $field ) { + $this->setField( $field ); + } + if ( $ngramField ) { + $this->setNgramField( $ngramField ); + } + } + + /** + * @param string $regex regex to match against field + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setRegex( $regex ) { + return $this->setParam( 'regex', $regex ); + } + + /** + * @param string $field field who's source to check with the regex + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setField( $field ) { + return $this->setParam( 'field', $field ); + } + + /** + * @param string $ngramField field that is indexed with ngrams to + * accelerate regex matching + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setNGramField( $ngramField ) { + return $this->setParam( 'ngram_field', $ngramField ); + } + + /** + * @param int $gramSize size of the ngrams extracted for acccelerating + * the regex. Defaults to 3 if not set. That gram size must have been + * produced by analyzing the ngramField. + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setGramSize( $gramSize ) { + return $this->setParam( 'gram_size', $gramSize ); + } + + /** + * @param int $maxExpand maximum range before outgoing automaton arcs are + * ignored. Roughly corresponds to the maximum number of characters in a + * character class ([abcd]) before it is treated as . for purposes of + * acceleration. Defaults to 4. + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setMaxExpand( $maxExpand ) { + return $this->setParam( 'max_expand', $maxExpand ); + } + + /** + * @param int $maxStatesTraced maximum number of automaton states that can + * be traced before the algorithm gives up and assumes the regex is too + * complex and throws an error back to the user. Defaults to 10000 which + * handily covers all regexes I cared to test. + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setMaxStatesTraced( $maxStatesTraced ) { + return $this->setParam( 'max_states_traced', $maxStatesTraced ); + } + + /** + * @param int $maxInspect maximum number of source field to run the regex + * against before giving up and just declaring all remaining fields not + * matching by fiat. Defaults to MAX_INT. Set this to 10000 or something + * nice and low to prevent regular expressions that cannot be sped up from + * taking up too many resources. + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setMaxInspect( $maxInspect ) { + return $this->setParam( 'max_inspect', $maxInspect ); + } + + /** + * @param bool $caseSensitive is the regex case insensitive? Defaults to + * case insensitive if not set. + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setCaseSensitive( $caseSensitive ) { + return $this->setParam( 'case_sensitive', $caseSensitive ); + } + + /** + * @param bool $locale locale used for case conversions. Its imporant that + * this matches the locale used for lowercasing in the ngram index. + * @return \CirrusSearch\Extra\Filter\SourceRegex this for chaining + */ + public function setLocale( $locale ) { + return $this->setParam( 'locale', $locale ); + } +} diff --git a/includes/Maintenance/AnalysisConfigBuilder.php b/includes/Maintenance/AnalysisConfigBuilder.php index c28d149..0b8621e 100644 --- a/includes/Maintenance/AnalysisConfigBuilder.php +++ b/includes/Maintenance/AnalysisConfigBuilder.php @@ -30,7 +30,7 @@ * and change the minor version when it changes but isn't * incompatible */ - const VERSION = '0.8'; + const VERSION = '0.9'; /** * Language code we're building analysis for @@ -128,6 +128,11 @@ 'tokenizer' => 'no_splitting', 'filter' => array( 'lowercase' ), ), + 'trigram' => array( + 'type' => 'custom', + 'tokenizer' => 'trigram', + 'filter' => array( 'lowercase' ), + ), ), 'filter' => array( 'suggest_shingle' => array( @@ -164,6 +169,11 @@ 'no_splitting' => array( // Just grab the whole term. 'type' => 'keyword', ), + 'trigram' => array( + 'type' => 'nGram', + 'min_gram' => 3, + 'max_gram' => 3, + ), ), 'char_filter' => array( // Flattens things that are space like to spaces in the near_match style analyzers diff --git a/includes/Maintenance/MappingConfigBuilder.php b/includes/Maintenance/MappingConfigBuilder.php index 747dbbf..067632a 100644 --- a/includes/Maintenance/MappingConfigBuilder.php +++ b/includes/Maintenance/MappingConfigBuilder.php @@ -44,7 +44,7 @@ * and change the minor version when it changes but isn't * incompatible */ - const VERSION = '1.6'; + const VERSION = '1.7'; /** * Whether to allow prefix searches to match on any word @@ -81,7 +81,8 @@ */ public function buildConfig() { global $wgCirrusSearchAllFields, - $wgCirrusSearchWeights; + $wgCirrusSearchWeights, + $wgCirrusSearchWikimediaExtraPlugin; $suggestExtra = array( 'analyzer' => 'suggest' ); // Note never to set something as type='object' here because that isn't returned by elasticsearch @@ -97,6 +98,14 @@ 'index_analyzer' => 'word_prefix', 'search_analyzer' => 'plain_search', 'index_options' => 'docs' + ); + } + $sourceExtraAnalyzers = array(); + if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) && + in_array( 'build', $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) ) { + $sourceExtraAnalyzers[] = array( + 'analyzer' => 'trigram', + 'index_options' => 'docs', ); } @@ -131,7 +140,9 @@ 'opening_text' => $this->buildStringField( MappingConfigBuilder::ENABLE_NORMS ), 'auxiliary_text' => $this->buildStringField( $textOptions ), 'file_text' => $this->buildStringField( $textOptions ), - 'source_text' => $this->buildStringField( MappingConfigBuilder::MINIMAL ), + 'source_text' => $this->buildStringField( MappingConfigBuilder::MINIMAL, + $sourceExtraAnalyzers + ), 'category' => $this->buildStringField( MappingConfigBuilder::MINIMAL, array( array( 'analyzer' => 'lowercase_keyword', diff --git a/includes/Searcher.php b/includes/Searcher.php index 837777f..2c773b5 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -3,6 +3,7 @@ namespace CirrusSearch; use Elastica; use \CirrusSearch; +use \CirrusSearch\Extra\Filter\SourceRegex; use \CirrusSearch\Search\Escaper; use \CirrusSearch\Search\Filters; use \CirrusSearch\Search\FullTextResultsType; @@ -423,7 +424,8 @@ $this->extractSpecialSyntaxFromTerm( '/(?<not>-)?insource:\/(?<pattern>(?:[^\\\\\/]|\\\\.)+)\/(?<insensitive>i)?/', function ( $matches ) use ( $searcher, &$filters, &$notFilters, &$searchContainedSyntax, &$searchType, &$highlightSource ) { - global $wgLanguageCode; + global $wgLanguageCode, + $wgCirrusSearchWikimediaExtraPlugin; $searchContainedSyntax = true; $searchType = 'regex'; @@ -439,10 +441,21 @@ 'insensitive' => $insensitive, ); } - // The setAllowMutate call is documented to speed up operations but be thread unsafe. You'd think - // that is ok because scripts are always executed in a single thread but it isn't ok. It causes - // all operations to unstable, so far as I can tell. - $script = <<<GROOVY + if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) && + in_array( 'build', $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ] ) ) { + $filter = new SourceRegex( $matches[ 'pattern' ], 'source_text', 'source_text.trigram' ); + if ( isset( $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ][ 'max_inspect'] ) ) { + $filter->setMaxInspect( $wgCirrusSearchWikimediaExtraPlugin[ 'regex' ][ 'max_inspect'] ); + } else { + $filter->setMaxInspect( 10000 ); + } + $filter->setCaseSensitive( !$insensitive ); + $filter->setLocale( $wgLanguageCode ); + $filterDestination[] = $filter; + } else { + // Without the extra plugin we need to use groovy to attempt the regex. + // Its less good but its something. + $script = <<<GROOVY import org.apache.lucene.util.automaton.*; sourceText = _source.get("source_text"); if (sourceText == null) { @@ -463,19 +476,20 @@ } GROOVY; - $filterDestination[] = new \Elastica\Filter\Script( new \Elastica\Script( - $script, - array( - 'pattern' => '.*(' . $matches[ 'pattern' ] . ').*', - 'insensitive' => $insensitive, - 'language' => $wgLanguageCode, - // These null here creates a slot in which the script will shove - // an automaton while executing. - 'automaton' => null, - 'locale' => null, - ), - 'groovy' - ) ); + $filterDestination[] = new \Elastica\Filter\Script( new \Elastica\Script( + $script, + array( + 'pattern' => '.*(' . $matches[ 'pattern' ] . ').*', + 'insensitive' => $insensitive, + 'language' => $wgLanguageCode, + // These null here creates a slot in which the script will shove + // an automaton while executing. + 'automaton' => null, + 'locale' => null, + ), + 'groovy' + ) ); + } } ); // Match filters that look like foobar:thing or foobar:"thing thing" diff --git a/tests/browser/features/insource.feature b/tests/browser/features/insource.feature index acdff2d..cadbd14 100644 --- a/tests/browser/features/insource.feature +++ b/tests/browser/features/insource.feature @@ -64,7 +64,7 @@ Then RegexSpaces is the first search result @regex - Scenario: insource:// can a url + Scenario: insource:// can find a url When I search for all:insource:/show_bug.cgi\?id=52908/ Then File:Savepage-greyed.png is the first search imageresult @@ -77,5 +77,5 @@ @regex Scenario: insource:// reports errors sanely - When I search for all:insource:/[/ - Then this error is reported: An error has occurred while searching: Regular expression syntax error at 4: expected ']' + When I search for all:insource:/[ / + Then this error is reported: An error has occurred while searching: Regular expression syntax error at 2: expected ']' diff --git a/tests/jenkins/Jenkins.php b/tests/jenkins/Jenkins.php index 64baef4..276a286 100644 --- a/tests/jenkins/Jenkins.php +++ b/tests/jenkins/Jenkins.php @@ -49,6 +49,11 @@ $wgSearchType = 'CirrusSearch'; $wgCirrusSearchUseExperimentalHighlighter = true; $wgCirrusSearchOptimizeIndexForExperimentalHighlighter = true; +// TODO enable this when the extra plugin is released +// $wgCirrusSearchWikimediaExtraPlugin = array( +// 'regex' => array( 'build', 'use' ), +// ); + $wgOggThumbLocation = '/usr/bin/oggThumb'; $wgGroupPermissions[ '*' ][ 'deleterevision' ] = true; $wgFileExtensions[] = 'pdf'; -- To view, visit https://gerrit.wikimedia.org/r/164551 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: If9fcb6a03c23e8abaf59b5ddfadab051263dac33 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: Chad <ch...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits