Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/110974
Change subject: Introduce a rescore window for script scores ...................................................................... Introduce a rescore window for script scores The idea here is that if your query finds many thousands of results then it isn't worth it to run expensive script scoring on all of them. Instead we run the script scoring on the top 8192 per shard. If your query is a "good" full text query then Lucene's default scoring algorithm will already be pretty good and put the results you are probably looking for in the first couple thousand anyway so the script score will pick it up. This is kind of difficult because Elasticsearch 0.90.X doesn't actually support multiple rescore windows. 1.X will. Instead we emulate it by just using one rescore if there is just one required, and by adding the script score to both the query and the rescore if there is already a rescore when we go to add the script score. We make some effort to not have a phrase rescore unless we're pretty sure it'll be important so we end up falling into the single rescore case quite frequently. Frequently enough that we'd catch all the queries that we currently see in the CirrusSearch-slow log file. This change reduces their run times by a factor of 60 which I imagine will reduce load as well. Because prefix searches rely on being able to sort large numbers of results quickly this moves most of them to a single sort. Those that have default templates boosted still have the 8192 window per shard which _should_ mostly be ok. DEPLOYMENT: If this is deployed before incoming_redirect_links has been fully migrated to incoming_links then prefix sort order will change until the migration is complete. It won't break, but it'll change. 
Change-Id: I6c4c197ca8994f4005ba2b2b536b9099a8c6236e --- M CirrusSearch.php M includes/Searcher.php 2 files changed, 91 insertions(+), 38 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/74/110974/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index 789a65a..ce2ea79 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -121,9 +121,16 @@ // this feature. $wgCirrusSearchPhraseRescoreBoost = 10.0; -// Number of documents for which automatic phrase matches are performed if it is enabled. +// Number of documents per shard for which automatic phrase matches are performed if it +// is enabled. Note that if both function and phrase rescoring is required then the +// phrase rescore window is used. TODO update this once Elasticsearch supports multiple +// rescore windows. $wgCirrusSearchPhraseRescoreWindowSize = 1024; +// Number of documents per shard for which function scoring is applied. This is stuff +// like incoming links boost, prefer-recent decay, and boost-templates. +$wgCirrusSearchFunctionRescoreWindowSize = 8192; + // If true CirrusSearch asks Elasticsearch to perform searches using a mode that should // product more accurate results at the cost of performance. See this for more info: // http://www.elasticsearch.org/blog/understanding-query-then-fetch-vs-dfs-query-then-fetch/ diff --git a/includes/Searcher.php b/includes/Searcher.php index b4a7f74..0a0dc30 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -110,11 +110,9 @@ */ private $preferRecentHalfLife = 0; /** - * @var string should the query results boost pages with more incoming links. Default to empty stream meaning - * don't boost. Other values are 'linear' meaning boost score linearly with number of incoming links or 'log' - * meaning boost score by log10(incoming_links + 2). + * @var boolean should the query results boost pages with more incoming links. Default to false. 
*/ - private $boostLinks = ''; + private $boostLinks = false; /** * @var array template name to boost multiplier for having a template. Defaults to none but initialized by * queries that use it to self::getDefaultBoostTemplates() if they need it. That is too expensive to do by @@ -180,7 +178,6 @@ $match = new \Elastica\Query\Match(); $match->setField( 'title.near_match', $search ); $this->filters[] = new \Elastica\Filter\Query( $match ); - $this->boostLinks = ''; // No boost $result = $this->search( 'near_match', $search ); wfProfileOut( __METHOD__ ); @@ -209,8 +206,12 @@ } else { $this->filters[] = $this->buildPrefixFilter( $search ); } - $this->boostLinks = 'linear'; $this->boostTemplates = self::getDefaultBoostTemplates(); + // If there aren't any boost templates then we can use a sort for ordering + // rather than a boost. + if ( count( $this->boostTemplates ) === 0 ) { + $this->sort = 'incoming_links_desc'; + } $result = $this->search( 'prefix', $search ); wfProfileOut( __METHOD__ ); @@ -245,6 +246,7 @@ $originalTerm = $term; $this->showRedirects = $showRedirects; $this->term = trim( $term ); + $this->boostLinks = true; // Handle title prefix notation wfProfileIn( __METHOD__ . '-prefix-filter' ); $prefixPos = strpos( $this->term, 'prefix:' ); @@ -353,7 +355,6 @@ $this->filters = $filters; $this->notFilters = $notFilters; $this->boostTemplates = $boostTemplates; - $this->boostLinks = 'log'; wfProfileOut( __METHOD__ . '-other-filters' ); wfProfileIn( __METHOD__ . 
'-find-phrase-queries' ); // Match quoted phrases including those containing escaped quotes @@ -404,8 +405,16 @@ $this->buildFullTextSearchFields( $wgCirrusSearchStemmedWeight, '' ) ); $this->query = $this->buildSearchTextQuery( $fields, $queryStringQueryString ); - // Only do a phrase match rescore if the query doesn't include any phrases - if ( $wgCirrusSearchPhraseRescoreBoost > 1.0 && strpos( $queryStringQueryString, '"' ) === false ) { + // Only do a phrase match rescore if the query doesn't include any quotes and has a space + // TODO allow phrases without spaces to support things like words with dashes and languages + // that don't use spaces. The space check is really only important because it catches an + // common class of slow queries: <<-foo>> which it only needs to catch because Elasticsearch + // only supports a single rescore. If it supported multiple rescores it would be worth + // trying the phrase rescore because it wouldn't prevent us from having the script score in + // a rescore. + if ( $wgCirrusSearchPhraseRescoreBoost > 1.0 && + strpos( $queryStringQueryString, '"' ) === false && + strpos( $queryStringQueryString, ' ' ) !== false ) { $this->rescore = array( 'window_size' => $wgCirrusSearchPhraseRescoreWindowSize, 'query' => array( @@ -646,7 +655,11 @@ $this->query = new \Elastica\Query\Filtered( $this->query, $filter ); } - $query->setQuery( self::boostQuery( $this->query ) ); + // Call installBoosts right after we're done munging the query to include filters + // so any rescores installBoosts adds to the query are done against filtered results. + $this->installBoosts(); + + $query->setQuery( $this->query ); $highlight = $this->resultsType->getHighlightingConfiguration(); if ( $highlight ) { @@ -670,11 +683,12 @@ $query->setSize( $this->limit ); } if ( $this->rescore ) { - // Wrap the rescore query in the boostQuery just as we wrap the regular query. 
- $this->rescore[ 'query' ][ 'rescore_query' ] = - self::boostQuery( $this->rescore[ 'query' ][ 'rescore_query' ] )->toArray(); + // rescore_query has to be in array form before we send it to Elasticsearch but it is way easier to work + // with if we leave it in query for until now + $this->rescore[ 'query' ][ 'rescore_query' ] = $this->rescore[ 'query' ][ 'rescore_query' ]->toArray(); $query->setParam( 'rescore', $this->rescore ); } + $query->addParam( 'stats', $type ); switch ( $this->sort ) { case 'relevance': @@ -684,6 +698,18 @@ break; case 'title_desc': $query->setSort( array( 'title.keyword' => 'desc' ) ); + break; + case 'incoming_links_asc': + $query->setSort( array( 'incoming_links' => array( + 'order' => 'asc', + 'missing' => '_first', + ) ) ); + break; + case 'incoming_links_desc': + $query->setSort( array( 'incoming_links' => array( + 'order' => 'desc', + 'missing' => '_last', + ) ) ); break; default: wfLogWarning( "Invalid sort type: $this->sort" ); @@ -970,31 +996,24 @@ } /** - * Wrap query in a CustomScore query if its score need to be modified. - * @param $query Elastica\Query query to boost. - * @return query that will run $query and boost results based on links + * If there is any boosting to be done munge the the current query to get it right. */ - private function boostQuery( $query ) { - $fuctionScore = new \Elastica\Query\FunctionScore(); - $fuctionScore->setQuery( $query ); + private function installBoosts() { + global $wgCirrusSearchFunctionRescoreWindowSize; + + if ( $this->sort !== 'relevance' ) { + // Boosts are irrelevant if you aren't sorting by, well, relevance + return; + } + + $functionScore = new \Elastica\Query\FunctionScore(); $useFunctionScore = false; // Customize score by boosting based on incoming links count if ( $this->boostLinks ) { $incomingLinks = "(doc['incoming_links'].empty ? 
0 : doc['incoming_links'].value)"; - // TODO remove redirect links once they are empty and switch prefix search to some kind of sort - $incomingRedirectLinks = "(doc['incoming_redirect_links'].empty ? 0 : doc['incoming_redirect_links'].value)"; - $scoreBoostMvel = "$incomingLinks + $incomingRedirectLinks"; - switch ( $this->boostLinks ) { - case 'linear': - break; // scoreBoostMvel already correct - case 'log': - $scoreBoostMvel = "log10($scoreBoostMvel + 2)"; - break; - default: - wfLogWarning( "Invalid links boost type: $this->boostLinks" ); - } - $fuctionScore->addScriptScoreFunction( new \Elastica\Script( $scoreBoostMvel ) ); + $scoreBoostMvel = "log10($incomingLinks + 2)"; + $functionScore->addScriptScoreFunction( new \Elastica\Script( $scoreBoostMvel ) ); $useFunctionScore = true; } @@ -1011,7 +1030,7 @@ // p(e^ct - 1) + 1 which is easier to calculate than, but reduces to 1 - p + pe^ct // Which breaks the score into an unscaled portion (1 - p) and a scaled portion (p) $lastUpdateDecayMvel = "$exponentialDecayMvel + 1"; - $fuctionScore->addScriptScoreFunction( new \Elastica\Script( $lastUpdateDecayMvel ) ); + $functionScore->addScriptScoreFunction( new \Elastica\Script( $lastUpdateDecayMvel ) ); $useFunctionScore = true; } @@ -1020,16 +1039,43 @@ $match = new \Elastica\Query\Match(); $match->setFieldQuery( 'template', $name ); // TODO replace with a boost_factor function when that is supported by elastica - $fuctionScore->addScriptScoreFunction( new \Elastica\Script( 'boost', array( 'boost' => $boost ) ), + $functionScore->addScriptScoreFunction( new \Elastica\Script( 'boost', array( 'boost' => $boost ) ), new \Elastica\Filter\Query( $match ) ); } $useFunctionScore = true; } - if ( $useFunctionScore ) { - return $fuctionScore; + if ( !$useFunctionScore ) { + // Nothing to do + return; } - return $query; + + // Since Elasticsearch doesn't support multiple rescores we have to pick a strategy here.... 
+ // TODO just use multiple rescores when Elasticsearch supports it (1.x) + + // If there isn't already a rescore then we can just add the boosting as a multiply rescore + if ( !$this->rescore ) { + $this->rescore = array( + 'window_size' => $wgCirrusSearchFunctionRescoreWindowSize, + 'query' => array( + 'rescore_query' => $functionScore, + 'query_weight' => 1.0, + 'rescore_query_weight' => 1.0, + 'score_mode' => 'multiply', + ) + ); + return; + } + + // Since there is already a rescore we have to wrap _both_ the rescore and the query in our + // function score query. Nothing else really spits out the right numbers. The problem + // with this is that the function score isn't just in the rescore which means that it can + // be slow if the main query finds lots of results. + $functionScore->setQuery( $this->query ); + $this->query = new \Elastica\Query\Simple( $functionScore->toArray() ); + + $functionScore->setQuery( $this->rescore[ 'query' ][ 'rescore_query' ] ); + $this->rescore[ 'query' ][ 'rescore_query' ] = $functionScore; } private static function getDefaultBoostTemplates() { -- To view, visit https://gerrit.wikimedia.org/r/110974 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I6c4c197ca8994f4005ba2b2b536b9099a8c6236e Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: Manybubbles <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
