Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/79087


Change subject: Boost perfect phrase matches.
......................................................................

Boost perfect phrase matches.

If the user hasn't enclosed anything in quotes, we assume that they
would prefer a perfect phrase match if one exists.  This patch rescores
the top results (a configurable window) with a copy of the original
query rewritten as a phrase query.

Change-Id: I05f7dcf5fac1223336b3191d18dc084b4bad7232
---
M CirrusSearch.body.php
M CirrusSearch.php
2 files changed, 36 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/87/79087/1

diff --git a/CirrusSearch.body.php b/CirrusSearch.body.php
index 22c88aa..1fab387 100644
--- a/CirrusSearch.body.php
+++ b/CirrusSearch.body.php
@@ -146,7 +146,9 @@
 
        public function searchText( $term ) {
                wfDebugLog( 'CirrusSearch', "Searching:  $term" );
-               global $wgCirrusSearchPhraseSuggestMaxErrors, 
$wgCirrusSearchPhraseSuggestConfidence;
+               global $wgCirrusSearchPhraseSuggestMaxErrors, 
$wgCirrusSearchPhraseSuggestConfidence,
+                       $wgCirrusSearchPhraseRescoreBoost, 
$wgCirrusSearchPhraseRescoreWindowSize,
+                       $wgCirrusSearchPhraseSlop;
                
                $originalTerm = $term;
 
@@ -218,7 +220,8 @@
 
                // Actual text query
                if ( trim( $term ) !== '' || $extraQueryStrings ) {
-                       $queryStringQueryString = trim( implode( ' ', 
$extraQueryStrings ) . ' ' . CirrusSearch::fixupQueryString( $term ) );
+                       $fixedTerm = CirrusSearch::fixupQueryString( $term );
+                       $queryStringQueryString = trim( implode( ' ', 
$extraQueryStrings ) . ' ' . $fixedTerm );
                        $queryStringQuery = new \Elastica\Query\QueryString( 
$queryStringQueryString );
                        $fields = array( 'title^20.0', 'text^3.0' );
                        if ( $this->showRedirects ) {
@@ -226,9 +229,22 @@
                        }
                        $queryStringQuery->setFields( $fields );
                        $queryStringQuery->setAutoGeneratePhraseQueries( true );
-                       $queryStringQuery->setPhraseSlop( 3 );
-                       // TODO phrase match boosts?
+                       $queryStringQuery->setPhraseSlop( 
$wgCirrusSearchPhraseSlop );
                        $query->setQuery( $queryStringQuery );
+                       // If there aren't any phrases in the query already 
then boost results that match the whole query as a phrase
+                       if ( $wgCirrusSearchPhraseRescoreBoost > 1.0 && strpos( 
$queryStringQueryString, '"' ) === false ) {
+                               $rescore = array(
+                                       'window_size' => 
$wgCirrusSearchPhraseRescoreWindowSize,
+                                       'query' => array(
+                                               'rescore_query' => 
$queryStringQuery->toArray(),
+                                               'query_weight' => 1.0,
+                                               'rescore_query_weight' => 
$wgCirrusSearchPhraseRescoreBoost,
+                                       )
+                               );
+                               // Replace the original query string with a 
quoted copy
+                               $rescore[ 'query' ][ 'rescore_query' ][ 
'query_string' ][ 'query' ] = '"' . $fixedTerm . '"';
+                               $query->setParam( 'rescore', $rescore );
+                       }
                        $query->setParam( 'suggest', array(
                                'text' => $term,
                                CirrusSearch::PHRASE_TITLE => array(
@@ -260,7 +276,7 @@
                                                ),
                                        )
                                )
-                       ));
+                       ) );
                }
 
                // Perform the search
@@ -355,6 +371,8 @@
                                :|              (?# no specifying your own 
fields)
                                \\\
                        )/x', '\\\$1', $string );
+               // If the string doesn't have balanced quotes then add a quote 
on the end so Elasticsearch
+               // can parse it.
                if ( !preg_match( '/^(
                                [^"]|                   (?# non quoted terms)
                                "([^"]|\\.)*"   (?# quoted terms)
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 60004f2..101d0e1 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -45,6 +45,12 @@
 // Number of replicas per shard for each index
 $wgCirrusSearchContentReplicaCount = array( 'content' => 1, 'general' => 1 );
 
+// How many words that were not searched for may appear inside a phrase before
+// it stops matching.  For example, searching for "like yellow candy" with a
+// phrase slop of 0 won't match "like the yellow candy", but a slop of 1 will.
+// The default is fairly liberal.
+$wgCirrusSearchPhraseSlop = 3;
+
 // Maximum number of terms that we ask phrase suggest to correct.
 // See max_errors on 
http://www.elasticsearch.org/guide/reference/api/search/suggest/
 $wgCirrusSearchPhraseSuggestMaxErrors = 5;
@@ -56,7 +62,14 @@
 // Maximum number of redirects per target page to index.  
 $wgCirrusSearchIndexedRedirects = 1024;
 
+// If the search doesn't include any phrases (delimited by quotes) then we also
+// try wrapping the whole query in quotes, because that can sometimes turn up
+// better results.  This is the boost given to such phrase matches.  Set it to
+// a value less than or equal to 1.0 to turn this feature off.
+$wgCirrusSearchPhraseRescoreBoost = 10.0;
 
+// Number of top results rescored with the automatic phrase match when that feature is enabled.
+$wgCirrusSearchPhraseRescoreWindowSize = 1024;
 
 $dir = __DIR__ . '/';
 $elasticaDir = $dir . 'Elastica/lib/Elastica/';

-- 
To view, visit https://gerrit.wikimedia.org/r/79087
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I05f7dcf5fac1223336b3191d18dc084b4bad7232
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to