Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/79087
Change subject: Boost perfect phrase matches.
......................................................................
Boost perfect phrase matches.
If the user hasn't enclosed anything in quotes then we assume that they
would like a perfect phrase match if one exists. This patch rescores
the top bunch of results with a copy of the original query as a phrase
query.
Change-Id: I05f7dcf5fac1223336b3191d18dc084b4bad7232
---
M CirrusSearch.body.php
M CirrusSearch.php
2 files changed, 36 insertions(+), 5 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/87/79087/1
diff --git a/CirrusSearch.body.php b/CirrusSearch.body.php
index 22c88aa..1fab387 100644
--- a/CirrusSearch.body.php
+++ b/CirrusSearch.body.php
@@ -146,7 +146,9 @@
public function searchText( $term ) {
wfDebugLog( 'CirrusSearch', "Searching: $term" );
- global $wgCirrusSearchPhraseSuggestMaxErrors,
$wgCirrusSearchPhraseSuggestConfidence;
+ global $wgCirrusSearchPhraseSuggestMaxErrors,
$wgCirrusSearchPhraseSuggestConfidence,
+ $wgCirrusSearchPhraseRescoreBoost,
$wgCirrusSearchPhraseRescoreWindowSize,
+ $wgCirrusSearchPhraseSlop;
$originalTerm = $term;
@@ -218,7 +220,8 @@
// Actual text query
if ( trim( $term ) !== '' || $extraQueryStrings ) {
- $queryStringQueryString = trim( implode( ' ',
$extraQueryStrings ) . ' ' . CirrusSearch::fixupQueryString( $term ) );
+ $fixedTerm = CirrusSearch::fixupQueryString( $term );
+ $queryStringQueryString = trim( implode( ' ',
$extraQueryStrings ) . ' ' . $fixedTerm );
$queryStringQuery = new \Elastica\Query\QueryString(
$queryStringQueryString );
$fields = array( 'title^20.0', 'text^3.0' );
if ( $this->showRedirects ) {
@@ -226,9 +229,22 @@
}
$queryStringQuery->setFields( $fields );
$queryStringQuery->setAutoGeneratePhraseQueries( true );
- $queryStringQuery->setPhraseSlop( 3 );
- // TODO phrase match boosts?
+ $queryStringQuery->setPhraseSlop(
$wgCirrusSearchPhraseSlop );
$query->setQuery( $queryStringQuery );
+ // If there aren't any phrases in the query already
then boost documents that match the whole term as a phrase
+ if ( $wgCirrusSearchPhraseRescoreBoost > 1.0 && strpos(
$queryStringQueryString, '"' ) === false ) {
+ $rescore = array(
+ 'window_size' =>
$wgCirrusSearchPhraseRescoreWindowSize,
+ 'query' => array(
+ 'rescore_query' =>
$queryStringQuery->toArray(),
+ 'query_weight' => 1.0,
+ 'rescore_query_weight' =>
$wgCirrusSearchPhraseRescoreBoost,
+ )
+ );
+ // Replace the original query string with a
quoted copy
+ $rescore[ 'query' ][ 'rescore_query' ][
'query_string' ][ 'query' ] = '"' . $fixedTerm . '"';
+ $query->setParam( 'rescore', $rescore );
+ }
$query->setParam( 'suggest', array(
'text' => $term,
CirrusSearch::PHRASE_TITLE => array(
@@ -260,7 +276,7 @@
),
)
)
- ));
+ ) );
}
// Perform the search
@@ -355,6 +371,8 @@
:| (?# no specifying your own
fields)
\\\
)/x', '\\\$1', $string );
+ // If the string doesn't have balanced quotes then add a quote
on the end so Elasticsearch
+ // can parse it.
if ( !preg_match( '/^(
[^"]| (?# non quoted terms)
"([^"]|\\.)*" (?# quoted terms)
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 60004f2..101d0e1 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -45,6 +45,12 @@
// Number of replicas per shard for each index
$wgCirrusSearchContentReplicaCount = array( 'content' => 1, 'general' => 1 );
+// When searching for a phrase, how many words that were not searched for may
+// appear within the phrase before it no longer matches. For example, a search
+// for "like yellow candy" with a phraseSlop of 0 won't match
+// "like the yellow candy", but a phraseSlop of 1 will. We're pretty liberal
+// by default.
+$wgCirrusSearchPhraseSlop = 3;
+
// Maximum number of terms that we ask phrase suggest to correct.
// See max_errors on
http://www.elasticsearch.org/guide/reference/api/search/suggest/
$wgCirrusSearchPhraseSuggestMaxErrors = 5;
@@ -56,7 +62,14 @@
// Maximum number of redirects per target page to index.
$wgCirrusSearchIndexedRedirects = 1024;
+// If the search doesn't include any phrases (delimited by quotes) then we try
wrapping
+// the whole thing in quotes because sometimes that can turn up better
results. This is
+// the boost that we give such matches. Set this less than or equal to 1.0 to
turn off
+// this feature.
+$wgCirrusSearchPhraseRescoreBoost = 10.0;
+// Number of documents for which automatic phrase matches are performed if it
is enabled.
+$wgCirrusSearchPhraseRescoreWindowSize = 1024;
$dir = __DIR__ . '/';
$elasticaDir = $dir . 'Elastica/lib/Elastica/';
--
To view, visit https://gerrit.wikimedia.org/r/79087
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I05f7dcf5fac1223336b3191d18dc084b4bad7232
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits