EBernhardson has uploaded a new change for review.
https://gerrit.wikimedia.org/r/287125
Change subject: Drop support for common terms query
......................................................................
Drop support for common terms query
This was an experiment that didn't end up making a useful difference in
search. The feature is unused so lets clean it up.
Bug: T133749
Change-Id: I10acc647008dc28b2fc33279180b0f6eb462fb4c
---
M CirrusSearch.php
M autoload.php
M includes/Hooks.php
M includes/Search/SearchContext.php
M includes/Search/SearchTextQueryBuilders.php
D profiles/CommonTermsQueryProfiles.php
M tests/browser/features/step_definitions/search_steps.rb
7 files changed, 0 insertions(+), 416 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/25/287125/1
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 778edce..36bea41 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -23,7 +23,6 @@
require_once __DIR__ . "/profiles/SuggestProfiles.php";
require_once __DIR__ . "/profiles/PhraseSuggesterProfiles.php";
-require_once __DIR__ . "/profiles/CommonTermsQueryProfiles.php";
require_once __DIR__ . "/profiles/RescoreProfiles.php";
require_once __DIR__ . "/profiles/SimilarityProfiles.php";
@@ -836,21 +835,6 @@
* duplicating $wgCirrusSearchInterwikiSources. This needs to be fixed.
*/
$wgCirrusSearchWikiToNameMap = array();
-
-/**
- * Enable common terms query.
- * This query is enabled only if the query string does not contain any special
- * syntax and the number of terms is greater than one defined in the profile
- * NOTE: CommonTermsQuery can be more restrictive in some cases if the all
- * field is disabled (see $wgCirrusSearchAllFields).
- */
-$wgCirrusSearchUseCommonTermsQuery = false;
-
-/**
- * Set the Common terms query profile to default.
- * see profiles/CommonTermsQueryProfiles.php for more info.
- */
-$wgCirrusSearchCommonTermsQueryProfile =
$wgCirrusSearchCommonTermsQueryProfiles['default'];
/**
* If set to non-empty string, interwiki results will have ?wprov=XYZ
parameter added.
diff --git a/autoload.php b/autoload.php
index c95127a..c81556c 100644
--- a/autoload.php
+++ b/autoload.php
@@ -117,7 +117,6 @@
'CirrusSearch\\Search\\ScriptScoreFunctionScoreBuilder' => __DIR__ .
'/includes/Search/RescoreBuilders.php',
'CirrusSearch\\Search\\SearchContext' => __DIR__ .
'/includes/Search/SearchContext.php',
'CirrusSearch\\Search\\SearchTextBaseQueryBuilder' => __DIR__ .
'/includes/Search/SearchTextQueryBuilders.php',
- 'CirrusSearch\\Search\\SearchTextCommonTermsQueryBuilder' => __DIR__ .
'/includes/Search/SearchTextQueryBuilders.php',
'CirrusSearch\\Search\\SearchTextQueryBuilder' => __DIR__ .
'/includes/Search/SearchTextQueryBuilders.php',
'CirrusSearch\\Search\\SearchTextQueryBuilderFactory' => __DIR__ .
'/includes/Search/SearchTextQueryBuilders.php',
'CirrusSearch\\Search\\SearchTextQueryStringBuilder' => __DIR__ .
'/includes/Search/SearchTextQueryBuilders.php',
diff --git a/includes/Hooks.php b/includes/Hooks.php
index e4c242d..ad4d806 100644
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@@ -119,7 +119,6 @@
self::overrideUseExtraPluginForRegex( $request );
self::overrideMoreLikeThisOptions( $request );
PhraseSuggesterProfiles::overrideOptions( $request );
- CommonTermsQueryProfiles::overrideOptions( $request );
RescoreProfiles::overrideOptions( $request );
self::overrideSecret(
$wgCirrusSearchLogElasticRequests, $wgCirrusSearchLogElasticRequestsSecret,
$request, 'cirrusLogElasticRequests', false );
self::overrideYesNo( $wgCirrusSearchEnableAltLanguage,
$request, 'cirrusAltLanguage' );
diff --git a/includes/Search/SearchContext.php
b/includes/Search/SearchContext.php
index 122c7a9..813d25d 100644
--- a/includes/Search/SearchContext.php
+++ b/includes/Search/SearchContext.php
@@ -117,20 +117,6 @@
}
/**
- * @return array the CommonTermsQuery profile
- */
- public function getCommonTermsQueryProfile() {
- return $this->config->getElement(
'CirrusSearchCommonTermsQueryProfile' );
- }
-
- /**
- * @return bool true if CommonTermsQuery is allowed
- */
- public function isUseCommonTermsQuery() {
- return (bool)
$this->config->get('CirrusSearchUseCommonTermsQuery' );
- }
-
- /**
* @return true if the query contains special syntax
*/
public function isSearchContainedSyntax() {
diff --git a/includes/Search/SearchTextQueryBuilders.php
b/includes/Search/SearchTextQueryBuilders.php
index 64d06fb..cad1e80 100644
--- a/includes/Search/SearchTextQueryBuilders.php
+++ b/includes/Search/SearchTextQueryBuilders.php
@@ -42,7 +42,6 @@
public function __construct( SearchContext $context ) {
$this->context = $context;
- $this->builders[] = new SearchTextCommonTermsQueryBuilder(
$context );
$this->builders[] = new SearchTextQueryStringBuilder( $context
);
}
@@ -197,278 +196,4 @@
return $query;
}
-}
-
-/**
- * This builder builds a query based on the common terms query. The builder
- * will build a commons term query for the plain fields. Another common terms
- * query for the stem fields will be built if specified in the profile. A
- * simple/multimatch match query will be built otherwise.
- *
- * The common terms query is well suited for the plain field because stopwords
- * are not filtered, finding a cutoff freq is "relatively" easy. Concerning
- * the commons term query and the stem field it's a bit trickier. It can help
- * to determine what words the user would like to see in the results e.g.
- * 'what is snail slime made of'.
- * While 'what', 'is' and 'of' will be filtered by the stopword filter 'made'
- * can be considered as a high freq term and 'snail slime' will be required.
- * Showing best results with 'snail' and 'slimes' even if 'made' is not
- * present.
- *
- * This is not necessarily the case for all queries: e.g. 'interesting facts
- * about kennedy assassination'. In this case the most important words are
- * certainly 'kennedy' and 'assassination'. But it appears that 'interesting'
- * has a lower docFreq than 'kennedy' on english Wikipedia, so if the cutoff is
- * not properly set 'kennedy' might be considered as high freq while
- * 'interesting' will be a low freq.
- */
-class SearchTextCommonTermsQueryBuilder extends SearchTextBaseQueryBuilder {
- /**
- * The builder used for rescore and highlight query.
- *
- * @var SearchTextBaseQueryBuilder
- */
- private $queryStringBuilder;
-
- /**
- * The profile used to configure the queries
- *
- * @var array
- */
- private $profile;
-
- public function __construct( SearchContext $config ) {
- parent::__construct( $config );
- $this->queryStringBuilder = new SearchTextQueryStringBuilder(
$config );
- $this->profile = $this->context->getCommonTermsQueryProfile();
- }
-
- /**
- * The query is accepted if the number of words in the query
- * is greater or equal than min_query_terms defined in the
- * CirrusSearchCommonTerm.
- *
- * @param string $queryString
- * @return bool
- */
- public function accept( $queryString ) {
- if ( !$this->context->isUseCommonTermsQuery() ) {
- return false;
- }
-
- // the Searcher class relies heavily on the QueryString syntax
and
- // can generate QueryString syntax (i.e wildcards)
- // This builder cannot understand such syntax.
- if ( $this->context->isSearchContainedSyntax() ) {
- // It's likely a query string syntax...
- return false;
- }
-
- // XXX: this will work only for languages where space is a word
separator
- // (note that query string has the same issue)
- // Ideal solution would be to move this code to java and reuse
our lucene
- // analysis chain.
- // This is complex because the number of tokens depends on the
field analyzer :
- // "what's the most dangerous snake"
- // is 5 tokens with the plain analyzer ("what's" remains
"what's")
- // but 6 tokens with some language specific analyzer ("what's"
is
- // "what" and "s" with the english analyzer)
- if ( str_word_count( $queryString ) <
$this->profile['min_query_terms'] ) {
- return false;
- }
-
- // @fixme throw in a complete hack which will inform javascript
that
- // this query was a common terms query. This should be removed
when the
- // test is over, or a more generic way to report on the query
type
- // should be invented.
- global $wgOut;
- $wgOut->addJsConfigVars( 'wgCirrusCommonTermsApplicable', true
);
-
- $request = RequestContext::getMain()->getRequest();
- if ( $request !== null && $request->getVal(
'cirrusCommonTermsQueryControlGroup' ) === 'yes' ) {
- return false;
- }
-
-
- return true;
- }
-
- /**
- * @param string[] $fields
- * @param string $queryString
- * @param int $phraseSlop
- * @return \Elastica\Query\AbstractQuery
- */
- public function buildMainQuery( array $fields, $queryString,
$phraseSlop ) {
- $plainFields = array();
- $stemFields = array();
-
- // Separate plain and stem fields first
- foreach( $fields as $f ) {
- list( $field, $boost ) = explode( '^', $f, 2 );
- $fieldInfo = array ( 'field' => $field, 'boost' =>
$boost );
- if ( Util::endsWith( $field, '.plain' ) ) {
- $plainFields[] = $fieldInfo;
- } else {
- $stemFields[] = $fieldInfo;
- }
- }
- $query = new \Elastica\Query\BoolQuery();
- $query->setMinimumNumberShouldMatch( 1 );
- // We always build a common terms query for the plain field
- $this->attachCommonTermsClause( $query, $plainFields,
$queryString, $this->profile );
-
- // We can use different types of query for the stem field.
- if ( count( $stemFields ) === 1 ) {
- $this->attachSingleFieldStemClause( $query,
$stemFields[0], $queryString );
- } else {
- $this->attachMultiFieldsStemClause( $query,
$stemFields, $queryString );
- }
- return $query;
- }
-
- /**
- * Attach the query for the stem field. It will build a set of common
- * terms query if use_common_terms is true for the stems clause or a
- * multi match (cross_fields) if false.
- *
- * @param \Elastica\Query\BoolQuery $query the boolean query to attach
the new
- * clause
- * @param array $stemFields of boost field
- * @param string $queryString the query
- */
- private function attachMultiFieldsStemClause( \Elastica\Query\BoolQuery
$query, array $stemFields, $queryString ) {
- if ( $this->profile['stems_clause']['use_common_terms'] ===
true ) {
- $bool = new \Elastica\Query\BoolQuery();
- $bool->setMinimumNumberShouldMatch( 1 );
- $this->attachCommonTermsClause( $bool, $stemFields,
$queryString,
- $this->profile['stems_clause'] );
- $query->addShould( $bool );
- } else {
- $query->addShould( $this->buildCrossFields(
$stemFields, $queryString,
-
$this->profile['stems_clause']['min_should_match'] ) );
- }
- }
-
- /**
- * Attach the query for the stem field. Will build a single common
- * terms query if use_common_terms is true of a simple match if false.
- *
- * @param \Elastica\Query\BoolQuery $query the boolean query to attach
the
- * new clause
- * @param array $boostedField the boosted field
- * @param string $queryString the query
- */
- private function attachSingleFieldStemClause( \Elastica\Query\BoolQuery
$query, array $boostedField, $queryString ) {
- if ( $this->profile['stems_clause']['use_common_terms'] ===
true ) {
- $query->addShould( $this->buildOneCommonTermsClause(
$boostedField, $queryString,
- $this->profile['stems_clause'] ) );
- } else {
- $query->addShould( $this->buildSimpleMatch(
$boostedField, $queryString,
-
$this->profile['stems_clause']['min_should_match'] ) );
- }
- }
-
- /**
- * Attach a common terms query to $parent with a should.
- * Note that if the all field is not used this can be more restrictive:
- * for the query word1 word2 both words would have to appear in the
- * same field. We cannot use similar techniques like cross_field of
- * QueryString with multiple fields which allows both words to be
- * in different fields.
- *
- * @param \Elastica\Query\BoolQuery $parent
- * @param array $boostedFields of boostedFields
- * @param string $queryString the query
- * @param array $profile the profile
- */
- private function attachCommonTermsClause( \Elastica\Query\BoolQuery
$parent, array $boostedFields, $queryString, $profile ) {
- foreach( $boostedFields as $boostedField ) {
- $parent->addShould( $this->buildOneCommonTermsClause(
$boostedField, $queryString, $profile ) );
- }
- }
-
- /**
- * Builds a common terms query clause.
- *
- * @param array $boostedField the boosted field
- * @param string $queryString the query
- * @param array $profile the profile used by the common terms query
- * @return \Elastica\Query\Common the common terms query.
- * @suppress PhanTypeMismatchArgument minimum_should_match can be
applied
- * for low and high frequency terms, but the type signature hasn't been
- * updated to match
- */
- private function buildOneCommonTermsClause( array $boostedField,
$queryString, array $profile ) {
- $common = new \Elastica\Query\Common( $boostedField['field'],
$queryString, $profile['cutoff_freq'] );
- $common->setMinimumShouldMatch( array (
- 'high_freq' => $profile['high_freq_min_should_match'],
- 'low_freq' => $profile['low_freq_min_should_match']
- ));
- $common->setBoost( $boostedField['boost'] );
- return $common;
- }
-
- /**
- * Builds a multi match query with the cross_field type.
- *
- * @param array[] $boostedFields Each nested array must contain a
'field' and 'boost' key.
- * @param string $queryString the query
- * @param string $minShouldMatch the MinimumShouldMatch value
- * @return \Elastica\Query\MultiMatch
- */
- private function buildCrossFields( array $boostedFields, $queryString,
$minShouldMatch ) {
- $fields = array();
- foreach( $boostedFields as $f ) {
- $fields[] = $f['field'] . '^' . $f['boost'];
- }
- $cross = new \Elastica\Query\MultiMatch();
- $cross->setQuery( $queryString );
- $cross->setFields( $fields );
- $cross->setType( 'cross_fields' );
- $cross->setMinimumShouldMatch( $minShouldMatch );
- return $cross;
- }
-
- /**
- * Builds a simple match query.
- *
- * @param array $boostedField the boostedField
- * @param string $queryString the query
- * @param string $minShouldMatch the MinimumShouldMatch value
- * @return \Elastica\Query\Match
- */
- private function buildSimpleMatch( array $boostedField, $queryString,
$minShouldMatch ) {
- $match = new \Elastica\Query\Match();
- $match->setField( $boostedField['field'], array(
- 'query' => $queryString,
- 'minimum_should_match' => $minShouldMatch,
- 'boost' => $boostedField['boost']
- ) );
- return $match;
- }
-
- /**
- * Use a SearchTextQueryStringBuilder with a default OR.
- *
- * @param string[] $fields
- * @param string $queryString
- * @param int $phraseSlop
- * @return \Elastica\Query\AbstractQuery
- */
- public function buildHighlightQuery( array $fields, $queryString,
$phraseSlop ) {
- return $this->queryStringBuilder->buildQueryString( $fields,
$queryString, $phraseSlop, 'OR' );
- }
-
- /**
- * Use a SearchTextQueryStringBuilder.
- *
- * @param string[] $fields
- * @param string $queryString
- * @param int $phraseSlop
- * @return \Elastica\Query\AbstractQuery
- */
- public function buildRescoreQuery( array $fields, $queryString,
$phraseSlop ) {
- return $this->queryStringBuilder->buildRescoreQuery( $fields,
$queryString, $phraseSlop );
- }
}
diff --git a/profiles/CommonTermsQueryProfiles.php
b/profiles/CommonTermsQueryProfiles.php
deleted file mode 100644
index cbe22b8..0000000
--- a/profiles/CommonTermsQueryProfiles.php
+++ /dev/null
@@ -1,103 +0,0 @@
-<?php
-
-namespace CirrusSearch;
-
-use WebRequest;
-
-/**
- * CirrusSearch - List of profiles for CommonsTermQuery
- * see
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-common-terms-query.html
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- */
-
-$wgCirrusSearchCommonTermsQueryProfiles = array(
- // This is the default profile
- 'default' => array(
- // Minimal number of required terms in the query
- // to activate the CommonTermsQuery
- 'min_query_terms' => 4,
- // The cut off frequency in % (per shard) used to split terms
- // into high freq (common words) and low freq groups (salient
- // words).
- 'cutoff_freq' => 0.001,
- // minimum_should_match operator to use for each group
- // see
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-minimum-should-match.html
- // 0 required for 1, and 50% required for 2 or more
- 'high_freq_min_should_match' => '0<0 1<50%',
- // If there are more that 5 salient terms 90% of them must match
- // all otherwise
- 'low_freq_min_should_match' => '5<90%',
- 'stems_clause' => array(
- // Configuration for language specific field
- // Which generally exclude stopwords
- 'use_common_terms' => true,
- 'cutoff_freq' => 0.05,
- // 0 out of 1, or 50% out of 2 or more
- 'high_freq_min_should_match' => '0<0 1<50%',
- 'low_freq_min_should_match' => '100%'
- ),
- ),
- 'strict' => array(
- 'min_query_terms' => 6,
- 'cutoff_freq' => 0.001,
- // 0 out of 1, or 75% out of 2 or more
- 'high_freq_min_should_match' => '0<0 1<75%',
- 'low_freq_min_should_match' => '100%',
- 'stems_clause' => array(
- // We do not use the common terms query here
- // Just a simple match query
- 'use_common_terms' => false,
- // All terms are required
- 'min_should_match' => '100%'
- ),
- ),
- 'aggressive_recall' => array(
- 'min_query_terms' => 3,
- 'cutoff_freq' => 0.0006,
- // 0 out of 1-2, or 25% out of 3 or more
- 'high_freq_min_should_match' => '0<0 2<25%',
- // 2 out of 2, or 66% out of 3 or more
- 'low_freq_min_should_match' => '2<66%',
- 'stems_clause' => array(
- 'use_common_terms' => true,
- 'cutoff_freq' => 0.001,
- // 0 out of 1-2, or 25% out of 3 or more
- 'high_freq_min_should_match' => '0<0 2<50%',
- // 3 out of 3, or 80% out of 4 or more
- 'low_freq_min_should_match' => '3<80%'
- ),
- )
-);
-
-class CommonTermsQueryProfiles {
- /**
- * @param WebRequest $request
- */
- public static function overrideOptions( WebRequest $request ) {
- global $wgCirrusSearchUseCommonTermsQuery,
- $wgCirrusSearchCommonTermsQueryProfile,
- $wgCirrusSearchCommonTermsQueryProfiles;
-
- Util::overrideYesNo( $wgCirrusSearchUseCommonTermsQuery,
$request, 'cirrusUseCommonTermsQuery' );
- if ( $wgCirrusSearchUseCommonTermsQuery ) {
- $profile = $request->getVal(
'cirrusCommonTermsQueryProfile' );
- if ( $profile !== null && isset (
$wgCirrusSearchCommonTermsQueryProfiles[$profile] ) ) {
- $wgCirrusSearchCommonTermsQueryProfile =
$wgCirrusSearchCommonTermsQueryProfiles[$profile];
- }
- }
- }
-}
diff --git a/tests/browser/features/step_definitions/search_steps.rb
b/tests/browser/features/step_definitions/search_steps.rb
index a953c2e..8cea880 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -21,11 +21,6 @@
@didyoumean_options ||= {}
@didyoumean_options[varname] = value
end
-When(/^I activate common terms query with the (.*) profile/) do |profile|
- @common_terms_option ||= {}
- @common_terms_option["cirrusUseCommonTermsQuery"] = "yes"
- @common_terms_option["cirrusCommonTermsQueryProfile"] = profile
-end
When(/^I api search( with rewrites enabled)?( with disabled incoming link
weighting)?(?: with offset (\d+))?(?: in the (.*) language)?(?: in namespaces?
(\d+(?: \d+)*))? for (.*)$/) do |enable_rewrites, incoming_links, offset, lang,
namespaces, search|
begin
@@ -37,7 +32,6 @@
enablerewrites: enable_rewrites ? 1 : 0
}
options = options.merge(@didyoumean_options) if defined?@didyoumean_options
- options = options.merge(@common_terms_option) if
defined?@common_terms_option
@api_result = search_for(
search.gsub(/%[^ {]+%/, @search_vars)
--
To view, visit https://gerrit.wikimedia.org/r/287125
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I10acc647008dc28b2fc33279180b0f6eb462fb4c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: es2.x
Gerrit-Owner: EBernhardson <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits