EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/287125

Change subject: Drop support for common terms query
......................................................................

Drop support for common terms query

This was an experiment that didn't end up making a useful difference in
search. The feature is unused so lets clean it up.

Bug: T133749
Change-Id: I10acc647008dc28b2fc33279180b0f6eb462fb4c
---
M CirrusSearch.php
M autoload.php
M includes/Hooks.php
M includes/Search/SearchContext.php
M includes/Search/SearchTextQueryBuilders.php
D profiles/CommonTermsQueryProfiles.php
M tests/browser/features/step_definitions/search_steps.rb
7 files changed, 0 insertions(+), 416 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/25/287125/1

diff --git a/CirrusSearch.php b/CirrusSearch.php
index 778edce..36bea41 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -23,7 +23,6 @@
 
 require_once __DIR__ . "/profiles/SuggestProfiles.php";
 require_once __DIR__ . "/profiles/PhraseSuggesterProfiles.php";
-require_once __DIR__ . "/profiles/CommonTermsQueryProfiles.php";
 require_once __DIR__ . "/profiles/RescoreProfiles.php";
 require_once __DIR__ . "/profiles/SimilarityProfiles.php";
 
@@ -836,21 +835,6 @@
  * duplicating $wgCirrusSearchInterwikiSources. This needs to be fixed.
  */
 $wgCirrusSearchWikiToNameMap = array();
-
-/**
- * Enable common terms query.
- * This query is enabled only if the query string does not contain any special
- * syntax and the number of terms is greater than one defined in the profile
- * NOTE: CommonTermsQuery can be more restrictive in some cases if the all
- * field is disabled (see $wgCirrusSearchAllFields).
- */
-$wgCirrusSearchUseCommonTermsQuery = false;
-
-/**
- * Set the Common terms query profile to default.
- * see profiles/CommonTermsQueryProfiles.php for more info.
- */
-$wgCirrusSearchCommonTermsQueryProfile = 
$wgCirrusSearchCommonTermsQueryProfiles['default'];
 
 /**
  * If set to non-empty string, interwiki results will have ?wprov=XYZ 
parameter added.
diff --git a/autoload.php b/autoload.php
index c95127a..c81556c 100644
--- a/autoload.php
+++ b/autoload.php
@@ -117,7 +117,6 @@
        'CirrusSearch\\Search\\ScriptScoreFunctionScoreBuilder' => __DIR__ . 
'/includes/Search/RescoreBuilders.php',
        'CirrusSearch\\Search\\SearchContext' => __DIR__ . 
'/includes/Search/SearchContext.php',
        'CirrusSearch\\Search\\SearchTextBaseQueryBuilder' => __DIR__ . 
'/includes/Search/SearchTextQueryBuilders.php',
-       'CirrusSearch\\Search\\SearchTextCommonTermsQueryBuilder' => __DIR__ . 
'/includes/Search/SearchTextQueryBuilders.php',
        'CirrusSearch\\Search\\SearchTextQueryBuilder' => __DIR__ . 
'/includes/Search/SearchTextQueryBuilders.php',
        'CirrusSearch\\Search\\SearchTextQueryBuilderFactory' => __DIR__ . 
'/includes/Search/SearchTextQueryBuilders.php',
        'CirrusSearch\\Search\\SearchTextQueryStringBuilder' => __DIR__ . 
'/includes/Search/SearchTextQueryBuilders.php',
diff --git a/includes/Hooks.php b/includes/Hooks.php
index e4c242d..ad4d806 100644
--- a/includes/Hooks.php
+++ b/includes/Hooks.php
@@ -119,7 +119,6 @@
                        self::overrideUseExtraPluginForRegex( $request );
                        self::overrideMoreLikeThisOptions( $request );
                        PhraseSuggesterProfiles::overrideOptions( $request );
-                       CommonTermsQueryProfiles::overrideOptions( $request );
                        RescoreProfiles::overrideOptions( $request );
                        self::overrideSecret( 
$wgCirrusSearchLogElasticRequests, $wgCirrusSearchLogElasticRequestsSecret, 
$request, 'cirrusLogElasticRequests', false );
                        self::overrideYesNo( $wgCirrusSearchEnableAltLanguage, 
$request, 'cirrusAltLanguage' );
diff --git a/includes/Search/SearchContext.php 
b/includes/Search/SearchContext.php
index 122c7a9..813d25d 100644
--- a/includes/Search/SearchContext.php
+++ b/includes/Search/SearchContext.php
@@ -117,20 +117,6 @@
        }
 
        /**
-        * @return array the CommonTermsQuery profile
-        */
-       public function getCommonTermsQueryProfile() {
-               return $this->config->getElement( 
'CirrusSearchCommonTermsQueryProfile' );
-       }
-
-       /**
-        * @return bool true if CommonTermsQuery is allowed
-        */
-       public function isUseCommonTermsQuery() {
-               return (bool) 
$this->config->get('CirrusSearchUseCommonTermsQuery' );
-       }
-
-       /**
         * @return true if the query contains special syntax
         */
        public function isSearchContainedSyntax() {
diff --git a/includes/Search/SearchTextQueryBuilders.php 
b/includes/Search/SearchTextQueryBuilders.php
index 64d06fb..cad1e80 100644
--- a/includes/Search/SearchTextQueryBuilders.php
+++ b/includes/Search/SearchTextQueryBuilders.php
@@ -42,7 +42,6 @@
 
        public function __construct( SearchContext $context ) {
                $this->context = $context;
-               $this->builders[] = new SearchTextCommonTermsQueryBuilder( 
$context );
                $this->builders[] = new SearchTextQueryStringBuilder( $context 
);
        }
 
@@ -197,278 +196,4 @@
                return $query;
        }
 
-}
-
-/**
- * This builder builds a query based on the common terms query.  The builder
- * will build a commons term query for the plain fields.  Another common terms
- * query for the stem fields will be built if specified in the profile. A
- * simple/multimatch match query will be built otherwise.
- *
- * The common terms query is well suited for the plain field because stopwords
- * are not filtered, finding a cutoff freq is "relatively" easy.  Concerning
- * the commons term query and the stem field it's a bit trickier.  It can help
- * to determine what words the user would like to see in the results e.g.
- * 'what is snail slime made of'.
- * While 'what', 'is' and 'of' will be filtered by the stopword filter 'made'
- * can be considered as a high freq term and 'snail slime' will be required.
- * Showing best results with 'snail' and 'slimes' even if 'made' is not
- * present.
- *
- * This is not necessarily the case for all queries: e.g. 'interesting facts
- * about kennedy assassination'. In this case the most important words are
- * certainly 'kennedy' and 'assassination'.  But it appears that 'interesting'
- * has a lower docFreq than 'kennedy' on english Wikipedia, so if the cutoff is
- * not properly set 'kennedy' might be considered as high freq while
- * 'interesting' will be a low freq.
- */
-class SearchTextCommonTermsQueryBuilder extends SearchTextBaseQueryBuilder {
-       /**
-        * The builder used for rescore and highlight query.
-        *
-        * @var SearchTextBaseQueryBuilder
-        */
-       private $queryStringBuilder;
-
-       /**
-        * The profile used to configure the queries
-        *
-        * @var array
-        */
-       private $profile;
-
-       public function __construct( SearchContext $config ) {
-               parent::__construct( $config );
-               $this->queryStringBuilder = new SearchTextQueryStringBuilder( 
$config );
-               $this->profile = $this->context->getCommonTermsQueryProfile();
-       }
-
-       /**
-        * The query is accepted if the number of words in the query
-        * is greater or equal than min_query_terms defined in the
-        * CirrusSearchCommonTerm.
-        *
-        * @param string $queryString
-        * @return bool
-        */
-       public function accept( $queryString ) {
-               if ( !$this->context->isUseCommonTermsQuery() ) {
-                       return false;
-               }
-
-               // the Searcher class relies heavily on the QueryString syntax 
and
-               // can generate QueryString syntax (i.e wildcards)
-               // This builder cannot understand such syntax.
-               if ( $this->context->isSearchContainedSyntax() ) {
-                       // It's likely a query string syntax...
-                       return false;
-               }
-
-               // XXX: this will work only for languages where space is a word 
separator
-               // (note that query string has the same issue)
-               // Ideal solution would be to move this code to java and reuse 
our lucene
-               // analysis chain.
-               // This is complex because the number of tokens depends on the 
field analyzer :
-               // "what's the most dangerous snake"
-               // is 5 tokens with the plain analyzer ("what's" remains 
"what's")
-               // but 6 tokens with some language specific analyzer ("what's" 
is
-               // "what" and "s" with the english analyzer)
-               if ( str_word_count( $queryString ) < 
$this->profile['min_query_terms'] ) {
-                       return false;
-               }
-
-               // @fixme throw in a complete hack which will inform javascript 
that
-               // this query was a common terms query. This should be removed 
when the
-               // test is over, or a more generic way to report on the query 
type
-               // should be invented.
-               global $wgOut;
-               $wgOut->addJsConfigVars( 'wgCirrusCommonTermsApplicable', true 
);
-
-               $request = RequestContext::getMain()->getRequest();
-               if ( $request !== null && $request->getVal( 
'cirrusCommonTermsQueryControlGroup' ) === 'yes' ) {
-                       return false;
-               }
-
-
-               return true;
-       }
-
-       /**
-        * @param string[] $fields
-        * @param string $queryString
-        * @param int $phraseSlop
-        * @return \Elastica\Query\AbstractQuery
-        */
-       public function buildMainQuery( array $fields, $queryString, 
$phraseSlop ) {
-               $plainFields = array();
-               $stemFields = array();
-
-               // Separate plain and stem fields first
-               foreach( $fields as $f ) {
-                       list( $field, $boost ) = explode( '^', $f, 2 );
-                       $fieldInfo = array ( 'field' => $field, 'boost' => 
$boost );
-                       if ( Util::endsWith( $field, '.plain' ) ) {
-                               $plainFields[] = $fieldInfo;
-                       } else {
-                               $stemFields[] = $fieldInfo;
-                       }
-               }
-               $query = new \Elastica\Query\BoolQuery();
-               $query->setMinimumNumberShouldMatch( 1 );
-               // We always build a common terms query for the plain field
-               $this->attachCommonTermsClause( $query, $plainFields, 
$queryString, $this->profile );
-
-               // We can use different types of query for the stem field.
-               if ( count( $stemFields ) === 1 ) {
-                       $this->attachSingleFieldStemClause( $query, 
$stemFields[0], $queryString );
-               } else {
-                       $this->attachMultiFieldsStemClause( $query, 
$stemFields, $queryString );
-               }
-               return $query;
-       }
-
-       /**
-        * Attach the query for the stem field. It will build a set of common
-        * terms query if use_common_terms is true for the stems clause or a
-        * multi match (cross_fields) if false.
-        *
-        * @param \Elastica\Query\BoolQuery $query the boolean query to attach 
the new
-        * clause
-        * @param array $stemFields of boost field
-        * @param string $queryString the query
-        */
-       private function attachMultiFieldsStemClause( \Elastica\Query\BoolQuery 
$query, array $stemFields, $queryString ) {
-               if ( $this->profile['stems_clause']['use_common_terms'] === 
true ) {
-                       $bool = new \Elastica\Query\BoolQuery();
-                       $bool->setMinimumNumberShouldMatch( 1 );
-                       $this->attachCommonTermsClause( $bool, $stemFields, 
$queryString,
-                               $this->profile['stems_clause'] );
-                       $query->addShould( $bool );
-               } else {
-                       $query->addShould( $this->buildCrossFields( 
$stemFields, $queryString,
-                               
$this->profile['stems_clause']['min_should_match'] ) );
-               }
-       }
-
-       /**
-        * Attach the query for the stem field. Will build a single common
-        * terms query if use_common_terms is true of a simple match if false.
-        *
-        * @param \Elastica\Query\BoolQuery $query the boolean query to attach 
the
-        * new clause
-        * @param array $boostedField the boosted field
-        * @param string $queryString the query
-        */
-       private function attachSingleFieldStemClause( \Elastica\Query\BoolQuery 
$query, array $boostedField, $queryString ) {
-               if ( $this->profile['stems_clause']['use_common_terms'] === 
true ) {
-                       $query->addShould( $this->buildOneCommonTermsClause( 
$boostedField, $queryString,
-                               $this->profile['stems_clause'] ) );
-               } else {
-                       $query->addShould( $this->buildSimpleMatch( 
$boostedField, $queryString,
-                               
$this->profile['stems_clause']['min_should_match'] ) );
-               }
-       }
-
-       /**
-        * Attach a common terms query to $parent with a should.
-        * Note that if the all field is not used this can be more restrictive:
-        * for the query word1 word2 both words would have to appear in the
-        * same field. We cannot use similar techniques like cross_field of
-        * QueryString with multiple fields which allows both words to be
-        * in different fields.
-        *
-        * @param \Elastica\Query\BoolQuery $parent
-        * @param array $boostedFields of boostedFields
-        * @param string $queryString the query
-        * @param array $profile the profile
-        */
-       private function attachCommonTermsClause( \Elastica\Query\BoolQuery 
$parent, array $boostedFields, $queryString, $profile ) {
-               foreach( $boostedFields as $boostedField ) {
-                       $parent->addShould( $this->buildOneCommonTermsClause( 
$boostedField, $queryString, $profile ) );
-               }
-       }
-
-       /**
-        * Builds a common terms query clause.
-        *
-        * @param array $boostedField the boosted field
-        * @param string $queryString the query
-        * @param array $profile the profile used by the common terms query
-        * @return \Elastica\Query\Common the common terms query.
-        * @suppress PhanTypeMismatchArgument minimum_should_match can be 
applied
-        *  for low and high frequency terms, but the type signature hasn't been
-        *  updated to match
-        */
-       private function buildOneCommonTermsClause( array $boostedField, 
$queryString, array $profile ) {
-               $common = new \Elastica\Query\Common( $boostedField['field'], 
$queryString, $profile['cutoff_freq'] );
-               $common->setMinimumShouldMatch( array (
-                       'high_freq' => $profile['high_freq_min_should_match'],
-                       'low_freq' => $profile['low_freq_min_should_match']
-               ));
-               $common->setBoost( $boostedField['boost'] );
-               return $common;
-       }
-
-       /**
-        * Builds a multi match query with the cross_field type.
-        *
-        * @param array[] $boostedFields Each nested array must contain a 
'field' and 'boost' key.
-        * @param string $queryString the query
-        * @param string $minShouldMatch the MinimumShouldMatch value
-        * @return \Elastica\Query\MultiMatch
-        */
-       private function buildCrossFields( array $boostedFields, $queryString, 
$minShouldMatch ) {
-               $fields = array();
-               foreach( $boostedFields as $f ) {
-                       $fields[] = $f['field'] . '^' . $f['boost'];
-               }
-               $cross = new \Elastica\Query\MultiMatch();
-               $cross->setQuery( $queryString );
-               $cross->setFields( $fields );
-               $cross->setType( 'cross_fields' );
-               $cross->setMinimumShouldMatch( $minShouldMatch );
-               return $cross;
-       }
-
-       /**
-        * Builds a simple match query.
-        *
-        * @param array $boostedField the boostedField
-        * @param string $queryString the query
-        * @param string $minShouldMatch the MinimumShouldMatch value
-        * @return \Elastica\Query\Match
-        */
-       private function buildSimpleMatch( array $boostedField, $queryString, 
$minShouldMatch ) {
-               $match = new \Elastica\Query\Match();
-               $match->setField( $boostedField['field'], array(
-                       'query' => $queryString,
-                       'minimum_should_match' => $minShouldMatch,
-                       'boost' => $boostedField['boost']
-               ) );
-               return $match;
-       }
-
-       /**
-        * Use a SearchTextQueryStringBuilder with a default OR.
-        *
-        * @param string[] $fields
-        * @param string $queryString
-        * @param int $phraseSlop
-        * @return \Elastica\Query\AbstractQuery
-        */
-       public function buildHighlightQuery( array $fields, $queryString, 
$phraseSlop ) {
-               return $this->queryStringBuilder->buildQueryString( $fields, 
$queryString, $phraseSlop, 'OR' );
-       }
-
-       /**
-        * Use a SearchTextQueryStringBuilder.
-        *
-        * @param string[] $fields
-        * @param string $queryString
-        * @param int $phraseSlop
-        * @return \Elastica\Query\AbstractQuery
-        */
-       public function buildRescoreQuery( array $fields, $queryString, 
$phraseSlop ) {
-               return $this->queryStringBuilder->buildRescoreQuery( $fields, 
$queryString, $phraseSlop );
-       }
 }
diff --git a/profiles/CommonTermsQueryProfiles.php 
b/profiles/CommonTermsQueryProfiles.php
deleted file mode 100644
index cbe22b8..0000000
--- a/profiles/CommonTermsQueryProfiles.php
+++ /dev/null
@@ -1,103 +0,0 @@
-<?php
-
-namespace CirrusSearch;
-
-use WebRequest;
-
-/**
- * CirrusSearch - List of profiles for CommonsTermQuery
- * see 
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-common-terms-query.html
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License along
- * with this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
- * http://www.gnu.org/copyleft/gpl.html
- */
-
-$wgCirrusSearchCommonTermsQueryProfiles = array(
-       // This is the default profile
-       'default' => array(
-               // Minimal number of required terms in the query
-               // to activate the CommonTermsQuery
-               'min_query_terms' => 4,
-               // The cut off frequency in % (per shard) used to split terms
-               // into high freq (common words) and low freq groups (salient
-               // words).
-               'cutoff_freq' => 0.001,
-               // minimum_should_match operator to use for each group
-               // see 
https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-minimum-should-match.html
-               // 0 required for 1, and 50% required for 2 or more
-               'high_freq_min_should_match' => '0<0 1<50%',
-               // If there are more that 5 salient terms 90% of them must match
-               // all otherwise
-               'low_freq_min_should_match' => '5<90%',
-               'stems_clause' => array(
-                       // Configuration for language specific field
-                       // Which generally exclude stopwords
-                       'use_common_terms' => true,
-                       'cutoff_freq' => 0.05,
-                       // 0 out of 1, or 50% out of 2 or more
-                       'high_freq_min_should_match' => '0<0 1<50%',
-                       'low_freq_min_should_match' => '100%'
-               ),
-       ),
-       'strict' => array(
-               'min_query_terms' => 6,
-               'cutoff_freq' => 0.001,
-               // 0 out of 1, or 75% out of 2 or more
-               'high_freq_min_should_match' => '0<0 1<75%',
-               'low_freq_min_should_match' => '100%',
-               'stems_clause' => array(
-                       // We do not use the common terms query here
-                       // Just a simple match query
-                       'use_common_terms' => false,
-                       // All terms are required
-                       'min_should_match' => '100%'
-               ),
-       ),
-       'aggressive_recall' => array(
-               'min_query_terms' => 3,
-               'cutoff_freq' => 0.0006,
-               // 0 out of 1-2, or 25% out of 3 or more
-               'high_freq_min_should_match' => '0<0 2<25%',
-               // 2 out of 2, or 66% out of 3 or more
-               'low_freq_min_should_match' => '2<66%',
-               'stems_clause' => array(
-                       'use_common_terms' => true,
-                       'cutoff_freq' => 0.001,
-                       // 0 out of 1-2, or 25% out of 3 or more
-                       'high_freq_min_should_match' => '0<0 2<50%',
-                       // 3 out of 3, or 80% out of 4 or more
-                       'low_freq_min_should_match' => '3<80%'
-               ),
-       )
-);
-
-class CommonTermsQueryProfiles {
-       /**
-        * @param WebRequest $request
-        */
-       public static function overrideOptions( WebRequest $request ) {
-               global $wgCirrusSearchUseCommonTermsQuery,
-                       $wgCirrusSearchCommonTermsQueryProfile,
-                       $wgCirrusSearchCommonTermsQueryProfiles;
-
-               Util::overrideYesNo( $wgCirrusSearchUseCommonTermsQuery, 
$request, 'cirrusUseCommonTermsQuery' );
-               if ( $wgCirrusSearchUseCommonTermsQuery ) {
-                       $profile = $request->getVal( 
'cirrusCommonTermsQueryProfile' );
-                       if ( $profile !== null && isset ( 
$wgCirrusSearchCommonTermsQueryProfiles[$profile] ) ) {
-                               $wgCirrusSearchCommonTermsQueryProfile = 
$wgCirrusSearchCommonTermsQueryProfiles[$profile];
-                       }
-               }
-       }
-}
diff --git a/tests/browser/features/step_definitions/search_steps.rb 
b/tests/browser/features/step_definitions/search_steps.rb
index a953c2e..8cea880 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -21,11 +21,6 @@
   @didyoumean_options ||= {}
   @didyoumean_options[varname] = value
 end
-When(/^I activate common terms query with the (.*) profile/) do |profile|
-  @common_terms_option ||= {}
-  @common_terms_option["cirrusUseCommonTermsQuery"] = "yes"
-  @common_terms_option["cirrusCommonTermsQueryProfile"] = profile
-end
 
 When(/^I api search( with rewrites enabled)?( with disabled incoming link 
weighting)?(?: with offset (\d+))?(?: in the (.*) language)?(?: in namespaces? 
(\d+(?: \d+)*))? for (.*)$/) do |enable_rewrites, incoming_links, offset, lang, 
namespaces, search|
   begin
@@ -37,7 +32,6 @@
       enablerewrites: enable_rewrites ? 1 : 0
     }
     options = options.merge(@didyoumean_options) if defined?@didyoumean_options
-    options = options.merge(@common_terms_option) if 
defined?@common_terms_option
 
     @api_result = search_for(
       search.gsub(/%[^ {]+%/, @search_vars)

-- 
To view, visit https://gerrit.wikimedia.org/r/287125
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I10acc647008dc28b2fc33279180b0f6eb462fb4c
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: es2.x
Gerrit-Owner: EBernhardson <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to