Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/102958
Change subject: Reimplement TitleKey's nearmatch finding
......................................................................
Reimplement TitleKey's nearmatch finding
We hook the SearchGetNearMatchBefore hook to jump in before core has a chance
to do most of its near match behavior but we still let it have a chance if we
don't find anything. This does waste some time as core duplicates our
efforts with regards to case mangling but it allows us to take advantage of
core's neat fallbacks like unquoting the search, finding user pages, and
file handling.
DEPLOYMENT: this won't find any results without an in place reindex
Bug: 58590
Change-Id: I6c249d81de5bcb617cbae209ace630d836b003cc
---
M CirrusSearch.php
M includes/CirrusSearch.body.php
M includes/CirrusSearchAnalysisConfigBuilder.php
M includes/CirrusSearchHooks.php
M includes/CirrusSearchMappingConfigBuilder.php
M includes/CirrusSearchResultsType.php
M includes/CirrusSearchSearcher.php
A tests/browser/features/go.feature
M tests/browser/features/prefix.feature
M tests/browser/features/step_definitions/search_steps.rb
M tests/browser/features/support/hooks.rb
M tests/browser/features/support/pages/search_results_page.rb
12 files changed, 180 insertions(+), 47 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/58/102958/1
diff --git a/CirrusSearch.php b/CirrusSearch.php
index 6b93c1e..ee61289 100644
--- a/CirrusSearch.php
+++ b/CirrusSearch.php
@@ -227,6 +227,7 @@
$wgHooks[ 'BeforeInitialize' ][] = 'CirrusSearchHooks::beforeInitializeHook';
$wgHooks[ 'GetBetaFeaturePreferences' ][] =
'CirrusSearchHooks::getPreferencesHook';
$wgHooks[ 'LinksUpdateComplete' ][] =
'CirrusSearchHooks::linksUpdateCompletedHook';
+$wgHooks[ 'SearchGetNearMatchBefore' ][] =
'CirrusSearch::searchGetNearMatchBeforeHook';
$wgHooks[ 'SoftwareInfo' ][] = 'CirrusSearchHooks::softwareInfoHook';
$wgHooks[ 'SpecialSearchResultsPrepend' ][] =
'CirrusSearchHooks::specialSearchResultsPrependHook';
$wgHooks[ 'UnitTestsList' ][] = 'CirrusSearchHooks::unitTestsList';
diff --git a/includes/CirrusSearch.body.php b/includes/CirrusSearch.body.php
index e7cbb96..b2923ad 100644
--- a/includes/CirrusSearch.body.php
+++ b/includes/CirrusSearch.body.php
@@ -1,7 +1,8 @@
<?php
/**
* SearchEngine implementation for CirrusSearch. Delegates to
- * CirrusSearchSearcher for searches and CirrusSearchUpdater for updates.
+ * CirrusSearchSearcher for searches and CirrusSearchUpdater for updates. Note
+ * that lots of search behavior is hooked rather than overridden here.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -72,6 +73,19 @@
}
/**
+ * Merge the prefix into the query (if any).
+ * @var string $term search term
+ * @return string possibly with a prefix appended
+ */
+ public function transformSearchTerm( $term ) {
+ if ( $this->prefix != '' ) {
+ // Slap the standard prefix notation onto the query
+ $term = $term . ' prefix:' . $this->prefix;
+ }
+ return $term;
+ }
+
+ /**
* Hooked to delegate prefix searching to CirrusSearchSearcher.
* @param int $ns namespace to search
* @param string $search search text
@@ -81,7 +95,7 @@
*/
public static function prefixSearch( $ns, $search, $limit, &$results ) {
$searcher = new CirrusSearchSearcher( 0, $limit, $ns );
- $searcher->setResultsType( new CirrusSearchTitleResultsType() );
+ $searcher->setResultsType( new CirrusSearchTitleResultsType(
true ) );
$status = $searcher->prefixSearch( $search );
// There is no way to send errors or warnings back to the
caller here so we have to make do with
// only sending results back if there are results and relying
on the logging done at the status
@@ -94,15 +108,33 @@
}
/**
- * Merge the prefix into the query (if any).
- * @var string $term search term
- * @return string possibly with a prefix appended
+ * Let Elasticsearch take a crack at getting near matches before
mediawiki tries all kinds of variants.
+ * @param array(string) $termAnAllLanguageVariants the original search
term and all language variants
+ * @param null|Title $titleResult resulting match. A Title if we found
something, unchanged otherwise.
+ * @return bool return false if we find something, true otherwise so
mediawiki can try its default behavior
*/
- public function transformSearchTerm( $term ) {
- if ( $this->prefix != '' ) {
- // Slap the standard prefix notation onto the query
- $term = $term . ' prefix:' . $this->prefix;
+ public static function searchGetNearMatchBeforeHook(
$termAndAllLanguageVariants, $titleResult ) {
+ // Elasticsearch should handle all language variants. If it
doesn't, we'll have to make it do so.
+ $term = $termAndAllLanguageVariants[ 0 ];
+ $title = Title::newFromText( $term );
+ if ( $title === null ) {
+ return false;
}
- return $term;
+
+ $searcher = new CirrusSearchSearcher( 0, 1, array(
$title->getNamespace() ) );
+ $searcher->setResultsType( new CirrusSearchTitleResultsType(
false ) );
+ $status = $searcher->lowercasedTitleSearch( $term );
+ // There is no way to send errors or warnings back to the
caller here so we have to make do with
+ // only sending results back if there are results and relying
on the logging done at the status
+ // constrution site to log errors.
+ if ( !$status->isOK() ) {
+ return true;
+ }
+ $array = $status->getValue();
+ if ( !isset( $array[ 0 ] ) ) {
+ return true;
+ }
+ $titleResult = $array[ 0 ];
+ return false;
}
}
diff --git a/includes/CirrusSearchAnalysisConfigBuilder.php
b/includes/CirrusSearchAnalysisConfigBuilder.php
index 73c359d..add4b6f 100644
--- a/includes/CirrusSearchAnalysisConfigBuilder.php
+++ b/includes/CirrusSearchAnalysisConfigBuilder.php
@@ -96,13 +96,13 @@
),
'prefix_ngram_filter' => array(
'type' => 'edgeNGram',
- 'max_gram' =>
CirrusSearchSearcher::MAX_PREFIX_SEARCH,
+ 'max_gram' =>
CirrusSearchSearcher::MAX_TITLE_SEARCH,
),
),
'tokenizer' => array(
'prefix' => array(
'type' => 'edgeNGram',
- 'max_gram' =>
CirrusSearchSearcher::MAX_PREFIX_SEARCH,
+ 'max_gram' =>
CirrusSearchSearcher::MAX_TITLE_SEARCH,
),
'no_splitting' => array( // Just grab the whole
term.
'type' => 'keyword',
diff --git a/includes/CirrusSearchHooks.php b/includes/CirrusSearchHooks.php
index a93e8a0..9a37ad8 100644
--- a/includes/CirrusSearchHooks.php
+++ b/includes/CirrusSearchHooks.php
@@ -1,6 +1,7 @@
<?php
/**
- * Simple wrappers around things for hooks to call.
+ * All CirrusSearch's hooks that aren't directly search related. The search
+ * related hooks are in CirrusSearch.body.php.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
diff --git a/includes/CirrusSearchMappingConfigBuilder.php
b/includes/CirrusSearchMappingConfigBuilder.php
index 4584880..1b81bb2 100644
--- a/includes/CirrusSearchMappingConfigBuilder.php
+++ b/includes/CirrusSearchMappingConfigBuilder.php
@@ -36,7 +36,7 @@
// Note never to set something as type='object' here because
that isn't returned by elasticsearch
// and is infered anyway.
- $titleExtraAnalyzers = array( 'suggest', 'prefix', 'keyword' );
+ $titleExtraAnalyzers = array( 'suggest', 'prefix', 'keyword',
'lowercase_keyword' );
if ( $wgCirrusSearchPrefixSearchStartsWithAnyWord ) {
$titleExtraAnalyzers[] = 'word_prefix';
}
diff --git a/includes/CirrusSearchResultsType.php
b/includes/CirrusSearchResultsType.php
index 4e51be6..2fa41c2 100644
--- a/includes/CirrusSearchResultsType.php
+++ b/includes/CirrusSearchResultsType.php
@@ -24,6 +24,16 @@
}
class CirrusSearchTitleResultsType implements CirrusSearchResultsType {
+ private $getText;
+
+ /**
+ * Build result type.
+ * @param boolean $getText should the results be text (true) or titles
(false)
+ */
+ public function __construct( $getText ) {
+ $this->getText = $getText;
+ }
+
public function getFields() {
return array( 'namespace', 'title' );
}
@@ -33,7 +43,11 @@
public function transformElasticsearchResult( $result ) {
$results = array();
foreach( $result->getResults() as $r ) {
- $results[] = Title::makeTitle( $r->namespace, $r->title
)->getPrefixedText();
+ $title = Title::makeTitle( $r->namespace, $r->title );
+ if ( $this->getText ) {
+ $title = $title->getPrefixedText();
+ }
+ $results[] = $title;
}
return $results;
}
diff --git a/includes/CirrusSearchSearcher.php
b/includes/CirrusSearchSearcher.php
index e2d2360..42115f8 100644
--- a/includes/CirrusSearchSearcher.php
+++ b/includes/CirrusSearchSearcher.php
@@ -28,12 +28,11 @@
const HIGHLIGHT_POST = '</span>';
/**
- * Maximum title length that we'll check in prefix search. Since
titles can
- * be 255 bytes in length we're setting this to 255 characters but this
- * might cause bloat in the title's prefix index so we'll have to keep
an
- * eye on this.
+ * Maximum title length that we'll check in prefix and keyword searches.
+ * Since titles can be 255 bytes in length we're setting this to 255
+ * characters.
*/
- const MAX_PREFIX_SEARCH = 255;
+ const MAX_TITLE_SEARCH = 255;
/**
* @var integer search offset
@@ -84,6 +83,11 @@
* since last update to decay half way. Defaults to 0 meaning don't
decay the score with time.
*/
private $preferRecentHalfLife = 0;
+ /**
+ * @var boolean should the query results boost pages with more incoming
links. Defaults to true but some search
+ * methods disable it.
+ */
+ private $boostForLinks = true;
public function __construct( $offset, $limit, $namespaces ) {
$this->offset = $offset;
@@ -99,17 +103,36 @@
}
/**
+ * Perform a lowercased title search.
+ * @param string $search text by which to search
+ * @return Status(mixed) status containing results defined by
resultsType on success
+ */
+ public function lowercasedTitleSearch( $search ) {
+ wfProfileIn( __METHOD__ );
+ self::checkTitleSearchRequestLength( $search );
+ wfDebugLog( 'CirrusSearch', "Lowercased title searching:
$search" );
+
+ $match = new \Elastica\Query\Match();
+ $match->setField( 'title.lowercase_keyword', $search );
+ $this->filters[] = new \Elastica\Filter\Query( $match );
+ $this->boostForLinks = false;
+
+ $this->description = "lowercase title search for '$search'";
+ $result = $this->search();
+ wfProfileOut( __METHOD );
+ return $result;
+ }
+
+ /**
* Perform a prefix search.
- * @param $search
- * @param Status(array(string)) of titles
+ * @param string $search text by which to search
+ * @param Status(mixed) status containing results defined by
resultsType on success
*/
public function prefixSearch( $search ) {
+ wfProfileIn( __METHOD__ );
global $wgCirrusSearchPrefixSearchStartsWithAnyWord;
- $requestLength = strlen( $search );
- if ( $requestLength > self::MAX_PREFIX_SEARCH ) {
- throw new UsageException( 'Prefix search requset was
longer longer than the maximum allowed length.' .
- " ($requestLength > " . self::MAX_PREFIX_SEARCH
. ')', 'request_too_long', 400 );
- }
+
+ self::checkTitleSearchRequestLength( $search );
wfDebugLog( 'CirrusSearch', "Prefix searching: $search" );
if ( $wgCirrusSearchPrefixSearchStartsWithAnyWord ) {
@@ -125,15 +148,16 @@
}
$this->description = "prefix search for '$search'";
- $this->buildFullTextResults = false;
- return $this->search();
+ $result = $this->search();
+ wfProfileOut( __METHOD );
+ return $result;
}
/**
* Search articles with provided term.
* @param $term string term to search
* @param $showRedirects boolean should this request show redirects?
- * @return Status(CirrusSearchResultSet|null)
+ * @param Status(mixed) status containing results defined by
resultsType on success
*/
public function searchText( $term, $showRedirects ) {
wfProfileIn( __METHOD__ );
@@ -656,7 +680,7 @@
private function buildPrefixFilter( $search ) {
$match = new \Elastica\Query\Match();
$match->setField( 'title.prefix', array(
- 'query' => substr( $search, 0, self::MAX_PREFIX_SEARCH
),
+ 'query' => $search,
'analyzer' => 'lowercase_keyword',
) );
return new \Elastica\Filter\Query( $match );
@@ -752,17 +776,21 @@
}
/**
- * Wrap query in link (and potentially last update time) based boosts.
- * @param $query null|Elastica\Query optional query to boost. if null
the match_all is assumed
+ * Wrap query in a CustomScore query if its score need to be modified.
+ * @param $query Elastica\Query query to boost.
* @return query that will run $query and boost results based on links
*/
- private function boostQuery( $query = null ) {
- // MVEL code for incoming links boost
- $incomingLinks = "(doc['incoming_links'].empty ? 0 :
doc['incoming_links'].value)";
- $incomingRedirectLinks = "(doc['incoming_redirect_links'].empty
? 0 : doc['incoming_redirect_links'].value)";
- $scoreBoostMvel = " * log10($incomingLinks +
$incomingRedirectLinks + 2)";
- // MVEL code for last update time decay
- $lastUpdateDecayMvel = '';
+ private function boostQuery( $query ) {
+ $scoreCustomizations = array();
+
+ // Customize score by boosting based on incoming links count
+ if ( $this->boostForLinks ) {
+ $incomingLinks = "(doc['incoming_links'].empty ? 0 :
doc['incoming_links'].value)";
+ $incomingRedirectLinks =
"(doc['incoming_redirect_links'].empty ? 0 :
doc['incoming_redirect_links'].value)";
+ $scoreCustomizations[] = " * log10($incomingLinks +
$incomingRedirectLinks + 2)";
+ }
+
+ // Customize score by decaying a portion by time since last
update
if ( $this->preferRecentDecayPortion > 0 &&
$this->preferRecentHalfLife > 0 ) {
// Convert half life for time in days to decay constant
for time in milliseconds.
$decayConstant = log( 2 ) / $this->preferRecentHalfLife
/ 86400000;
@@ -774,8 +802,21 @@
}
// p(e^ct - 1) + 1 which is easier to calculate than
bet reduces to 1 - p + pe^ct
// Which breaks the score into an unscaled portion (1 -
p) and a scaled portion (p)
- $lastUpdateDecayMvel = " * ($exponentialDecayMvel + 1)";
+ $scoreCustomizations[] = " * ($exponentialDecayMvel +
1)";
}
- return new \Elastica\Query\CustomScore( '_score' .
$scoreBoostMvel . $lastUpdateDecayMvel, $query );
+
+ if ( $scoreCustomizations ) {
+ return new \Elastica\Query\CustomScore( '_score' .
implode( $scoreCustomizations ), $query );
+ }
+ return $query;
+ }
+
+
+ private function checkTitleSearchRequestLength( $search ) {
+ $requestLength = strlen( $search );
+ if ( $requestLength > self::MAX_TITLE_SEARCH ) {
+ throw new UsageException( 'Prefix search request was
longer longer than the maximum allowed length.' .
+ " ($requestLength > " . self::MAX_TITLE_SEARCH
. ')', 'request_too_long', 400 );
+ }
}
}
diff --git a/tests/browser/features/go.feature
b/tests/browser/features/go.feature
new file mode 100644
index 0000000..7a79a64
--- /dev/null
+++ b/tests/browser/features/go.feature
@@ -0,0 +1,25 @@
+Feature: Go Search
+ @go
+ Scenario: I can "go" to a page with mixed capital and lower case name by the
name all lower cased
+ When I go search for mixedcapsandlowercase
+ Then I am on a page titled MixedCapsAndLowerCase
+
+ @go
+ Scenario: I can "go" to a page with mixed capital and lower case name by the
name with totally wrong case cased
+ When I go search for miXEdcapsandlowercASe
+ Then I am on a page titled MixedCapsAndLowerCase
+
+ @go
+ Scenario: I can "go" to a page with an accented character without the accent
+ When I go search for africa
+ Then I am on a page titled África
+
+ @go @from_core
+ Scenario: I can "go" to a page with mixed capital and lower case name by the
name all lower cased and quoted
+ When I go search for "mixedcapsandlowercase"
+ Then I am on a page titled MixedCapsAndLowerCase
+
+ @go @from_core
+ Scenario: I can "go" to a user's page whether it is there or not
+ When I go search for User:DoesntExist
+ Then I am on a page titled User:DoesntExist
diff --git a/tests/browser/features/prefix.feature
b/tests/browser/features/prefix.feature
index 5d309ad..ff645bc 100644
--- a/tests/browser/features/prefix.feature
+++ b/tests/browser/features/prefix.feature
@@ -19,8 +19,7 @@
| two words | Two Words | Two Words
|
| ~catapult | none | Search results
|
| África | África | África
|
-# Hitting enter in a search for Africa should pull up África but that bug is
beyond me.
- | Africa | África | Search results
|
+ | Africa | África | África
|
| Template:Template Test | Template:Template Test | Template:Template Test
|
Scenario: Suggestions don't appear when you search for a string that is too
long
diff --git a/tests/browser/features/step_definitions/search_steps.rb
b/tests/browser/features/step_definitions/search_steps.rb
index 6d78795..00d2c0c 100644
--- a/tests/browser/features/step_definitions/search_steps.rb
+++ b/tests/browser/features/step_definitions/search_steps.rb
@@ -1,6 +1,9 @@
Given(/^I am at the search results page(?: with the search (.+?)(?: and the
prefix (.+))?)?$/) do |search, prefix|
visit(SearchResultsPage, using_params: {search: search, prefix: prefix})
end
+When(/^I go search for (.*)$/) do |search|
+ visit(SearchResultsPage, using_params: {search: search})
+end
When(/^I type (.+) into the search box$/) do |search_term|
on(SearchPage).search_input = search_term
diff --git a/tests/browser/features/support/hooks.rb
b/tests/browser/features/support/hooks.rb
index 297d9be..9779741 100644
--- a/tests/browser/features/support/hooks.rb
+++ b/tests/browser/features/support/hooks.rb
@@ -19,13 +19,21 @@
Before('@setup_main') do
if !$setup_main2
steps %Q{
- Given a page named África exists with contents for testing
- And a page named Rdir exists with contents #REDIRECT [[Two Words]]
+ Given a page named Rdir exists with contents #REDIRECT [[Two Words]]
And a file named File:Savepage-greyed.png exists with contents
Savepage-greyed.png and description Screenshot, for test purposes, associated
with https://bugzilla.wikimedia.org/show_bug.cgi?id=52908 .
And a page named IHaveAVideo exists with contents [[File:How to Edit
Article in Arabic Wikipedia.ogg|thumb|267x267px]]
And a page named IHaveASound exists with contents [[File:Serenade for
Strings -mvt-1- Elgar.ogg]]
}
$setup_main2 = true
+ end
+end
+
+Before('@setup_main, @go') do
+ if !$africa
+ steps %Q{
+ Given a page named África exists with contents for testing
+ }
+ $africa = true
end
end
@@ -207,3 +215,12 @@
end
$hastemplate = true
end
+
+Before("@go") do
+ if !$go
+ steps %Q{
+ Given a page named MixedCapsAndLowerCase exists
+ }
+ end
+ $go = true
+end
diff --git a/tests/browser/features/support/pages/search_results_page.rb
b/tests/browser/features/support/pages/search_results_page.rb
index 58824ec..9c5994e 100644
--- a/tests/browser/features/support/pages/search_results_page.rb
+++ b/tests/browser/features/support/pages/search_results_page.rb
@@ -1,7 +1,7 @@
class SearchResultsPage
include PageObject
- page_url
URL.url("/w/index.php?search=<%=params[:search]%>&prefix=<%=params[:prefix]%>")
+ page_url URL.url("/w/index.php?search=<%=params[:search]%><%if
(params[:prefix]) %>&prefix=<%=params[:prefix]%><% end %>")
text_field(:search, id: "searchText")
h1(:title, id: "firstHeading")
--
To view, visit https://gerrit.wikimedia.org/r/102958
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I6c249d81de5bcb617cbae209ace630d836b003cc
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits