jenkins-bot has submitted this change and it was merged.
Change subject: Term containing * match against unstemmed text
......................................................................
Term containing * match against unstemmed text
Matching terms containing * against stemmed text didn't work too well
because Elasticsearch didn't stem the terms so things like pi*les would
match nothing while pi*le would match "pickles". This is truly
backwards.
Bug: 56163
Change-Id: I1e1a56616409e0ebcf84117287bd11087044bab5
---
M includes/CirrusSearchSearcher.php
1 file changed, 70 insertions(+), 23 deletions(-)
Approvals:
Chad: Looks good to me, approved
jenkins-bot: Verified
diff --git a/includes/CirrusSearchSearcher.php
b/includes/CirrusSearchSearcher.php
index fa12d76..c4e4930 100644
--- a/includes/CirrusSearchSearcher.php
+++ b/includes/CirrusSearchSearcher.php
@@ -170,36 +170,43 @@
);
$this->filters = $filters;
wfProfileOut( __METHOD__ . '-other-filters' );
- wfProfileIn( __METHOD__ . '-find-phrase-queries-and-escape' );
- $query = array();
- $matches = array();
- $offset = 0;
- while ( preg_match(
'/(?<main>"([^"]+)"(?:~[0-9]+)?)(?<fuzzy>~)?/',
- $term, $matches, PREG_OFFSET_CAPTURE, $offset )
) {
- $startOffset = $matches[ 0 ][ 1 ];
- if ( $startOffset > $offset ) {
- $query[] = self::fixupQueryStringPart( substr(
$term, $offset, $startOffset - $offset ) );
- }
+ wfProfileIn( __METHOD__ . '-switch-phrase-queries-to-plain' );
+ $query = self::replacePartsOfQuery( $term,
'/(?<main>"([^"]+)"(?:~[0-9]+)?)(?<fuzzy>~)?/',
+ function ( $matches ) use ( $showRedirects ) {
+ $main =
CirrusSearchSearcher::fixupQueryStringPart( $matches[ 'main' ][ 0 ] );
+ if ( !isset( $matches[ 'fuzzy' ] ) ) {
+ $main =
CirrusSearchSearcher::switchSearchToExact( $main, $showRedirects );
+ }
+ return array( 'escaped' => $main );
+ } );
+ wfProfileOut( __METHOD__ . '-find-phrase-queries' );
+ wfProfileIn( __METHOD__ . '-switch-prefix-to-plain' );
+ $query = self::replaceAllPartsOfQuery( $query,
'/\w*\*(?:\w*\*?)*/',
+ function ( $matches ) use ( $showRedirects ) {
+ $term =
CirrusSearchSearcher::fixupQueryStringPart( $matches[ 0 ][ 0 ] );
+ return array( 'escaped' =>
CirrusSearchSearcher::switchSearchToExact( $term, $showRedirects ) );
+ } );
+ wfProfileOut( __METHOD__ . '-switch-phrase-queries-to-plain' );
- $main = self::fixupQueryStringPart( $matches[ 'main' ][
0 ] );
- if ( isset( $matches[ 'fuzzy' ] ) ) {
- $query[] = $main;
- } else {
- $main = $main;
- $exact = join( ' OR ',
self::buildFullTextSearchFields( $showRedirects, ".plain:$main" ) );
- $query[] = "($exact)";
+ wfProfileIn( __METHOD__ . '-escape' );
+ $escapedQuery = array();
+ foreach ( $query as $queryPart ) {
+ if ( isset( $queryPart[ 'escaped' ] ) ) {
+ $escapedQuery[] = $queryPart[ 'escaped' ];
+ continue;
}
- $offset = $startOffset + strlen( $matches[ 0 ][ 0 ] );
+ if ( isset( $queryPart[ 'raw' ] ) ) {
+ $escapedQuery[] = self::fixupQueryStringPart(
$queryPart[ 'raw' ] );
+ continue;
+ }
+ wfLogWarning( 'Unknown query part: ' . serialize(
$queryPart ) );
}
- if ( $offset < strlen( $term ) ) {
- $query[] = self::fixupQueryStringPart( substr( $term,
$offset ) );
- }
- wfProfileOut( __METHOD__ . '-find-phrase-queries-and-escape' );
+ wfProfileOut( __METHOD__ . '-escape' );
// Actual text query
if ( count( $query ) > 0 ) {
wfProfileIn( __METHOD__ . '-build-query' );
- $queryStringQueryString = self::fixupWholeQueryString(
implode( ' ', $query ) );
+ $queryStringQueryString = self::fixupWholeQueryString(
implode( ' ', $escapedQuery ) );
$fields = self::buildFullTextSearchFields(
$showRedirects );
$this->query = $this->buildSearchTextQuery( $fields,
$queryStringQueryString );
@@ -302,6 +309,41 @@
$result = $getWork->execute();
wfProfileOut( __METHOD__ );
return $result;
+ }
+
+ private static function replaceAllPartsOfQuery( $query, $regex,
$callable ) {
+ $result = array();
+ foreach ( $query as $queryPart ) {
+ if ( isset( $queryPart[ 'raw' ] ) ) {
+ $result = array_merge( $result,
self::replacePartsOfQuery( $queryPart[ 'raw' ], $regex, $callable ) );
+ continue;
+ }
+ $result[] = $queryPart;
+ }
+ return $result;
+ }
+
+ private static function replacePartsOfQuery( $queryPart, $regex,
$callable ) {
+ $destination = array();
+ $matches = array();
+ $offset = 0;
+ while ( preg_match( $regex, $queryPart, $matches,
PREG_OFFSET_CAPTURE, $offset ) ) {
+ $startOffset = $matches[ 0 ][ 1 ];
+ if ( $startOffset > $offset ) {
+ $destination[] = array( 'raw' => substr(
$queryPart, $offset, $startOffset - $offset ) );
+ }
+
+ $callableResult = call_user_func( $callable, $matches );
+ if ( $callableResult ) {
+ $destination[] = $callableResult;
+ }
+
+ $offset = $startOffset + strlen( $matches[ 0 ][ 0 ] );
+ }
+ if ( $offset < strlen( $queryPart ) ) {
+ $destination[] = array( 'raw' => substr( $queryPart,
$offset ) );
+ }
+ return $destination;
}
/**
@@ -455,6 +497,11 @@
);
}
+ public static function switchSearchToExact( $term, $showRedirects ) {
+ $exact = join( ' OR ',
CirrusSearchSearcher::buildFullTextSearchFields( $showRedirects, ".plain:$term"
) );
+ return "($exact)";
+ }
+
/**
* Build fields searched by full text search.
* @param $includeRedirects bool show redirects be included
--
To view, visit https://gerrit.wikimedia.org/r/94373
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: merged
Gerrit-Change-Id: I1e1a56616409e0ebcf84117287bd11087044bab5
Gerrit-PatchSet: 2
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Chad <[email protected]>
Gerrit-Reviewer: jenkins-bot
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits