EBernhardson has uploaded a new change for review. https://gerrit.wikimedia.org/r/235133
Change subject: Completion suggester : improve precision ...................................................................... Completion suggester : improve precision * Splits suggestions into title suggestions with similar redirects and redirect suggestions * With redirect suggestions a second pass query is required to fetch the text to display * Added more config options with SuggestProfiles * Added support for geo context suggestions * Improved precision with post-search re-scoring * Moved complex configuration profiles into 'profiles' folder NOTE: this is experimental and indexing strategy may change. Change-Id: I37953179d3f10036344fe16bf31da3fd04a7c075 (cherry picked from commit fa5e1385a41ff9ad72a7c72bf876bf7236f81b74) --- M CirrusSearch.php M includes/BuildDocument/SuggestBuilder.php M includes/BuildDocument/SuggestScoring.php M includes/ElasticsearchIntermediary.php M includes/Maintenance/SuggesterAnalysisConfigBuilder.php M includes/Maintenance/SuggesterMappingConfigBuilder.php M includes/Searcher.php M includes/Util.php M maintenance/updateSuggesterIndex.php A profiles/PhraseSuggesterProfiles.php A profiles/SuggestProfiles.php M tests/browser/features/suggest_api.feature M tests/browser/features/support/hooks.rb A tests/unit/SuggestBuilderTest.php M tests/unit/SuggestScoringTest.php M tests/unit/UtilTest.php 16 files changed, 1,000 insertions(+), 214 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch refs/changes/33/235133/1 diff --git a/CirrusSearch.php b/CirrusSearch.php index 14738e1..eb2aa2f 100644 --- a/CirrusSearch.php +++ b/CirrusSearch.php @@ -21,6 +21,9 @@ * http://www.gnu.org/copyleft/gpl.html */ +require_once __DIR__ . "/profiles/SuggestProfiles.php"; +require_once __DIR__ . 
"/profiles/PhraseSuggesterProfiles.php"; + $wgExtensionCredits['other'][] = array( 'path' => __FILE__, 'name' => 'CirrusSearch', @@ -265,117 +268,8 @@ // (This is the minimal value) $wgCirrusSearchPhraseSuggestPrefixLengthHardLimit = 2; -// Phrase suggester profiles (Did you mean) -$wgCirrusSearchPhraseSuggestProfiles = array( - // This is the default settings - 'default' => array( - // The suggest mode used by the phrase suggester - // can be : - // * missing: Only suggest terms in the suggest text that - // aren’t in the index. - // * popular: Only suggest suggestions that occur in more docs - // then the original suggest text term. - // * always: Suggest any matching suggestions based on terms - // in the suggest text. - 'mode' => 'always', - - // Confidence level required to suggest new phrases. - // See confidence on https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-phrase.html - 'confidence' => 2.0, - - // Maximum number of terms that we ask phrase suggest to correct. - // See max_errors on https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-phrase.html - 'max_errors' => 2, - - // the likelihood of a term being a misspelled even if the term exists in the dictionary. - 'real_word_error_likelihood' => 0.95, - - // The max term freq used by the phrase suggester. The maximum - // threshold in number of documents a suggest text token can - // exist in order to be included. Can be a relative percentage - // number (e.g 0.4) or an absolute // number to represent - // document frequencies. If an value higher than 1 is specified - // then fractional can not be specified. Defaults to 0.01f. If - // a term appears in more then half the docs then don't try to - // correct it. This really shouldn't kick in much because we're - // not looking for misspellings. We're looking for phrases that - // can be might off. Like "noble prize" -> "nobel prize". 
In - // any case, the default was 0.01 which way too frequently - // decided not to correct some terms. - 'max_term_freq' => 0.5, - - // The max doc freq (shard level) used by the phrase suggester - // The minimal threshold in number of documents a suggestion - // should appear in. This can be specified as an absolute - // number or as a relative percentage of number of documents. - // This can improve quality by only suggesting high frequency - // terms. Defaults to 0f and is not enabled. If a value higher - // than 1 is specified then the number cannot be fractional. The - // shard level document frequencies are used for this option. - // NOTE: this value is ignored if mode is "always" - 'min_doc_freq' => 0.0, - - // The prefix length used by the phrase suggester The number of - // minimal prefix characters that must match in order be a - // candidate suggestions. Defaults to 1. Increasing this number - // improves spellcheck performance. Usually misspellings don’t - // occur in the beginning of terms. - 'prefix_length' => 2, - - // Checks each suggestion against a specified query to prune - // suggestions for which no matching docs exist in the index. - 'collate' => false, - - // Controls the minimum_should_match option used by the collate - // query. - 'collate_minimum_should_match' => '3<66%', - - // Smoothing model See - // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-phrase.html - 'smoothing_model' => array( - 'stupid_backoff' => array( - 'discount' => 0.4 - ) - ), - ), - // The 'strict' settings will try to avoid displaying weird suggestions. 
- // (suited for small size wikis) - 'strict' => array( - 'mode' => 'always', - 'confidence' => 2.0, - 'max_errors' => 2, - 'real_word_error_likelihood' => 0.95, - 'max_term_freq' => 0.5, - 'min_doc_freq' => 0.0, - 'prefix_length' => 2, - 'collate' => true, - 'collate_minimum_should_match' => '3<66%', - 'smoothing_model' => array( - 'laplace' => array( - 'alpha' => 0.3 - ) - ) - ), - // Alternative settings, confidence set to 1 but with laplace smoothing - 'alternative' => array( - 'mode' => 'always', - 'confidence' => 1.0, - 'max_errors' => 2, - 'real_word_error_likelihood' => 0.95, - 'max_term_freq' => 0.5, - 'min_doc_freq' => 0.0, - 'prefix_length' => 2, - 'collate' => false, - 'collate_minimum_should_match' => '3<66%', - 'smoothing_model' => array( - 'laplace' => array( - 'alpha' => 0.3 - ) - ) - ) -); - // Set the Phrase suggester settings using the default profile. +// see profiles/PhraseSuggesterProfiles.php $wgCirrusSearchPhraseSuggestSettings = $wgCirrusSearchPhraseSuggestProfiles['default']; // Look for suggestions in the article text? Changing this from false to true will @@ -792,20 +686,20 @@ $wgCirrusSearchUserTesting = array(); /** - * Settings for completion suggestion options. - * See CirrusSearch\BuildDocument\SuggestBuilder and CirrusSearch\Searcher - * fields - set of suggestion fields to use - * fuzzy - fuzziness configuration (false for no fuzziness) - * See also: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-completion.html + * Profile for search as you type suggestion (completion suggestion) + * (see profiles/SuggestProfiles.php for more details.) 
+ * + * NOTE: This is an experimental API */ -$wgCirrusSearchCompletionSettings = array( - "fields" => array( - "suggest", "suggest-stop" - ), - "fuzzy" => array( - "fuzziness" => 2 - ), -); +$wgCirrusSearchCompletionSettings = $wgCirrusSearchCompletionProfiles['default']; + +/** + * Profile for geo context search as you type suggestion (completion suggestion) + * (see profiles/SuggestProfiles.php for more details.) + * + * NOTE: This is an experimental API + */ +$wgCirrusSearchCompletionGeoContextSettings = $wgCirrusSearchCompletionGeoContextProfiles['default']; $includes = __DIR__ . "/includes/"; $apiDir = $includes . 'Api/'; diff --git a/includes/BuildDocument/SuggestBuilder.php b/includes/BuildDocument/SuggestBuilder.php index 4188095..9570b2f 100644 --- a/includes/BuildDocument/SuggestBuilder.php +++ b/includes/BuildDocument/SuggestBuilder.php @@ -23,9 +23,23 @@ /** * Builder used to create suggester docs + * NOTE: Experimental */ class SuggestBuilder { + /** + * We limit the input to 50 chars + */ const MAX_INPUT_LENGTH = 50; + + /** + * The acceptable edit distance to group similar strings + */ + const GROUP_ACCEPTABLE_DISTANCE = 2; + + /** + * Discount suggestions based on redirects + */ + const REDIRECT_DISCOUNT = 0.1; /** * @var SuggestScoringMethod the scoring function @@ -33,10 +47,16 @@ private $scoringMethod; /** + * @var boolean builds geo contextualized suggestions + */ + private $withGeo; + + /** * @param SuggestScoringMethod $scoringMethod the scoring function to use */ - public function __construct( SuggestScoringMethod $scoringMethod ) { + public function __construct( SuggestScoringMethod $scoringMethod, $withGeo = true ) { $this->scoringMethod = $scoringMethod; + $this->withGeo = $withGeo; } /** @@ -45,39 +65,131 @@ * @return array a set of suggest documents */ public function build( $id, $inputDoc ) { + if( !isset( $inputDoc['title'] ) ) { + // Bad doc, nothing to do here. 
+ return array(); + } $score = $this->scoringMethod->score( $inputDoc ); - $inputs = $this->buildInputs( $inputDoc ); + + // We support only earth and the primary/first coordinates... + $location = $this->findPrimaryCoordinates( $inputDoc ); + + $suggestions = $this->extractTitleAndSimilarRedirects( $inputDoc ); + $docs[] = $this->buildTitleSuggestion( $id, $suggestions['group'], $location, $score ); + if ( !empty( $suggestions['candidates'] ) ) { + $docs[] = $this->buildRedirectsSuggestion( $id, $suggestions['candidates'], + $location, $score ); + } + return $docs; + } + + /** + * Inspects the 'coordinates' index and return the first coordinates flagged as 'primary' + * or the first coordinates if no primaries are found. + * @param array $inputDoc the input doc + * @return array with 'lat' and 'lon' or null + */ + public function findPrimaryCoordinates( $inputDoc ) { + if ( !isset( $inputDoc['coordinates'] ) || !is_array( $inputDoc['coordinates'] ) ) { + return null; + } + + $first = null; + foreach( $inputDoc['coordinates'] as $coord ) { + if ( isset( $coord['globe'] ) && $coord['globe'] == 'earth' && isset( $coord['coord'] ) ) { + if ( $first === null ) { + $first = $coord['coord']; + } + if ( isset( $coord['primary'] ) && $coord['primary'] ) { + return $coord['coord']; + } + } + } + return $first; + } + + /** + * Builds the 'title' suggestion. + * The output is encoded as pageId:t:Title. + * NOTE: the client will be able to display Title encoded in the output when searching. 
+ * + * @param int $id the page id + * @param array $title the title in 'text' and an array of similar redirects in 'variants' + * @param array $location the geo coordinates or null if unavailable + * @param int $score the weight of the suggestion + * @return array the suggestion document + */ + private function buildTitleSuggestion( $id, $title, $location, $score ) { + $inputs = array( $this->prepareInput( $title['text'] ) ); + foreach ( $title['variants'] as $variant ) { + $inputs[] = $this->prepareInput( $variant ); + } + $output = $id . ":t:" . $title['text']; + return $this->buildSuggestion( $output, $inputs, $location, $score ); + } + + /** + * Builds the 'redirects' suggestion. + * The output is encoded as pageId:r + * The score will be discounted by the REDIRECT_DISCOUNT factor. + * NOTE: the client will have to fetch the doc redirects when searching + * and choose the best one to display. This is because we are unable + * to make this decision at index time. + * + * @param int $id the page id + * @param array of string $redirects the redirects + * @param array $location the geo coordinates or null if unavailable + * @param int $score the weight of the suggestion + * @return array the suggestion document + */ + private function buildRedirectsSuggestion( $id, $redirects, $location, $score ) { + $inputs = array(); + foreach ( $redirects as $redirect ) { + $inputs[] = $this->prepareInput( $redirect ); + } + $output = $id . ":r"; + $score = (int) ( $score * self::REDIRECT_DISCOUNT ); + return $this->buildSuggestion( $output, $inputs, $location, $score ); + } + + /** + * Builds a suggestion document. 
+ * + * @param string $output the suggestion output + * @param string $inputs the suggestion inputs + * @param array $location the geo coordinates or null if unavailable + * @param int $score the weight of the suggestion + * @return array a doc ready to be indexed in the completion suggester + */ + private function buildSuggestion( $output, $inputs, $location, $score ) { $doc = array( 'suggest' => array ( 'input' => $inputs, - 'output' => $id, + 'output' => $output, 'weight' => $score ), 'suggest-stop' => array ( 'input' => $inputs, - 'output' => $id, + 'output' => $output, 'weight' => $score ) ); - // We support only earth and we take the first coordinate only... - if ( isset ( $inputDoc['coordinates'][0]['globe'] ) && $inputDoc['coordinates'][0]['globe'] === 'earth' ) { - $location = array( 'location' => $inputDoc['coordinates'][0]['coord'] ); - + if ( $this->withGeo && $location !== null ) { $doc['suggest-geo'] = array( 'input' => $inputs, - 'output' => $id, + 'output' => $output, 'weight' => $score, - 'context' => $location + 'context' => array( 'location' => $location ) ); $doc['suggest-stop-geo'] = array( 'input' => $inputs, - 'output' => $id, + 'output' => $output, 'weight' => $score, - 'context' => $location + 'context' => array( 'location' => $location ) ); } - return array( $doc ); + return $doc; } /** @@ -85,10 +197,10 @@ * @return array list of prepared suggestions that should * resolve to the document. */ - public function buildInputs( array $input ) { - $inputs = array( $this->prepareInput( $input['title'] ) ); - foreach ( $input['redirect'] as $redir ) { - $inputs[] = $this->prepareInput( $redir['title'] ); + public function buildInputs( $input ) { + $inputs = array( $this->prepareInput( $input['text'] ) ); + foreach ( $input['variants'] as $variant ) { + $inputs[] = $this->prepareInput( $variant ); } return $inputs; } @@ -104,4 +216,91 @@ } return $input; } + + /** + * Extracts title with redirects that are very close. 
+ * It allows making one suggestion with title as the + * output and title + similar redirects as the inputs. + * It can be useful to avoid displaying redirects created to + * handle typos. + * + * e.g. : + * title: Giraffe + * redirects: Girafe, Girraffe, Mating Giraffes + * will output + * - 'group' : { 'text': 'Giraffe', 'variants': ['Girafe', 'Girraffe'] } + * - 'candidates' : ['Mating Giraffes'] + * + * It would be nice to do this for redirects but we have no way to decide + * which redirect is a typo and this technique would simply take the first + * redirect in the list. + * + * @return array mixed 'group' key contains the group with the + * lead and its variants and 'candidates' contains the remaining + * candidates that were not close enough to the title. + */ + public function extractTitleAndSimilarRedirects( $doc ) { + $redirects = array(); + if ( isset( $doc['redirect'] ) ) { + foreach( $doc['redirect'] as $redir ) { + $redirects[] = $redir['title']; + } + } + return $this->extractSimilars( $doc['title'], $redirects, true ); + } + + /** + * Extracts from $candidates the values that are "similar" to $groupHead + * + * @param string $groupHead string the group "head" + * @param array $candidates array of string the candidates + * @param boolean $checkVariants if the candidate does not match the groupHead try to match a variant + * @return array 'group' key contains the group with the + * head and its variants and 'candidates' contains the remaining + * candidates that were not close enough to $groupHead.
+ */ + private function extractSimilars( $groupHead, $candidates, $checkVariants = false ) { + $group = array( + 'text' => $groupHead, + 'variants' => array() + ); + $newCandidates = array(); + foreach( $candidates as $c ) { + $distance = $this->distance( $groupHead, $c ); + if( $distance > self::GROUP_ACCEPTABLE_DISTANCE && $checkVariants ) { + // Run a second pass over the variants + foreach ( $group['variants'] as $v ) { + $distance = $this->distance( $v, $c ); + if ( $distance <= self::GROUP_ACCEPTABLE_DISTANCE ) { + break; + } + } + } + if ( $distance <= self::GROUP_ACCEPTABLE_DISTANCE ) { + $group['variants'][] = $c; + } else { + $newCandidates[] = $c; + } + } + + return array( + 'group' => $group, + 'candidates' => $newCandidates + ); + } + + /** + * Computes the edit distance between $a and $b. + * @param string $a + * @param string $b + * @return integer the edit distance between a and b + */ + private function distance( $a, $b ) { + $a = $this->prepareInput( $a ); + $b = $this->prepareInput( $b ); + $a = mb_strtolower( $a ); + $b = mb_strtolower( $b ); + + return levenshtein( $a, $b ); + } } diff --git a/includes/BuildDocument/SuggestScoring.php b/includes/BuildDocument/SuggestScoring.php index b025b72..fb9c4df 100644 --- a/includes/BuildDocument/SuggestScoring.php +++ b/includes/BuildDocument/SuggestScoring.php @@ -72,7 +72,20 @@ } /** - * Score that tries to reflect the quality of a page + * Score that tries to reflect the quality of a page. + * NOTE: Experimental + * + * This score makes the assumption that bigger is better. + * + * Small cities/villages that have a high number of incoming links because they + * link to each other ( see https://en.wikipedia.org/wiki/Villefort,_Loz%C3%A8re ) + * will be discounted correctly because the other variables are very low. + * + * On the other hand some pages like List will sometimes get a very high but unjustified + * score.
+ * + * The boost templates feature might help but it's a System message that is not necessarily + * configured by wiki admins. */ class QualityScore implements SuggestScoringMethod { // TODO: move these constants into a cirrus profile @@ -90,7 +103,7 @@ const REDIRECT_WEIGHT = 0.1; // The final score will be in the range [0, SCORE_RANGE] - const SCORE_RANGE = 100000; + const SCORE_RANGE = 10000000; /** * Template boosts configured by the mediawiki admin. diff --git a/includes/ElasticsearchIntermediary.php b/includes/ElasticsearchIntermediary.php index 2066df4..3512fac 100644 --- a/includes/ElasticsearchIntermediary.php +++ b/includes/ElasticsearchIntermediary.php @@ -50,7 +50,7 @@ /** * @var array map of search request stats to log about the current search request */ - private $logContext = array(); + protected $logContext = array(); /** * @var int how many millis a request through this intermediary needs to take before it counts as slow. diff --git a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php index 9f632c9..eaf9ecc 100644 --- a/includes/Maintenance/SuggesterAnalysisConfigBuilder.php +++ b/includes/Maintenance/SuggesterAnalysisConfigBuilder.php @@ -56,15 +56,15 @@ ), "asciifolding_preserve" => array( "type" => "asciifolding", - "preserve_original" => "true", + "preserve_original" => "false", ), "icu_normalizer" => array( "type" => "icu_normalizer", "name" => "nfkc_cf" ), - "50_token_limit" => array( + "token_limit" => array( "type" => "limit", - "max_token_count" => "50" + "max_token_count" => "20" ) ), 'analyzer' => array( @@ -75,21 +75,18 @@ "lowercase", "stop_filter", "asciifolding_preserve", - "50_token_limit" + "token_limit" ), "tokenizer" => "standard" ), - // We do not use ascii_folding when searching - // Using ascii folding when searching will increase recall - // but could be annoying for the user who makes effort to write - // diacritics. 
"stop_analyzer_search" => array( "type" => "custom", "filter" => array( "standard", "lowercase", "stop_filter_search", - "50_token_limit" + "asciifolding_preserve", + "token_limit" ), "tokenizer" => "standard" ), @@ -99,7 +96,7 @@ "standard", "icu_normalizer", "asciifolding_preserve", - "50_token_limit" + "token_limit" ), "tokenizer" => "standard" ), @@ -108,7 +105,8 @@ "filter" => array( "standard", "icu_normalizer", - "50_token_limit" + "asciifolding_preserve", + "token_limit" ), "tokenizer" => "standard" ) diff --git a/includes/Maintenance/SuggesterMappingConfigBuilder.php b/includes/Maintenance/SuggesterMappingConfigBuilder.php index 880a998..8303693 100644 --- a/includes/Maintenance/SuggesterMappingConfigBuilder.php +++ b/includes/Maintenance/SuggesterMappingConfigBuilder.php @@ -37,7 +37,7 @@ $geoContext = array( 'location' => array( 'type' => 'geo', - 'precision' => array('1km', '10km', '100km'), + 'precision' => array( 6, 4, 3 ), // ~ 1km, 10km, 100km 'neighbors' => true, ) ); diff --git a/includes/Searcher.php b/includes/Searcher.php index 59269dc..a95590e 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -777,8 +777,10 @@ /** * Produce a set of completion suggestions for text using _suggest * See https://www.elastic.co/guide/en/elasticsearch/reference/1.6/search-suggesters-completion.html + * + * WARNING: experimental API + * * @param string $text Search term - * @param array $context Context, see https://www.elastic.co/guide/en/elasticsearch/reference/current/suggester-context.html * @return Status */ public function suggest( $text, $context = null ) { @@ -787,26 +789,32 @@ $this->term = $text; $suggest = array( 'text' => $text ); - foreach ( $wgCirrusSearchCompletionSettings[ 'fields' ] as $field ) { - $suggest[$field] = array( + $queryLen = mb_strlen( $text ); + $profile = $wgCirrusSearchCompletionSettings; + + if ( $context != null && isset( $context['geo']['lat'] ) && isset( $context['geo']['lon'] ) + && is_numeric( 
$context['geo']['lat'] ) && is_numeric( $context['geo']['lon'] ) + ) { + $profile = $this->prepareGeoContextSuggestProfile( $context ); + $description = "geo suggest query for {query}"; + } + + foreach ( $profile as $name => $config ) { + if ( $config['min_query_len'] > $queryLen ) { + continue; + } + $field = $config['field']; + $suggest[$name] = array( 'completion' => array( 'field' => $field, + 'size' => $this->limit * $config['fetch_limit_factor'] ) ); - if ( $context ) { - $suggest[$field]['completion']['context'] = $context; + if ( isset( $config['fuzzy'] ) ) { + $suggest[$name]['completion']['fuzzy'] = $config['fuzzy']; } - - if( is_array( $wgCirrusSearchCompletionSettings[ 'fuzzy' ] ) ) { - $suggest[$field."-fuzzy"] = array( - 'completion' => array( - 'field' => $field, - 'fuzzy' => $wgCirrusSearchCompletionSettings[ 'fuzzy' ], - ) - ); - if ( $context ) { - $suggest[$field."-fuzzy"]['completion']['context'] = $context; - } + if ( isset( $config['context'] ) ) { + $suggest[$name]['completion']['context'] = $config['context']; } } @@ -815,86 +823,196 @@ Connection::setTimeout( $wgCirrusSearchSearchShardTimeout[ 'default' ] ); $index = Connection::getIndex( $this->indexBaseName, Connection::TITLE_SUGGEST_TYPE ); - $description = "completion suggest query for {query}"; $logContext = array( 'query' => $text, + 'queryType' => 'comp_suggest' ); $searcher = $this; + $limit = $this->limit; $result = Util::doPoolCounterWork( 'CirrusSearch-Search', $this->user, - function() use( $searcher, $index, $description, $suggest, $logContext, $queryOptions ) { + function() use( $searcher, $index, $suggest, $logContext, $queryOptions, + $profile, $text , $limit ) { + $description = "{queryType} search for '{query}'"; $searcher->start( $description, $logContext ); try { - return $index->request( "_suggest", Request::POST, $suggest, $queryOptions ); + $result = $index->request( "_suggest", Request::POST, $suggest, $queryOptions ); + if( $result->isOk() ) { + $result = 
$searcher->postProcessSuggest( $text, $result, + $profile, $limit ); + return $searcher->success( $result ); + } + return $result; } catch ( \Elastica\Exception\ExceptionInterface $e ) { return $searcher->failure( $e ); } } ); - if( $result->isOk() ) { - $result = $this->postProcessSuggest( $result, $this->limit ); - return $this->success( $result ); - } return $result; + } + + /** + * prepare the list of suggest requests used for geo context suggestions + * This method will merge $wgCirrusSearchCompletionSettings and + * $wgCirrusSearchCompletionGeoContextSettings + * @param array $context user's geo context + * @return array of suggest request profiles + */ + private function prepareGeoContextSuggestProfile( $context ) { + global $wgCirrusSearchCompletionSettings, + $wgCirrusSearchCompletionGeoContextSettings; + $profiles = array(); + foreach ( $wgCirrusSearchCompletionGeoContextSettings as $geoname => $geoprof ) { + foreach ( $wgCirrusSearchCompletionSettings as $sugname => $sugprof ) { + if ( !in_array( $sugname, $geoprof['with'] ) ) { + continue; + } + $profile = $sugprof; + $profile['field'] .= $geoprof['field_suffix']; + $profile['discount'] *= $geoprof['discount']; + $profile['context'] = array( + 'location' => array( + 'lat' => $context['geo']['lat'], + 'lon' => $context['geo']['lon'], + 'precision' => $geoprof['precision'] + ) + ); + $profiles["$sugname-$geoname"] = $profile; + } + } + return $profiles; } /** * merge top level multi-queries and resolve returned pageIds into Title objects. 
* + * WARNING: experimental API + * + * @param string $query the user query * @param \Elastica\Response $response Response from elasticsearch _suggest api + * @param array $profile the suggestion profile * @param int $limit Maximum suggestions to return, -1 for unlimited * @return Title[] List of suggested titles */ - protected function postProcessSuggest( \Elastica\Response $response, $limit = -1 ) { + protected function postProcessSuggest( $query, \Elastica\Response $response, $profile, $limit = -1 ) { $data = $response->getData(); unset( $data['_shards'] ); $suggestions = array(); foreach ( $data as $name => $results ) { - foreach ( $results as $suggested ) { + $discount = $profile[$name]['discount']; + foreach ( $results as $suggested ) { foreach ( $suggested['options'] as $suggest ) { - $pageId = $suggest['text']; + $output = explode( ':', $suggest['text'], 3 ); + if ( sizeof ( $output ) < 2 ) { + // Ignore broken output + continue; + } + $pageId = $output[0]; + $type = $output[1]; + + $score = $discount * $suggest['score']; if ( !isset( $suggestions[$pageId] ) || - $suggest['score'] > $suggestions[$pageId]['score'] + $score > $suggestions[$pageId]['score'] ) { - $suggestions[$pageId] = $suggest; + $suggestion = array( + 'score' => $score, + 'pageId' => $pageId + ); + // If it's a title suggestion we have the text + if ( $type === 't' && sizeof( $output ) == 3 ) { + $suggestion['text'] = $output[2]; + } + $suggestions[$pageId] = $suggestion; } } } } // simply sort by existing scores - usort( $suggestions, function ( $a, $b ) { + uasort( $suggestions, function ( $a, $b ) { return $b['score'] - $a['score']; } ); + $this->logContext['hitsTotal'] = count( $suggestions ); + if ( $limit > 0 ) { - $suggestions = array_slice( $suggestions, 0, $limit ); + $suggestions = array_slice( $suggestions, 0, $limit, true ); } - // suggest currently returns page ids, we need to resolve those now - $pageIds = array(); - foreach ( $suggestions as $suggestion ) { - $pageIds[] = 
$suggestion['text']; + $this->logContext['hitsReturned'] = count( $suggestions ); + $this->logContext['hitsOffset'] = 0; + + // we must fetch redirect data for redirect suggestions + $missingText = array(); + foreach ( $suggestions as $id => $suggestion ) { + if ( !isset( $suggestion['text'] ) ) { + $missingText[] = $id; + } } - // doesn't guarantee to maintain order - $unsortedTitles = Title::newFromIDs( $pageIds ); - $byId = array(); - foreach ( $unsortedTitles as $title ) { - $byId[$title->getArticleID()] = $title; + if ( !empty ( $missingText ) ) { + // Experimental. + // + // Second pass query to fetch redirects. + // It's not clear if it's the best option, this will slowdown the whole query + // when we hit a redirect suggestion. + // Other option would be to encode redirects as a payload resulting in a + // very big index... + + // XXX: we support only the content index + $type = Connection::getPageType( $this->indexBaseName, Connection::CONTENT_INDEX_TYPE ); + // NOTE: we are already in a poolCounterWork + // Multi get is not supported by elastica + $redirResponse = null; + try { + $redirResponse = $type->request( '_mget', 'GET', + array( 'ids' => $missingText ), + array( '_source_include' => 'redirect' ) ); + if ( $redirResponse->isOk() ) { + $docs = $redirResponse->getData(); + $docs = $docs['docs']; + foreach ( $docs as $doc ) { + $id = $doc['_id']; + if ( !isset( $doc['_source']['redirect'] ) + || empty( $doc['_source']['redirect'] ) + ) { + continue; + } + $text = Util::chooseBestRedirect( $query, $doc['_source']['redirect'] ); + $suggestions[$id]['text'] = $text; + } + } else { + LoggerFactory::getInstance( 'CirrusSearch' )->warning( + 'Unable to fetch redirects for suggestion {query} with results {ids} : {error}', + array( 'query' => $query, + 'ids' => serialize( $missingText ), + 'error' => $redirResponse->getError() ) ); + } + } catch ( \Elastica\Exception\ExceptionInterface $e ) { + LoggerFactory::getInstance( 'CirrusSearch' )->warning( + 'Unable 
to fetch redirects for suggestion {query} with results {ids} : {error}', + array( 'query' => $query, + 'ids' => serialize( $missingText ), + 'error' => $this->extractMessage( $e ) ) ); + } } $retval = array(); foreach ( $suggestions as $suggestion ) { - $pageId = $suggestion['text']; - if ( isset( $byId[$pageId] ) ) { - $retval[] = array( - 'title' => (string)$byId[$pageId], - 'score' => $suggestion['score'], - ); + if ( !isset( $suggestion['text'] ) ) { + // We were unable to find a text to display + // Maybe a page with redirects when we built the suggester index + // but now without redirects? + continue; } + $retval[] = array( + // XXX: we run the suggester for namespace 0 for now + 'title' => Title::makeTitle( 0, $suggestion['text'] ), + 'pageId' => $suggestion['pageId'], + 'score' => $suggestion['score'], + ); } return $retval; diff --git a/includes/Util.php b/includes/Util.php index 08f1872..5550609 100644 --- a/includes/Util.php +++ b/includes/Util.php @@ -338,4 +338,35 @@ $lines = array_filter( $lines ); // Remove empty lines return $lines; } + + /** + * Tries to identify the best redirect by finding the link with the + * smallest edit distance between the title and the user query. 
+ * @param $userQuery string the user query + * @param $redirects array the list of redirects + * @return string the best redirect text + */ + public static function chooseBestRedirect( $userQuery, $redirects ) { + $userQuery = mb_strtolower( $userQuery ); + $len = mb_strlen( $userQuery ); + $bestDistance = INF; + $best = null; + + foreach( $redirects as $redir ) { + $text = $redir['title']; + if ( mb_strlen( $text ) > $len ) { + $text = mb_substr( $text, 0, $len ); + } + $text = mb_strtolower( $text ); + $distance = levenshtein( $text, $userQuery ); + if ( $distance == 0 ) { + return $redir['title']; + } + if ( $distance < $bestDistance ) { + $bestDistance = $distance; + $best = $redir['title']; + } + } + return $best; + } } diff --git a/maintenance/updateSuggesterIndex.php b/maintenance/updateSuggesterIndex.php index 4df6f0d..106cbd2 100644 --- a/maintenance/updateSuggesterIndex.php +++ b/maintenance/updateSuggesterIndex.php @@ -80,6 +80,12 @@ */ private $availablePlugins; + + /** + * @var boolean index geo contextualized suggestions + */ + private $withGeo; + public function __construct() { parent::__construct(); $this->addDescription( "Create a new suggester index." ); @@ -94,6 +100,7 @@ 'of moving a shard this can time out. This will retry the attempt after some backoff ' . 'rather than failing the whole reindex process. Defaults to 5.', false, true ); $this->addOption( 'optimize', 'Optimize the index to 1 segment. Defaults to false.', false, false ); + $this->addOption( 'with-geo', 'Build geo contextualized suggestions. Defaults to false.', false, false ); $this->addOption( 'scoringMethod', 'The scoring method to use when computing suggestion weights. ' . 
'Detauls to quality.', false, true ); } @@ -116,6 +123,7 @@ $this->indexRetryAttempts = $this->getOption( 'reindexRetryAttempts', 5 ); $this->optimizeIndex = $this->getOption( 'optimize', false ); + $this->withGeo = $this->getOption( 'with-geo', false ); $utils = new ConfigUtils( $this->getClient(), $this); @@ -212,7 +220,7 @@ $scoreMethodName = $this->getOption( 'scoringMethod', 'quality' ); $this->scoreMethod = SuggestScoringMethodFactory::getScoringMethod( $scoreMethodName, $totalDocsInIndex ); - $builder = new SuggestBuilder( $this->scoreMethod ); + $builder = new SuggestBuilder( $this->scoreMethod, $this->withGeo ); $docsDumped = 0; $this->output( "Indexing $totalDocsToDump documents ($totalDocsInIndex in the index)\n" ); diff --git a/profiles/PhraseSuggesterProfiles.php b/profiles/PhraseSuggesterProfiles.php new file mode 100644 index 0000000..885e900 --- /dev/null +++ b/profiles/PhraseSuggesterProfiles.php @@ -0,0 +1,131 @@ +<?php + +/** + * CirrusSearch - List of profiles for "Did you mean" suggestions + * + * Set $wgSearchType to 'CirrusSearch' + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + */ + +$wgCirrusSearchPhraseSuggestProfiles = array( + // These are the default settings + 'default' => array( + // The suggest mode used by the phrase suggester + // can be : + // * missing: Only suggest terms in the suggest text that + // aren’t in the index. + // * popular: Only suggest suggestions that occur in more docs + // than the original suggest text term. + // * always: Suggest any matching suggestions based on terms + // in the suggest text. + 'mode' => 'always', + + // Confidence level required to suggest new phrases. + // See confidence on https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-phrase.html + 'confidence' => 2.0, + + // Maximum number of terms that we ask phrase suggest to correct. + // See max_errors on https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-phrase.html + 'max_errors' => 2, + + // The likelihood of a term being misspelled even if the term exists in the dictionary. + 'real_word_error_likelihood' => 0.95, + + // The max term freq used by the phrase suggester. The maximum + // threshold in number of documents a suggest text token can + // exist in order to be included. Can be a relative percentage + // number (e.g. 0.4) or an absolute number to represent + // document frequencies. If a value higher than 1 is specified + // then it cannot be fractional. Defaults to 0.01f. If + // a term appears in more than half the docs then don't try to + // correct it. This really shouldn't kick in much because we're + // not looking for misspellings. We're looking for phrases that + // might be slightly off. Like "noble prize" -> "nobel prize". In + // any case, the default was 0.01 which way too frequently + // decided not to correct some terms. + 'max_term_freq' => 0.5, + + // The min doc freq (shard level) used by the phrase suggester. + // The minimal threshold in number of documents a suggestion + // should appear in.
This can be specified as an absolute + // number or as a relative percentage of the number of documents. + // This can improve quality by only suggesting high frequency + // terms. Defaults to 0f and is not enabled. If a value higher + // than 1 is specified then the number cannot be fractional. The + // shard level document frequencies are used for this option. + // NOTE: this value is ignored if mode is "always" + 'min_doc_freq' => 0.0, + + // The prefix length used by the phrase suggester. The number of + // minimal prefix characters that must match in order to be a + // candidate suggestion. Defaults to 1. Increasing this number + // improves spellcheck performance. Usually misspellings don’t + // occur in the beginning of terms. + 'prefix_length' => 2, + + // Checks each suggestion against a specified query to prune + // suggestions for which no matching docs exist in the index. + 'collate' => false, + + // Controls the minimum_should_match option used by the collate + // query. + 'collate_minimum_should_match' => '3<66%', + + // Smoothing model. See + // https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-phrase.html + 'smoothing_model' => array( + 'stupid_backoff' => array( + 'discount' => 0.4 + ) + ), + ), + // The 'strict' settings will try to avoid displaying weird suggestions.
+ // (suited for small size wikis) + 'strict' => array( + 'mode' => 'always', + 'confidence' => 2.0, + 'max_errors' => 2, + 'real_word_error_likelihood' => 0.95, + 'max_term_freq' => 0.5, + 'min_doc_freq' => 0.0, + 'prefix_length' => 2, + 'collate' => true, + 'collate_minimum_should_match' => '3<66%', + 'smoothing_model' => array( + 'laplace' => array( + 'alpha' => 0.3 + ) + ) + ), + // Alternative settings, confidence set to 1 but with laplace smoothing + 'alternative' => array( + 'mode' => 'always', + 'confidence' => 1.0, + 'max_errors' => 2, + 'real_word_error_likelihood' => 0.95, + 'max_term_freq' => 0.5, + 'min_doc_freq' => 0.0, + 'prefix_length' => 2, + 'collate' => false, + 'collate_minimum_should_match' => '3<66%', + 'smoothing_model' => array( + 'laplace' => array( + 'alpha' => 0.3 + ) + ) + ) +); diff --git a/profiles/SuggestProfiles.php b/profiles/SuggestProfiles.php new file mode 100644 index 0000000..94b94c4 --- /dev/null +++ b/profiles/SuggestProfiles.php @@ -0,0 +1,107 @@ +<?php + +/** + * CirrusSearch - List of profiles for search as you type suggestions + * (Completion suggester) + * + * Set $wgSearchType to 'CirrusSearch' + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + */ + +/** + * + * See CirrusSearch\BuildDocument\SuggestBuilder and CirrusSearch\Searcher + * See also: https://www.elastic.co/guide/en/elasticsearch/reference/current/search-suggesters-completion.html + */ +$wgCirrusSearchCompletionProfiles = array( + // Default profile + 'default' => array( + // key is the name of the suggestion request + 'plain' => array( + // Field to request + 'field' => 'suggest', + // Fire the request only if the user query has min_query_len chars + 'min_query_len' => 0, + // Discount result scores for this request + // Useful to discount fuzzy request results + 'discount' => 1.0, + // Fetch more result than the limit + // It's possible to have the same page multiple times. + // Requesting more than the limit helps to display the correct number + // of suggestions + 'fetch_limit_factor' => 2, + ), + 'plain_stop' => array( + 'field' => 'suggest-stop', + 'min_query_len' => 0, + 'discount' => 0.1, + 'fetch_limit_factor' => 2, + ), + 'plain_fuzzy' => array( + 'field' => 'suggest', + 'min_query_len' => 3, + 'discount' => 0.005, + 'fetch_limit_factor' => 2, + 'fuzzy' => array( + 'fuzzyness' => 'AUTO', + 'prefix_length' => 0, + 'unicode_aware' => true, + ) + ), + 'plain_stop_fuzzy' => array( + 'field' => 'suggest-stop', + 'min_query_len' => 3, + 'discount' => 0.001, + 'fetch_limit_factor' => 2, + 'fuzzy' => array( + 'fuzzyness' => 'AUTO', + 'prefix_length' => 0, + 'unicode_aware' => true, + ) + ) + ) +); + +/** + * List of profiles for geo context suggestions + */ +$wgCirrusSearchCompletionGeoContextProfiles = array( + 'default' => array( + 'geo-1km' => array( + 'field_suffix' => '-geo', + // Discount applied to the score, this value will be multiplied + // to the discount from $wgCirrusSearchCompletionProfiles + 'discount' => 1.0, + 'precision' => 6, + // List of requests to run with this precision + // must be a valid name from the active $wgCirrusSearchCompletionProfiles + 'with' => array( 'plain', 
'plain_stop', 'plain_fuzzy', 'plain_stop_fuzzy' ) + ), + 'geo-10km' => array( + 'field_suffix' => '-geo', + 'discount' => 0.5, + 'precision' => 4, + 'with' => array( 'plain', 'plain_stop', 'plain_fuzzy' ) + ), + 'geo-100km' => array( + 'field_suffix' => '-geo', + 'discount' => 0.2, + 'precision' => 3, + 'with' => array( 'plain', 'plain_stop' ) + ) + ) +); diff --git a/tests/browser/features/suggest_api.feature b/tests/browser/features/suggest_api.feature index c4b203f..7390a8d 100644 --- a/tests/browser/features/suggest_api.feature +++ b/tests/browser/features/suggest_api.feature @@ -25,10 +25,32 @@ Then the API should produce empty list Scenario: Ordering - When I ask suggestion API for x-m - Then the API should produce list starting with X-Men + When I ask suggestion API for x-m + Then the API should produce list starting with X-Men + + Scenario: Fuzzy + When I ask suggestion API for xmen + Then the API should produce list starting with X-Men + + Scenario Outline: Search redirects shows the best redirect + When I ask suggestion API for <term> + Then the API should produce list containing <suggested> + Examples: + | term | suggested | + | eise | Eisenhardt, Max | + | max | Max Eisenhardt | + | magnetu | Magneto | + + Scenario Outline: Search prefers exact match over + When I ask suggestion API for <term> + Then the API should produce list starting with <suggested> + Examples: + | term | suggested | + | max | Max Eisenhardt | + | mai | Main Page | + | eis | Eisenhardt, Max | Scenario: Ordering & limit - When I ask suggestion API at most 1 item for x-m - Then the API should produce list starting with X-Men - And the API should produce list of length 1 \ No newline at end of file + When I ask suggestion API at most 1 item for x-m + Then the API should produce list starting with X-Men + And the API should produce list of length 1 diff --git a/tests/browser/features/support/hooks.rb b/tests/browser/features/support/hooks.rb index 796bff9..33f087f 100644 --- 
a/tests/browser/features/support/hooks.rb +++ b/tests/browser/features/support/hooks.rb @@ -617,6 +617,10 @@ Given a page named X-Men exists with contents The X-Men are a fictional team of superheroes And a page named Xavier, Charles exists with contents Professor Charles Francis Xavier (also known as Professor X) is the founder of [[X-Men]] And a page named X-Force exists with contents X-Force is a fictional team of of [[X-Men]] + And a page named Magneto exists with contents Magneto is a fictional character appearing in American comic books + And a page named Max Eisenhardt exists with contents #REDIRECT [[Magneto]] + And a page named Eisenhardt, Max exists with contents #REDIRECT [[Magneto]] + And a page named Magnetu exists with contents #REDIRECT [[Magneto]] And I reindex suggestions ) suggest = true diff --git a/tests/unit/SuggestBuilderTest.php b/tests/unit/SuggestBuilderTest.php new file mode 100644 index 0000000..e361631 --- /dev/null +++ b/tests/unit/SuggestBuilderTest.php @@ -0,0 +1,245 @@ +<?php + +namespace CirrusSearch; + +use CirrusSearch\BuildDocument\SuggestBuilder; +use CirrusSearch\BuildDocument\SuggestScoringMethodFactory; + +/** + * test suggest builder. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License along + * with this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. 
+ * http://www.gnu.org/copyleft/gpl.html + */ +class SuggestBuilderTest extends \MediaWikiTestCase { + public function testEinstein() { + $builder = new SuggestBuilder( SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks', 1 ) ); + $score = 10; + $redirScore = (int) ( $score * SuggestBuilder::REDIRECT_DISCOUNT ); + $doc = array( + 'title' => 'Albert Einstein', + 'redirect' => array( + array( 'title' => "Albert Enstein", 'namespace' => 0 ), + array( 'title' => "Albert Einsten", 'namespace' => 0 ), + array( 'title' => 'Albert Einstine', 'namespace' => 0 ), + array( 'title' => "Enstein", 'namespace' => 0 ), + array( 'title' => "Einstein", 'namespace' => 0 ), + ), + 'incoming_links' => $score + ); + $expected = array( + array( + 'suggest' => array( + 'input' => array( 'Albert Einstein', 'Albert Enstein', + 'Albert Einsten', 'Albert Einstine' ), + 'output' => '1:t:Albert Einstein', + 'weight' => $score + ), + 'suggest-stop' => array( + 'input' => array( 'Albert Einstein', 'Albert Enstein', + 'Albert Einsten', 'Albert Einstine' ), + 'output' => '1:t:Albert Einstein', + 'weight' => $score + ) + ), + array( + 'suggest' => array( + 'input' => array( 'Enstein', 'Einstein' ), + 'output' => '1:r', + 'weight' => $redirScore + ), + 'suggest-stop' => array( + 'input' => array( 'Enstein', 'Einstein' ), + 'output' => '1:r', + 'weight' => $redirScore + ) + ) + ); + + $suggestions = $builder->build( 1, $doc ); + $this->assertSame( $expected, $suggestions ); + } + + public function testUlm() { + $builder = new SuggestBuilder( SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks', 1 ) ); + $score = 10; + $redirScore = (int) ( $score * SuggestBuilder::REDIRECT_DISCOUNT ); + $doc = array( + 'title' => 'Ulm', + 'redirect' => array( + array( 'title' => 'UN/LOCODE:DEULM', 'namespace' => 0 ), + array( 'title'=> 'Ulm, Germany', 'namespace' => 0 ), + array( 'title' => "Ulm displaced persons camp", 'namespace' => 0 ), + array( 'title' => "Söflingen", 'namespace' => 0 ), + ), 
+ 'coordinates' => array( + array( + 'coord' => array( + 'lat' => 48.3985, + 'lon' => 9.9918 + ), + 'region' => "BW", + 'dim' => 10000, + 'name' => "", + 'primary' => true, + 'type' => "city", + 'globe' => "earth", + 'country' => "DE" + ) + ), + 'incoming_links' => $score + ); + + $expected = array( + array( + 'suggest' => array( + 'input' => array( 'Ulm' ), + 'output' => '1:t:Ulm', + 'weight' => $score + ), + 'suggest-stop' => array( + 'input' => array( 'Ulm' ), + 'output' => '1:t:Ulm', + 'weight' => $score + ), + 'suggest-geo' => array( + 'input' => array( 'Ulm' ), + 'output' => '1:t:Ulm', + 'weight' => $score, + 'context' => array( + 'location' => array( + 'lat' => 48.3985, + 'lon' => 9.9918 + ) + ) + ), + 'suggest-stop-geo' => array( + 'input' => array( 'Ulm' ), + 'output' => '1:t:Ulm', + 'weight' => $score, + 'context' => array( + 'location' => array( + 'lat' => 48.3985, + 'lon' => 9.9918 + ) + ) + ) + ), + array( + 'suggest' => array( + 'input' => array( 'UN/LOCODE:DEULM', 'Ulm, Germany', + 'Ulm displaced persons camp', 'Söflingen' ), + 'output' => '1:r', + 'weight' => $redirScore + ), + 'suggest-stop' => array( + 'input' => array( 'UN/LOCODE:DEULM', 'Ulm, Germany', + 'Ulm displaced persons camp', 'Söflingen' ), + 'output' => '1:r', + 'weight' => $redirScore + ), + 'suggest-geo' => array( + 'input' => array( 'UN/LOCODE:DEULM', 'Ulm, Germany', + 'Ulm displaced persons camp', 'Söflingen' ), + 'output' => '1:r', + 'weight' => $redirScore, + 'context' => array( + 'location' => array( + 'lat' => 48.3985, + 'lon' => 9.9918 + ) + ) + ), + 'suggest-stop-geo' => array( + 'input' => array( 'UN/LOCODE:DEULM', 'Ulm, Germany', + 'Ulm displaced persons camp', 'Söflingen' ), + 'output' => '1:r', + 'weight' => $redirScore, + 'context' => array( + 'location' => array( + 'lat' => 48.3985, + 'lon' => 9.9918 + ) + ) + ) + ) + ); + $suggestions = $builder->build( 1, $doc ); + $this->assertSame( $expected, $suggestions ); + } + + public function testMultipleCoordinates() { + $doc 
= array( + 'coordinates' => array( + array( + 'coord' => array( + 'lat' => 0.70777777777778, + 'lon' => -50.089444444444 + ), + 'region' => null, + 'dim' => 10000, + 'name' => "", + 'primary' => true, + 'type' => "river", + 'globe' => "earth", + 'country' => "BR" + ), + array( + 'coord' => array( + 'lat' => -15.518055555556, + 'lon' => -71.765277777778 + ), + 'region' => null, + 'dim' => 10000, + 'name' => "", + 'primary' => false, + 'type' => "river", + 'globe' => "earth", + 'country' => "BR" + ) + ) + ); + + $builder = new SuggestBuilder( SuggestScoringMethodFactory::getScoringMethod( 'incomingLinks', 1 ) ); + $coord = $builder->findPrimaryCoordinates( $doc ); + $expected = array( 'lat' => 0.70777777777778, 'lon' => -50.089444444444 ); + $this->assertSame( $expected, $coord ); + + $doc['coordinates'][1]['primary'] = true; + $coord = $builder->findPrimaryCoordinates( $doc ); + $expected = array( 'lat' => 0.70777777777778, 'lon' => -50.089444444444 ); + $this->assertSame( $expected, $coord, "With two primaries coord we choose the first one" ); + + $doc['coordinates'][0]['primary'] = false; + $coord = $builder->findPrimaryCoordinates( $doc ); + $expected = array( 'lat' => -15.518055555556, 'lon' => -71.765277777778 ); + $this->assertSame( $expected, $coord, "Choose primary coord even if it's not the first one." ); + + $doc['coordinates'][1]['primary'] = false; + $coord = $builder->findPrimaryCoordinates( $doc ); + $expected = array( 'lat' => 0.70777777777778, 'lon' => -50.089444444444 ); + $this->assertSame( $expected, $coord, "Choose first coord if there's no primary." ); + + $doc['coordinates'][0]['primary'] = true; + $doc['coordinates'][0]['globe'] = 'Magrathea'; + $coord = $builder->findPrimaryCoordinates( $doc ); + $expected = array( 'lat' => -15.518055555556, 'lon' => -71.765277777778 ); + $this->assertSame( $expected, $coord, "Choose first coord on earth." 
); + + $doc['coordinates'][1]['globe'] = 'Magrathea'; + $coord = $builder->findPrimaryCoordinates( $doc ); + $this->assertNull( $coord, "No coord if none is on earth." ); + } +} diff --git a/tests/unit/SuggestScoringTest.php b/tests/unit/SuggestScoringTest.php index c2bf342..493878d 100644 --- a/tests/unit/SuggestScoringTest.php +++ b/tests/unit/SuggestScoringTest.php @@ -37,7 +37,7 @@ $this->assertGreaterThanOrEqual( 0, $score, "scoreNormL2 cannot produce a score lower than 0" ); } - # Edges + // Edges $score = $qs->scoreNorm( 1, 1 ); $this->assertLessThanOrEqual( 1, $score, "scoreNorm cannot produce a score greater than 1" ); $this->assertGreaterThanOrEqual( 0, $score, "scoreNorm cannot produce a score lower than 0" ); @@ -66,13 +66,13 @@ if ( $boost > 1 ) { $this->assertGreaterThan( $score, $res, "With a boost ($boost) greater than 1 the boosted score must be greater than the original." ); } else if ( $boost < 1 ) { - $this->assertLessThan( $score, $res, "With a boost ($boost) lesser than 1 the boosted score must be lesser than the original." ); + $this->assertLessThan( $score, $res, "With a boost ($boost) less than 1 the boosted score must be less than the original." ); } else { $this->assertEquals( $score, $res, "When boost is 1 the score remains unchanged." ); } } for( $i = 1; $i < 1000; $i++ ) { - # The same boost value must keep original score ordering + // The same boost value must keep original score ordering $score1 = 0.1; $score2 = 0.5; @@ -87,7 +87,7 @@ $this->assertGreaterThan( $res1, $res2, "A boost cannot 'overboost' a score" ); } - # Edges + // Edges $res = $qs->boost( 1, 1 ); $this->assertEquals( $res, 1, "When boost is 1 the score remains unchanged." 
); $res = $qs->boost( 1, 0 ); @@ -123,7 +123,7 @@ $this->assertGreaterThan( $score, $res, "A good doc gets a better score" ); $res = $qs->boostTemplates( $badDoc, $score ); - $this->assertLessThan( $score, $res, "A good doc gets a lower score" ); + $this->assertLessThan( $score, $res, "A bad doc gets a lower score" ); $res = $qs->boostTemplates( $mixedDoc, $score ); $this->assertEquals( $score, $res, "A mixed doc gets the same score"); @@ -227,7 +227,7 @@ $this->assertLessThan( QualityScore::SCORE_RANGE, $qs->score( $page ), "Score is always lower than " . QualityScore::SCORE_RANGE ); } - # Edges + // Edges $page = array( 'incoming_links' => $maxDocs * QualityScore::INCOMING_LINKS_MAX_DOCS_FACTOR, 'external_link' => array_fill( 0, QualityScore::EXTERNAL_LINKS_NORM, null ), @@ -251,7 +251,7 @@ $page = array(); $this->assertEquals( 0, $qs->score( $page ), "Score of a broken article is 0" ); - # A very small wiki + // A very small wiki $qs = new QualityScore( 1 ); $page = array( 'incoming_links' => 1, @@ -263,7 +263,7 @@ ); $this->assertEquals( QualityScore::SCORE_RANGE, $qs->score( $page ), "With very small wiki the highest score is also " . QualityScore::SCORE_RANGE ); - # The scoring function should not fail with 0 page + // The scoring function should not fail with 0 page $qs = new QualityScore( 0 ); $page = array( 'incoming_links' => 1, diff --git a/tests/unit/UtilTest.php b/tests/unit/UtilTest.php index bd325db..91d25e3 100644 --- a/tests/unit/UtilTest.php +++ b/tests/unit/UtilTest.php @@ -257,4 +257,20 @@ $this->assertEquals( 6, $calls ); $this->assertEquals( 5, $errorCallbackCalls ); } + + public function testChooseBestRedirect() { + $convert = function( $x ) { + $redirect = array(); + foreach( $x as $t ) { + $redirect[] = array( 'title' => $t, 'namespace' => 0 ); + } + return $redirect; + }; + $input = $convert( array( 'Al. Einstein', 'Albert Einstein', 'A. Einstein', 'Einstein, Albert' ) ); + $this->assertEquals( 'Al. 
Einstein', Util::chooseBestRedirect( 'a', $input ) ); + $this->assertEquals( 'Al. Einstein', Util::chooseBestRedirect( 'al', $input ) ); + $this->assertEquals( 'Albert Einstein', Util::chooseBestRedirect( 'albet', $input ) ); + $this->assertEquals( 'Einstein, Albert', Util::chooseBestRedirect( 'Einstein', $input ) ); + $this->assertEquals( 'Einstein, Albert', Util::chooseBestRedirect( 'Ens', $input ) ); + } } -- To view, visit https://gerrit.wikimedia.org/r/235133 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I37953179d3f10036344fe16bf31da3fd04a7c075 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: wmf/1.26wmf20 Gerrit-Owner: EBernhardson <[email protected]> Gerrit-Reviewer: DCausse <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
