jenkins-bot has submitted this change and it was merged. Change subject: Add a language based keyword filter ......................................................................
Add a language based keyword filter Adds a new full text search keyword, inlanguage, that limits the result set to pages marked as being in a specific language. Most commonly these pages are created by using the Translate extension. Bug: T125944 Change-Id: I15fc139531e3e6902ed64db915da1f8ef7910e99 --- M autoload.php A includes/Query/LanguageFeature.php M includes/Search/Filters.php M includes/Searcher.php A tests/unit/fixtures/searchText/inlanguage_001.expected A tests/unit/fixtures/searchText/inlanguage_001.query A tests/unit/fixtures/searchText/inlanguage_002.expected A tests/unit/fixtures/searchText/inlanguage_002.query A tests/unit/fixtures/searchText/inlanguage_003.expected A tests/unit/fixtures/searchText/inlanguage_003.query A tests/unit/fixtures/searchText/inlanguage_004.expected A tests/unit/fixtures/searchText/inlanguage_004.query A tests/unit/fixtures/searchText/inlanguage_005.expected A tests/unit/fixtures/searchText/inlanguage_005.query A tests/unit/fixtures/searchText/inlanguage_006.expected A tests/unit/fixtures/searchText/inlanguage_006.query 16 files changed, 1,756 insertions(+), 0 deletions(-) Approvals: Cindy-the-browser-test-bot: Looks good to me, but someone else must approve EBernhardson: Looks good to me, approved jenkins-bot: Verified diff --git a/autoload.php b/autoload.php index e00d715..250ac5c 100644 --- a/autoload.php +++ b/autoload.php @@ -101,6 +101,7 @@ 'CirrusSearch\\Query\\InCategoryFeature' => __DIR__ . '/includes/Query/InCategoryFeature.php', 'CirrusSearch\\Query\\InTitleFeature' => __DIR__ . '/includes/Query/InTitleFeature.php', 'CirrusSearch\\Query\\KeywordFeature' => __DIR__ . '/includes/Query/KeywordFeature.php', + 'CirrusSearch\\Query\\LanguageFeature' => __DIR__ . '/includes/Query/LanguageFeature.php', 'CirrusSearch\\Query\\LinksToFeature' => __DIR__ . '/includes/Query/LinksToFeature.php', 'CirrusSearch\\Query\\LocalFeature' => __DIR__ . 
'/includes/Query/LocalFeature.php',
 	'CirrusSearch\\Query\\PreferRecentFeature' => __DIR__ . '/includes/Query/PreferRecentFeature.php',
diff --git a/includes/Query/LanguageFeature.php b/includes/Query/LanguageFeature.php
new file mode 100644
index 0000000..8280f60
--- /dev/null
+++ b/includes/Query/LanguageFeature.php
@@ -0,0 +1,56 @@
+<?php
+
+namespace CirrusSearch\Query;
+
+use CirrusSearch\Search\Filters;
+use CirrusSearch\Search\SearchContext;
+
+/**
+ * Filters the result set based on pages labeled with the provided language.
+ * More than one language can be specified with commas and they will be
+ * generated as an OR query. Whitespace around each language is ignored.
+ *
+ * Examples:
+ *   inlanguage:en
+ *   inlanguage:fr,en
+ */
+class LanguageFeature extends SimpleKeywordFeature {
+	/**
+	 * Limit search to 20 languages. Arbitrarily chosen, but should be more
+	 * than enough and some sort of limit has to be enforced.
+	 */
+	const QUERY_LIMIT = 20;
+
+	/**
+	 * @return string
+	 */
+	protected function getKeywordRegex() {
+		return 'inlanguage';
+	}
+
+	/**
+	 * @param SearchContext $context
+	 * @param string $key The keyword
+	 * @param string $value The value attached to the keyword with quotes stripped
+	 * @param string $quotedValue The original value in the search string, including quotes if used
+	 * @param bool $negated Is the search negated? Not used to generate the returned AbstractQuery,
+	 *  that will be negated as necessary. Used for any other building/context necessary.
+	 * @return array Two element array, first an AbstractQuery or null to apply to the
+	 *  query. Second a boolean indicating if the quotedValue should be kept in the search
+	 *  string.
+	 */
+	protected function doApply( SearchContext $context, $key, $value, $quotedValue, $negated ) {
+		$queries = [];
+		// Trim each entry so the value queried is the same one checked for emptiness.
+		$langs = array_slice( explode( ',', $value ), 0, self::QUERY_LIMIT );
+		foreach ( array_map( 'trim', $langs ) as $lang ) {
+			if ( strlen( $lang ) > 0 ) {
+				$query = new \Elastica\Query\Match();
+				$query->setFieldQuery( 'language', $lang );
+				$queries[] = $query;
+			}
+		}
+
+		return [Filters::booleanOr( $queries, false ), false];
+	}
+}
diff --git a/includes/Search/Filters.php b/includes/Search/Filters.php
index ee10077..d7bbd58 100644
--- a/includes/Search/Filters.php
+++ b/includes/Search/Filters.php
@@ -5,6 +5,7 @@
 use Elastica;
 use Elastica\Query\AbstractQuery;
 use Elastica\Query\BoolQuery;
+use Elastica\Query\MatchAll;
 use GeoData\Coord;
 
 /**
@@ -27,6 +28,30 @@
  */
 class Filters {
 	/**
+	 * Turns a list of queries into a boolean OR, requiring only one
+	 * of the provided queries to match.
+	 *
+	 * @param AbstractQuery[] $queries
+	 * @param bool $matchAll When true (default) function never returns null,
+	 *  when no queries are provided a MatchAll is returned.
+	 * @return AbstractQuery|null The resulting OR query. Only returns null when
+	 *  no queries are passed and $matchAll is false.
+	 */
+	public static function booleanOr( array $queries, $matchAll = true ) {
+		if ( !$queries ) {
+			return $matchAll ? new MatchAll() : null;
+		} elseif ( count( $queries ) === 1 ) {
+			return reset( $queries );
+		} else {
+			$bool = new BoolQuery();
+			foreach ( $queries as $query ) {
+				$bool->addShould( $query );
+			}
+			return $bool;
+		}
+	}
+
+	/**
 	 * Merges lists of include/exclude filters into a single filter that
 	 * Elasticsearch will execute efficiently.
* diff --git a/includes/Searcher.php b/includes/Searcher.php index 3b8389b..7610b71 100644 --- a/includes/Searcher.php +++ b/includes/Searcher.php @@ -331,6 +331,8 @@ new Query\SimpleInSourceFeature( $this->escaper ), // Handle intitle keyword new Query\InTitleFeature( $this->escaper ), + // inlanguage keyword + new Query\LanguageFeature(), ], $builderSettings['settings'] ); diff --git a/tests/unit/fixtures/searchText/inlanguage_001.expected b/tests/unit/fixtures/searchText/inlanguage_001.expected new file mode 100644 index 0000000..96f93b5 --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_001.expected @@ -0,0 +1,316 @@ +{ + "description": "full_text search for 'foo inlanguage:es\n'", + "path": "wiki\/page\/_search", + "params": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + }, + "query": { + "_source": [ + "id", + "title", + "namespace", + "redirect.*", + "timestamp", + "text_bytes" + ], + "fields": "text.word_count", + "query": { + "bool": { + "minimum_number_should_match": 1, + "should": [ + { + "query_string": { + "query": "foo", + "fields": [ + "all.plain^1", + "all^0.5" + ], + "auto_generate_phrase_queries": true, + "phrase_slop": 0, + "default_operator": "AND", + "allow_leading_wildcard": true, + "fuzzy_prefix_length": 2, + "rewrite": "top_terms_boost_1024", + "max_determinized_states": 500 + } + }, + { + "multi_match": { + "fields": [ + "all_near_match^2" + ], + "query": "foo" + } + } + ], + "filter": [ + { + "bool": { + "must": [ + { + "match": { + "language": { + "query": "es" + } + } + }, + { + "terms": { + "namespace": [ + 0, + 1, + 2, + 3 + ] + } + } + ] + } + } + ] + } + }, + "highlight": { + "pre_tags": [ + "<span class=\"searchmatch\">" + ], + "post_tags": [ + "<\/span>" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 1, + "matched_fields": [ + "title", + "title.plain" + ] + }, + "redirect.title": { + "type": "experimental", + "fragmenter": "none", + "order": 
"score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "redirect.title", + "redirect.title.plain" + ] + }, + "category": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "category", + "category.plain" + ] + }, + "heading": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "heading", + "heading.plain" + ] + }, + "text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000 + }, + "no_match_size": 150, + "matched_fields": [ + "text", + "text.plain" + ] + }, + "auxiliary_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "auxiliary_text", + "auxiliary_text.plain" + ] + }, + "file_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "file_text", + "file_text.plain" + ] + } + }, + "highlight_query": { + "query_string": { + "query": "foo", + "fields": [ + "title.plain^20", + "redirect.title.plain^15", + "category.plain^8", + "heading.plain^5", + "opening_text.plain^3", + "text.plain^1", + "auxiliary_text.plain^0.5", + "title^10", + "redirect.title^7.5", + 
"category^4", + "heading^2.5", + "opening_text^1.5", + "text^0.5", + "auxiliary_text^0.25" + ], + "auto_generate_phrase_queries": true, + "phrase_slop": 1, + "default_operator": "AND", + "allow_leading_wildcard": true, + "fuzzy_prefix_length": 2, + "rewrite": "top_terms_boost_1024", + "max_determinized_states": 500 + } + } + }, + "suggest": { + "text": "foo", + "suggest": { + "phrase": { + "field": "suggest", + "size": 1, + "max_errors": 2, + "confidence": 2, + "real_word_error_likelihood": 0.95, + "direct_generator": [ + { + "field": "suggest", + "suggest_mode": "always", + "max_term_freq": 0.5, + "min_doc_freq": 0, + "prefix_length": 2 + } + ], + "highlight": { + "pre_tag": "<em>", + "post_tag": "<\/em>" + }, + "smoothing": { + "stupid_backoff": { + "discount": 0.4 + } + } + } + } + }, + "stats": [ + "suggest", + "full_text" + ], + "size": 20, + "rescore": [ + { + "window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "multiply", + "rescore_query": { + "function_score": { + "functions": [ + { + "field_value_factor": { + "field": "incoming_links", + "modifier": "log2p", + "missing": 0 + } + }, + { + "weight": "0.25", + "filter": { + "terms": { + "namespace": [ + 1 + ] + } + } + }, + { + "weight": "0.05", + "filter": { + "terms": { + "namespace": [ + 2 + ] + } + } + }, + { + "weight": "0.0125", + "filter": { + "terms": { + "namespace": [ + 3 + ] + } + } + } + ] + } + } + } + } + ] + }, + "options": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + } +} \ No newline at end of file diff --git a/tests/unit/fixtures/searchText/inlanguage_001.query b/tests/unit/fixtures/searchText/inlanguage_001.query new file mode 100644 index 0000000..2862b9c --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_001.query @@ -0,0 +1 @@ +foo inlanguage:es diff --git a/tests/unit/fixtures/searchText/inlanguage_002.expected b/tests/unit/fixtures/searchText/inlanguage_002.expected new file mode 100644 index 
0000000..e4fef8f --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_002.expected @@ -0,0 +1,248 @@ +{ + "description": "full_text search for 'inlanguage:es,fr\n'", + "path": "wiki\/page\/_search", + "params": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + }, + "query": { + "_source": [ + "id", + "title", + "namespace", + "redirect.*", + "timestamp", + "text_bytes" + ], + "fields": "text.word_count", + "query": { + "bool": { + "must": [ + { + "match_all": {} + } + ], + "filter": [ + { + "bool": { + "must": [ + { + "bool": { + "should": [ + { + "match": { + "language": { + "query": "es" + } + } + }, + { + "match": { + "language": { + "query": "fr" + } + } + } + ] + } + }, + { + "terms": { + "namespace": [ + 0, + 1, + 2, + 3 + ] + } + } + ] + } + } + ] + } + }, + "highlight": { + "pre_tags": [ + "<span class=\"searchmatch\">" + ], + "post_tags": [ + "<\/span>" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 1, + "matched_fields": [ + "title", + "title.plain" + ] + }, + "redirect.title": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "redirect.title", + "redirect.title.plain" + ] + }, + "category": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "category", + "category.plain" + ] + }, + "heading": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "heading", + "heading.plain" + ] + }, + "text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + 
"max_fragments_scored": 5000 + }, + "no_match_size": 150, + "matched_fields": [ + "text", + "text.plain" + ] + }, + "auxiliary_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "auxiliary_text", + "auxiliary_text.plain" + ] + }, + "file_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "file_text", + "file_text.plain" + ] + } + } + }, + "size": 20, + "rescore": [ + { + "window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "multiply", + "rescore_query": { + "function_score": { + "functions": [ + { + "field_value_factor": { + "field": "incoming_links", + "modifier": "log2p", + "missing": 0 + } + }, + { + "weight": "0.25", + "filter": { + "terms": { + "namespace": [ + 1 + ] + } + } + }, + { + "weight": "0.05", + "filter": { + "terms": { + "namespace": [ + 2 + ] + } + } + }, + { + "weight": "0.0125", + "filter": { + "terms": { + "namespace": [ + 3 + ] + } + } + } + ] + } + } + } + } + ], + "stats": [ + "full_text" + ] + }, + "options": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + } +} \ No newline at end of file diff --git a/tests/unit/fixtures/searchText/inlanguage_002.query b/tests/unit/fixtures/searchText/inlanguage_002.query new file mode 100644 index 0000000..ab32fa2 --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_002.query @@ -0,0 +1 @@ +inlanguage:es,fr diff --git a/tests/unit/fixtures/searchText/inlanguage_003.expected 
b/tests/unit/fixtures/searchText/inlanguage_003.expected new file mode 100644 index 0000000..cf436b6 --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_003.expected @@ -0,0 +1,329 @@ +{ + "description": "full_text search for 'foo inlanguage:ru,uk bar\n'", + "path": "wiki\/page\/_search", + "params": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + }, + "query": { + "_source": [ + "id", + "title", + "namespace", + "redirect.*", + "timestamp", + "text_bytes" + ], + "fields": "text.word_count", + "query": { + "bool": { + "minimum_number_should_match": 1, + "should": [ + { + "query_string": { + "query": "foo bar", + "fields": [ + "all.plain^1", + "all^0.5" + ], + "auto_generate_phrase_queries": true, + "phrase_slop": 0, + "default_operator": "AND", + "allow_leading_wildcard": true, + "fuzzy_prefix_length": 2, + "rewrite": "top_terms_boost_1024", + "max_determinized_states": 500 + } + }, + { + "multi_match": { + "fields": [ + "all_near_match^2" + ], + "query": "foo bar" + } + } + ], + "filter": [ + { + "bool": { + "must": [ + { + "bool": { + "should": [ + { + "match": { + "language": { + "query": "ru" + } + } + }, + { + "match": { + "language": { + "query": "uk" + } + } + } + ] + } + }, + { + "terms": { + "namespace": [ + 0, + 1, + 2, + 3 + ] + } + } + ] + } + } + ] + } + }, + "highlight": { + "pre_tags": [ + "<span class=\"searchmatch\">" + ], + "post_tags": [ + "<\/span>" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 1, + "matched_fields": [ + "title", + "title.plain" + ] + }, + "redirect.title": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "redirect.title", + "redirect.title.plain" + ] + }, + "category": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + 
"matched_fields": [ + "category", + "category.plain" + ] + }, + "heading": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "heading", + "heading.plain" + ] + }, + "text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000 + }, + "no_match_size": 150, + "matched_fields": [ + "text", + "text.plain" + ] + }, + "auxiliary_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "auxiliary_text", + "auxiliary_text.plain" + ] + }, + "file_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "file_text", + "file_text.plain" + ] + } + }, + "highlight_query": { + "query_string": { + "query": "foo bar", + "fields": [ + "title.plain^20", + "redirect.title.plain^15", + "category.plain^8", + "heading.plain^5", + "opening_text.plain^3", + "text.plain^1", + "auxiliary_text.plain^0.5", + "title^10", + "redirect.title^7.5", + "category^4", + "heading^2.5", + "opening_text^1.5", + "text^0.5", + "auxiliary_text^0.25" + ], + "auto_generate_phrase_queries": true, + "phrase_slop": 1, + "default_operator": "AND", + "allow_leading_wildcard": true, + "fuzzy_prefix_length": 2, + "rewrite": "top_terms_boost_1024", + "max_determinized_states": 500 + } + } + 
}, + "suggest": { + "text": "foo bar", + "suggest": { + "phrase": { + "field": "suggest", + "size": 1, + "max_errors": 2, + "confidence": 2, + "real_word_error_likelihood": 0.95, + "direct_generator": [ + { + "field": "suggest", + "suggest_mode": "always", + "max_term_freq": 0.5, + "min_doc_freq": 0, + "prefix_length": 2 + } + ], + "highlight": { + "pre_tag": "<em>", + "post_tag": "<\/em>" + }, + "smoothing": { + "stupid_backoff": { + "discount": 0.4 + } + } + } + } + }, + "stats": [ + "suggest", + "full_text" + ], + "size": 20, + "rescore": [ + { + "window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "multiply", + "rescore_query": { + "function_score": { + "functions": [ + { + "field_value_factor": { + "field": "incoming_links", + "modifier": "log2p", + "missing": 0 + } + }, + { + "weight": "0.25", + "filter": { + "terms": { + "namespace": [ + 1 + ] + } + } + }, + { + "weight": "0.05", + "filter": { + "terms": { + "namespace": [ + 2 + ] + } + } + }, + { + "weight": "0.0125", + "filter": { + "terms": { + "namespace": [ + 3 + ] + } + } + } + ] + } + } + } + } + ] + }, + "options": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + } +} \ No newline at end of file diff --git a/tests/unit/fixtures/searchText/inlanguage_003.query b/tests/unit/fixtures/searchText/inlanguage_003.query new file mode 100644 index 0000000..52680b7 --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_003.query @@ -0,0 +1 @@ +foo inlanguage:ru,uk bar diff --git a/tests/unit/fixtures/searchText/inlanguage_004.expected b/tests/unit/fixtures/searchText/inlanguage_004.expected new file mode 100644 index 0000000..41b59a5 --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_004.expected @@ -0,0 +1,235 @@ +{ + "description": "full_text search for 'inlanguage: foo\n'", + "path": "wiki\/page\/_search", + "params": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + }, + "query": { + "_source": [ + "id", + 
"title", + "namespace", + "redirect.*", + "timestamp", + "text_bytes" + ], + "fields": "text.word_count", + "query": { + "bool": { + "must": [ + { + "match_all": {} + } + ], + "filter": [ + { + "bool": { + "must": [ + { + "match": { + "language": { + "query": "foo" + } + } + }, + { + "terms": { + "namespace": [ + 0, + 1, + 2, + 3 + ] + } + } + ] + } + } + ] + } + }, + "highlight": { + "pre_tags": [ + "<span class=\"searchmatch\">" + ], + "post_tags": [ + "<\/span>" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 1, + "matched_fields": [ + "title", + "title.plain" + ] + }, + "redirect.title": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "redirect.title", + "redirect.title.plain" + ] + }, + "category": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "category", + "category.plain" + ] + }, + "heading": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "heading", + "heading.plain" + ] + }, + "text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000 + }, + "no_match_size": 150, + "matched_fields": [ + "text", + "text.plain" + ] + }, + "auxiliary_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + 
"matched_fields": [ + "auxiliary_text", + "auxiliary_text.plain" + ] + }, + "file_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "file_text", + "file_text.plain" + ] + } + } + }, + "size": 20, + "rescore": [ + { + "window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "multiply", + "rescore_query": { + "function_score": { + "functions": [ + { + "field_value_factor": { + "field": "incoming_links", + "modifier": "log2p", + "missing": 0 + } + }, + { + "weight": "0.25", + "filter": { + "terms": { + "namespace": [ + 1 + ] + } + } + }, + { + "weight": "0.05", + "filter": { + "terms": { + "namespace": [ + 2 + ] + } + } + }, + { + "weight": "0.0125", + "filter": { + "terms": { + "namespace": [ + 3 + ] + } + } + } + ] + } + } + } + } + ], + "stats": [ + "full_text" + ] + }, + "options": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + } +} \ No newline at end of file diff --git a/tests/unit/fixtures/searchText/inlanguage_004.query b/tests/unit/fixtures/searchText/inlanguage_004.query new file mode 100644 index 0000000..04a28e9 --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_004.query @@ -0,0 +1 @@ +inlanguage: foo diff --git a/tests/unit/fixtures/searchText/inlanguage_005.expected b/tests/unit/fixtures/searchText/inlanguage_005.expected new file mode 100644 index 0000000..eccfa68 --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_005.expected @@ -0,0 +1,316 @@ +{ + "description": "full_text search for 'inlanguage:ko, bar\n'", + "path": "wiki\/page\/_search", + "params": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + }, + "query": { + "_source": [ + "id", + "title", + "namespace", + "redirect.*", + 
"timestamp", + "text_bytes" + ], + "fields": "text.word_count", + "query": { + "bool": { + "minimum_number_should_match": 1, + "should": [ + { + "query_string": { + "query": "bar", + "fields": [ + "all.plain^1", + "all^0.5" + ], + "auto_generate_phrase_queries": true, + "phrase_slop": 0, + "default_operator": "AND", + "allow_leading_wildcard": true, + "fuzzy_prefix_length": 2, + "rewrite": "top_terms_boost_1024", + "max_determinized_states": 500 + } + }, + { + "multi_match": { + "fields": [ + "all_near_match^2" + ], + "query": "bar" + } + } + ], + "filter": [ + { + "bool": { + "must": [ + { + "match": { + "language": { + "query": "ko" + } + } + }, + { + "terms": { + "namespace": [ + 0, + 1, + 2, + 3 + ] + } + } + ] + } + } + ] + } + }, + "highlight": { + "pre_tags": [ + "<span class=\"searchmatch\">" + ], + "post_tags": [ + "<\/span>" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 1, + "matched_fields": [ + "title", + "title.plain" + ] + }, + "redirect.title": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "redirect.title", + "redirect.title.plain" + ] + }, + "category": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "category", + "category.plain" + ] + }, + "heading": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "heading", + "heading.plain" + ] + }, + "text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000 + }, + 
"no_match_size": 150, + "matched_fields": [ + "text", + "text.plain" + ] + }, + "auxiliary_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "auxiliary_text", + "auxiliary_text.plain" + ] + }, + "file_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "file_text", + "file_text.plain" + ] + } + }, + "highlight_query": { + "query_string": { + "query": "bar", + "fields": [ + "title.plain^20", + "redirect.title.plain^15", + "category.plain^8", + "heading.plain^5", + "opening_text.plain^3", + "text.plain^1", + "auxiliary_text.plain^0.5", + "title^10", + "redirect.title^7.5", + "category^4", + "heading^2.5", + "opening_text^1.5", + "text^0.5", + "auxiliary_text^0.25" + ], + "auto_generate_phrase_queries": true, + "phrase_slop": 1, + "default_operator": "AND", + "allow_leading_wildcard": true, + "fuzzy_prefix_length": 2, + "rewrite": "top_terms_boost_1024", + "max_determinized_states": 500 + } + } + }, + "suggest": { + "text": "bar", + "suggest": { + "phrase": { + "field": "suggest", + "size": 1, + "max_errors": 2, + "confidence": 2, + "real_word_error_likelihood": 0.95, + "direct_generator": [ + { + "field": "suggest", + "suggest_mode": "always", + "max_term_freq": 0.5, + "min_doc_freq": 0, + "prefix_length": 2 + } + ], + "highlight": { + "pre_tag": "<em>", + "post_tag": "<\/em>" + }, + "smoothing": { + "stupid_backoff": { + "discount": 0.4 + } + } + } + } + }, + "stats": [ + "suggest", + "full_text" + ], + "size": 20, + "rescore": [ + { + 
"window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "multiply", + "rescore_query": { + "function_score": { + "functions": [ + { + "field_value_factor": { + "field": "incoming_links", + "modifier": "log2p", + "missing": 0 + } + }, + { + "weight": "0.25", + "filter": { + "terms": { + "namespace": [ + 1 + ] + } + } + }, + { + "weight": "0.05", + "filter": { + "terms": { + "namespace": [ + 2 + ] + } + } + }, + { + "weight": "0.0125", + "filter": { + "terms": { + "namespace": [ + 3 + ] + } + } + } + ] + } + } + } + } + ] + }, + "options": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + } +} \ No newline at end of file diff --git a/tests/unit/fixtures/searchText/inlanguage_005.query b/tests/unit/fixtures/searchText/inlanguage_005.query new file mode 100644 index 0000000..b22f55c --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_005.query @@ -0,0 +1 @@ +inlanguage:ko, bar diff --git a/tests/unit/fixtures/searchText/inlanguage_006.expected b/tests/unit/fixtures/searchText/inlanguage_006.expected new file mode 100644 index 0000000..2318f7b --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_006.expected @@ -0,0 +1,222 @@ +{ + "description": "full_text search for 'inlanguage:,,,\n'", + "path": "wiki\/page\/_search", + "params": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + }, + "query": { + "_source": [ + "id", + "title", + "namespace", + "redirect.*", + "timestamp", + "text_bytes" + ], + "fields": "text.word_count", + "query": { + "bool": { + "must": [ + { + "match_all": {} + } + ], + "filter": [ + { + "terms": { + "namespace": [ + 0, + 1, + 2, + 3 + ] + } + } + ] + } + }, + "highlight": { + "pre_tags": [ + "<span class=\"searchmatch\">" + ], + "post_tags": [ + "<\/span>" + ], + "fields": { + "title": { + "type": "experimental", + "fragmenter": "none", + "number_of_fragments": 1, + "matched_fields": [ + "title", + "title.plain" + ] + }, + "redirect.title": { + "type": 
"experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "redirect.title", + "redirect.title.plain" + ] + }, + "category": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "category", + "category.plain" + ] + }, + "heading": { + "type": "experimental", + "fragmenter": "none", + "order": "score", + "number_of_fragments": 1, + "options": { + "skip_if_last_matched": true + }, + "matched_fields": [ + "heading", + "heading.plain" + ] + }, + "text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000 + }, + "no_match_size": 150, + "matched_fields": [ + "text", + "text.plain" + ] + }, + "auxiliary_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "auxiliary_text", + "auxiliary_text.plain" + ] + }, + "file_text": { + "type": "experimental", + "number_of_fragments": 1, + "fragmenter": "scan", + "fragment_size": 150, + "options": { + "top_scoring": true, + "boost_before": { + "20": 2, + "50": 1.8, + "200": 1.5, + "1000": 1.2 + }, + "max_fragments_scored": 5000, + "skip_if_last_matched": true + }, + "matched_fields": [ + "file_text", + "file_text.plain" + ] + } + } + }, + "size": 20, + "rescore": [ + { + "window_size": 8192, + "query": { + "query_weight": 1, + "rescore_query_weight": 1, + "score_mode": "multiply", + "rescore_query": { + "function_score": { + "functions": [ + { + 
"field_value_factor": { + "field": "incoming_links", + "modifier": "log2p", + "missing": 0 + } + }, + { + "weight": "0.25", + "filter": { + "terms": { + "namespace": [ + 1 + ] + } + } + }, + { + "weight": "0.05", + "filter": { + "terms": { + "namespace": [ + 2 + ] + } + } + }, + { + "weight": "0.0125", + "filter": { + "terms": { + "namespace": [ + 3 + ] + } + } + } + ] + } + } + } + } + ], + "stats": [ + "full_text" + ] + }, + "options": { + "search_type": "dfs_query_then_fetch", + "timeout": "20s" + } +} \ No newline at end of file diff --git a/tests/unit/fixtures/searchText/inlanguage_006.query b/tests/unit/fixtures/searchText/inlanguage_006.query new file mode 100644 index 0000000..7771345 --- /dev/null +++ b/tests/unit/fixtures/searchText/inlanguage_006.query @@ -0,0 +1 @@ +inlanguage:,,, -- To view, visit https://gerrit.wikimedia.org/r/312061 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: I15fc139531e3e6902ed64db915da1f8ef7910e99 Gerrit-PatchSet: 5 Gerrit-Project: mediawiki/extensions/CirrusSearch Gerrit-Branch: master Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Cindy-the-browser-test-bot <bernhardsone...@gmail.com> Gerrit-Reviewer: DCausse <dcau...@wikimedia.org> Gerrit-Reviewer: EBernhardson <ebernhard...@wikimedia.org> Gerrit-Reviewer: Gehel <gleder...@wikimedia.org> Gerrit-Reviewer: Manybubbles <never...@wikimedia.org> Gerrit-Reviewer: jenkins-bot <> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits