EBernhardson has uploaded a new change for review.
https://gerrit.wikimedia.org/r/299195
Change subject: Allow searching subset of category parents
......................................................................
Allow searching subset of category parents
The existing incategory feature can be useful for editors, but as a
reader it is often not obvious that a page might be in the
"American comedy musicians" category rather than the "American comedians"
category. This allows wiki administrators to curate a list of categories
that will be considered top level and adds a search keyword,
intopcategory:, that filters on those top level categories.
The method of allowing a wiki's administrators to build up a list of
acceptable top level categories is probably not sustainable, but we also
can't possibly list the full tree of categories each page belongs to. An
alternate solution might be to have the message identify a meta-category
("Top level search categories"?) and all categories in that category
would be considered top level.
Another downside is that changes to the definition of a top level
category are not immediately reflected in search. Rather we have to wait
for the page to be reindexed to pick up the new top level category
definition. It's plausible, though, that we could create a job that sees
the change in definition and descends through the category subpages,
reindexing everything necessary.
Change-Id: I0ae9863bad985a1413451d8925e558d535225bfb
---
M i18n/en.json
M i18n/qqq.json
M includes/BuildDocument/PageDataBuilder.php
M includes/Maintenance/MappingConfigBuilder.php
M includes/Searcher.php
M includes/Util.php
6 files changed, 88 insertions(+), 6 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch
refs/changes/95/299195/1
diff --git a/i18n/en.json b/i18n/en.json
index 327b96f..cfafe07 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -34,5 +34,6 @@
"cirrussearch-completion-profile-fuzzy": "Similar to normal with typo
correction (two typos supported).",
"cirrussearch-qi-profile-classic": "Ranking based on the number of
incoming links, some templates, article language and recency
(templates/language/recency may not be activated on this wiki).",
"cirrussearch-qi-profile-classic-noboostlinks": "Ranking based on some
templates, article language and recency when activated on this wiki.",
- "cirrussearch-qi-profile-empty": "Ranking based solely on query
dependent features (for debug only)."
+ "cirrussearch-qi-profile-empty": "Ranking based solely on query
dependent features (for debug only).",
+ "cirrussearch-top-level-categories": "# This message lets you configure
the categories considered 'top level' for use in the intopcategory: search
keyword.\n# Each line prefixed with a # will be removed\n# All other lines will
be treated as a category name. Names must use\n# spaces between words and be
capitalized the same as the category itself."
}
diff --git a/i18n/qqq.json b/i18n/qqq.json
index a3bd485..a8b124e 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -42,5 +42,6 @@
"cirrussearch-completion-profile-fuzzy": "Fuzzy profile for completion
(search as you type).",
"cirrussearch-qi-profile-classic": "Classic query independent search
profile (affects fulltext search ranking algorithm).",
"cirrussearch-qi-profile-classic-noboostlinks": "Classic query
independent search profile without boost links (affects fulltext search ranking
algorithm).",
- "cirrussearch-qi-profile-empty": "Empty query independent search
profile (affects fulltext search ranking algorithm)."
+ "cirrussearch-qi-profile-empty": "Empty query independent search
profile (affects fulltext search ranking algorithm).",
+ "cirrussearch-top-level-categories": "Settings for the top level
categories feature.\n This feature is a search keyword that allows filtering to
a subset of parent categories."
}
diff --git a/includes/BuildDocument/PageDataBuilder.php
b/includes/BuildDocument/PageDataBuilder.php
index 04a1e3d..24fd17c 100644
--- a/includes/BuildDocument/PageDataBuilder.php
+++ b/includes/BuildDocument/PageDataBuilder.php
@@ -60,10 +60,41 @@
private function categories() {
$categories = array();
+ $topLevel = array();
+ $endPoints = array_flip( Util::getTopLevelCategories() );
foreach ( array_keys( $this->parserOutput->getCategories() ) as
$key ) {
- $categories[] = Category::newFromName( $key
)->getTitle()->getText();
+ $category = Category::newFromName( $key );
+ $categories[] = $category->getTitle()->getText();
+ $topLevel = array_merge( $topLevel,
$this->findTopLevelCategory( $category, $endPoints ) );
}
$this->doc->set( 'category', $categories );
+ $this->doc->set( 'top_category', $topLevel );
+ }
+
+ private function findTopLevelCategory( Category $category, array
$endPoints, array &$visited = array() ) {
+ $results = array();
+ $name = $category->getName();
+ if ( isset( $visited[$name] ) ) {
+ return $results;
+ }
+ $visited[$category->getName()] = true;
+
+ $parents = wfGetDB( DB_SLAVE )->select(
+ array( 'categorylinks', 'category' ),
+ array( 'cat_title', 'cat_id', 'cat_subcats',
'cat_pages', 'cat_files' ),
+ array( 'cl_from' =>
$category->getTitle()->getArticleID(), 'cl_to = cat_title' ),
+ __METHOD__
+ );
+ foreach ( $parents as $row ) {
+ $parent = Category::newFromRow( $row );
+ $text = $parent->getTitle()->getText();
+ if ( isset( $endPoints[$text] ) ) {
+ $results[] = $text;
+ }
+ $results = array_merge( $results,
$this->findTopLevelCategory( $parent, $endPoints, $visited ) );
+ }
+
+ return $results;
}
private function externalLinks() {
diff --git a/includes/Maintenance/MappingConfigBuilder.php
b/includes/Maintenance/MappingConfigBuilder.php
index 6c12f1b..a174678 100644
--- a/includes/Maintenance/MappingConfigBuilder.php
+++ b/includes/Maintenance/MappingConfigBuilder.php
@@ -155,6 +155,14 @@
'ignore_above' =>
self::KEYWORD_IGNORE_ABOVE,
) )
),
+ 'top_category' => $this->buildStringField(
'category', $textOptions, array(
+ array(
+ 'analyzer' =>
'lowercase_keyword',
+ 'norms' => array( 'enabled' =>
false ),
+ 'index_options' => 'docs',
+ 'ignore_above' =>
self::KEYWORD_IGNORE_ABOVE,
+ ) )
+ ),
'template' =>
$this->buildLowercaseKeywordField(),
'outgoing_link' => $this->buildKeywordField(),
'external_link' => $this->buildKeywordField(),
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 31e0f6a..9f1e324 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -205,7 +205,7 @@
* @param SearchConfig|null $config Configuration settings
* @param int[]|null $namespaces Array of namespace numbers to search
or null to search all namespaces.
* @param User|null $user user for which this search is being
performed. Attached to slow request logs.
- * @param string|boolean $index Base name for index to search from,
defaults to wfWikiId()
+ * @param string|boolean $index Base name for index to search from,
defaults to $wgCirrusSearchIndexBaseName
*/
public function __construct( Connection $conn, $offset, $limit,
SearchConfig $config = null, array $namespaces = null,
User $user = null, $index = false ) {
@@ -607,6 +607,17 @@
}
$searchContainedSyntax = true;
return '';
+ case 'intopcategory':
+ // @todo pre-filter by known
set of top categories?
+ $categories = array_slice(
explode( '|', $value ), 0, $this->config->get(
'CirrusSearchMaxIncategoryOptions' ) );
+ $categoryFilters =
$this->matchPageCategories( $categories, 'top category' );
+ if ( $categoryFilters === null
) {
+ $isEmptyQuery = true;
+ } else {
+ $filterDestination[] =
$categoryFilters;
+ }
+ $searchContainedSyntax = true;
+ return '';
case 'insource':
$updateReferences =
Filters::insource( $this->escaper, $this->getSearchContext(), $quotedValue );
$updateReferences( $fuzzyQuery,
$filterDestination, $highlightSource, $searchContainedSyntax );
@@ -823,10 +834,12 @@
/**
* Builds an or between many categories that the page could be in.
* @param string[] $categories categories to match
+ * @param string $type Toggle between direct categories and top level
+ * categories by passing either 'category' or 'top category'
* @return \Elastica\Query\BoolQuery|null A null return value means all
values are filtered
* and an empty result set should be returned.
*/
- public function matchPageCategories( $categories ) {
+ public function matchPageCategories( $categories, $type = 'category' ) {
$filter = new \Elastica\Query\BoolQuery();
$ids = array();
$names = array();
@@ -846,8 +859,11 @@
if ( !$names ) {
return null;
}
+ $field = $type === 'top category'
+ ? 'top_category.lowercase_keyword'
+ : 'category.lowercase_keyword';
foreach( $names as $name ) {
- $filter->addShould( $this->matchPage(
'category.lowercase_keyword', $name ) );
+ $filter->addShould( $this->matchPage( $field, $name ) );
}
return $filter;
}
diff --git a/includes/Util.php b/includes/Util.php
index 5faae73..ffc572b 100644
--- a/includes/Util.php
+++ b/includes/Util.php
@@ -46,6 +46,13 @@
private static $defaultBoostTemplates = null;
/**
+ * Cache getTopLevelCategories()
+ *
+ * @var array|null List of top level categories
+ */
+ private static $topLevelCategories = null;
+
+ /**
* Get the textual representation of a namespace with underscores
stripped, varying
* by gender if need be (using Title::getNsText()).
*
@@ -415,6 +422,24 @@
return self::$defaultBoostTemplates;
}
+ public static function getTopLevelCategories() {
+ if ( self::$topLevelCategories === null ) {
+ $cache = \ObjectCache::getLocalServerInstance();
+ self::$topLevelCategories = $cache->getWithSetCallback(
+ $cache->makeKey(
'cirrussearch-top-level-categories' ),
+ 6,
+ function () {
+ $source = wfMessage(
'cirrussearch-top-level-categories' )->inContentLanguage();
+ if ( $source->isDisabled() ) {
+ return array();
+ }
+ return Util::parseSettingsInMessage(
$source->plain() );
+ }
+ );
+ }
+
+ return self::$topLevelCategories;
+ }
/**
* radius, if provided, must have either m or km suffix. Valid formats:
* <title>
--
To view, visit https://gerrit.wikimedia.org/r/299195
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I0ae9863bad985a1413451d8925e558d535225bfb
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits