EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/299195

Change subject: Allow searching subset of category parents
......................................................................

Allow searching subset of category parents

The existing incategory feature can be useful for editors, but as a
reader it is often not obvious that a page might be in the
"American comedy musicians" category rather than the "American comedians"
category. This allows wiki administrators to curate a list of categories
that will be considered top level, and adds an intopcategory: search
keyword that filters on those categories.

The method of allowing a wiki's administrators to build up a list of
acceptable top level categories is probably not sustainable, but we also
can't possibly list the full tree of categories each page belongs to. An
alternate solution might be to have the message identify a meta-category
("Top level search categories"?) and all categories in that category
would be considered top level.

Another downside is that changes to the definition of a top level
category are not immediately reflected in search. Rather, we have to wait
for the page to be reindexed to pick up the new top level category
definition. It's plausible, though, that we could create a job that sees
the change in definition and descends through the category subpages,
reindexing everything necessary.

Change-Id: I0ae9863bad985a1413451d8925e558d535225bfb
---
M i18n/en.json
M i18n/qqq.json
M includes/BuildDocument/PageDataBuilder.php
M includes/Maintenance/MappingConfigBuilder.php
M includes/Searcher.php
M includes/Util.php
6 files changed, 88 insertions(+), 6 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/CirrusSearch 
refs/changes/95/299195/1

diff --git a/i18n/en.json b/i18n/en.json
index 327b96f..cfafe07 100644
--- a/i18n/en.json
+++ b/i18n/en.json
@@ -34,5 +34,6 @@
        "cirrussearch-completion-profile-fuzzy": "Similar to normal with typo 
correction (two typos supported).",
        "cirrussearch-qi-profile-classic": "Ranking based on the number of 
incoming links, some templates, article language and recency 
(templates/language/recency may not be activated on this wiki).",
        "cirrussearch-qi-profile-classic-noboostlinks": "Ranking based on some 
templates, article language and recency when activated on this wiki.",
-       "cirrussearch-qi-profile-empty": "Ranking based solely on query 
dependent features (for debug only)."
+       "cirrussearch-qi-profile-empty": "Ranking based solely on query 
dependent features (for debug only).",
+       "cirrussearch-top-level-categories": "# This message lets you configure 
the categories considered 'top level' for use in the intopcategory: search 
keyword.\n# Each line prefixed with a # will be removed\n# All other lines will 
be treated as a category name. Names must use\n# spaces between words and be 
capitalized the same as the category itself."
 }
diff --git a/i18n/qqq.json b/i18n/qqq.json
index a3bd485..a8b124e 100644
--- a/i18n/qqq.json
+++ b/i18n/qqq.json
@@ -42,5 +42,6 @@
        "cirrussearch-completion-profile-fuzzy": "Fuzzy profile for completion 
(search as you type).",
        "cirrussearch-qi-profile-classic": "Classic query independent search 
profile (affects fulltext search ranking algorithm).",
        "cirrussearch-qi-profile-classic-noboostlinks": "Classic query 
independent search profile without boost links (affects fulltext search ranking 
algorithm).",
-       "cirrussearch-qi-profile-empty": "Empty query independent search 
profile (affects fulltext search ranking algorithm)."
+       "cirrussearch-qi-profile-empty": "Empty query independent search 
profile (affects fulltext search ranking algorithm).",
+       "cirrussearch-top-level-categories": "Settings for the top level 
categories feature.\n This feature is a search keyword that allows filtering to 
a subset of parent categories."
 }
diff --git a/includes/BuildDocument/PageDataBuilder.php 
b/includes/BuildDocument/PageDataBuilder.php
index 04a1e3d..24fd17c 100644
--- a/includes/BuildDocument/PageDataBuilder.php
+++ b/includes/BuildDocument/PageDataBuilder.php
@@ -60,10 +60,41 @@
 
        private function categories() {
                $categories = array();
+               $topLevel = array();
+               $endPoints = array_flip( Util::getTopLevelCategories() );
                foreach ( array_keys( $this->parserOutput->getCategories() ) as 
$key ) {
-                       $categories[] = Category::newFromName( $key 
)->getTitle()->getText();
+                       $category = Category::newFromName( $key );
+                       $categories[] = $category->getTitle()->getText();
+                       $topLevel = array_merge( $topLevel, 
$this->findTopLevelCategory( $category, $endPoints ) );
                }
                $this->doc->set( 'category', $categories );
+               $this->doc->set( 'top_category', $topLevel );
+       }
+
+       private function findTopLevelCategory( Category $category, array 
$endPoints, array &$visited = array() ) {
+               $results = array();
+               $name = $category->getName();
+               if ( isset( $visited[$name] ) ) {
+                       return $results;
+               }
+               $visited[$category->getName()] = true;
+
+               $parents = wfGetDB( DB_SLAVE )->select(
+                       array( 'categorylinks', 'category' ),
+                       array( 'cat_title', 'cat_id', 'cat_subcats', 
'cat_pages', 'cat_files' ),
+                       array( 'cl_from' => 
$category->getTitle()->getArticleID(), 'cl_to = cat_title'  ),
+                       __METHOD__
+               );
+               foreach ( $parents as $row ) {
+                       $parent = Category::newFromRow( $row );
+                       $text = $parent->getTitle()->getText();
+                       if ( isset( $endPoints[$text] ) ) {
+                               $results[] = $text;
+                       }
+                       $results = array_merge( $results, 
$this->findTopLevelCategory( $parent, $endPoints, $visited ) );
+               }
+
+               return $results;
        }
 
        private function externalLinks() {
diff --git a/includes/Maintenance/MappingConfigBuilder.php 
b/includes/Maintenance/MappingConfigBuilder.php
index 6c12f1b..a174678 100644
--- a/includes/Maintenance/MappingConfigBuilder.php
+++ b/includes/Maintenance/MappingConfigBuilder.php
@@ -155,6 +155,14 @@
                                                'ignore_above' => 
self::KEYWORD_IGNORE_ABOVE,
                                        ) )
                                ),
+                               'top_category' => $this->buildStringField( 
'category', $textOptions, array(
+                                       array(
+                                               'analyzer' => 
'lowercase_keyword',
+                                               'norms' => array( 'enabled' => 
false ),
+                                               'index_options' => 'docs',
+                                               'ignore_above' => 
self::KEYWORD_IGNORE_ABOVE,
+                                       ) )
+                               ),
                                'template' => 
$this->buildLowercaseKeywordField(),
                                'outgoing_link' => $this->buildKeywordField(),
                                'external_link' => $this->buildKeywordField(),
diff --git a/includes/Searcher.php b/includes/Searcher.php
index 31e0f6a..9f1e324 100644
--- a/includes/Searcher.php
+++ b/includes/Searcher.php
@@ -205,7 +205,7 @@
         * @param SearchConfig|null $config Configuration settings
         * @param int[]|null $namespaces Array of namespace numbers to search 
or null to search all namespaces.
         * @param User|null $user user for which this search is being 
performed.  Attached to slow request logs.
-        * @param string|boolean $index Base name for index to search from, 
defaults to wfWikiId()
+        * @param string|boolean $index Base name for index to search from, 
defaults to $wgCirrusSearchIndexBaseName
         */
        public function __construct( Connection $conn, $offset, $limit, 
SearchConfig $config = null, array $namespaces = null,
                User $user = null, $index = false ) {
@@ -607,6 +607,17 @@
                                                }
                                                $searchContainedSyntax = true;
                                                return '';
+                                       case 'intopcategory':
+                                               // @todo pre-filter by known 
set of top categories?
+                                               $categories = array_slice( 
explode( '|', $value ), 0, $this->config->get( 
'CirrusSearchMaxIncategoryOptions' ) );
+                                               $categoryFilters = 
$this->matchPageCategories( $categories, 'top category' );
+                                               if ( $categoryFilters === null 
) {
+                                                       $isEmptyQuery = true;
+                                               } else {
+                                                       $filterDestination[] = 
$categoryFilters;
+                                               }
+                                               $searchContainedSyntax = true;
+                                               return '';
                                        case 'insource':
                                                $updateReferences = 
Filters::insource( $this->escaper, $this->getSearchContext(), $quotedValue );
                                                $updateReferences( $fuzzyQuery, 
$filterDestination, $highlightSource, $searchContainedSyntax );
@@ -823,10 +834,12 @@
        /**
         * Builds an or between many categories that the page could be in.
         * @param string[] $categories categories to match
+        * @param string $type Toggle between direct categories and top level
+        *  categories by passing either 'category' or 'top category'
         * @return \Elastica\Query\BoolQuery|null A null return value means all 
values are filtered
         *  and an empty result set should be returned.
         */
-       public function matchPageCategories( $categories ) {
+       public function matchPageCategories( $categories, $type = 'category' ) {
                $filter = new \Elastica\Query\BoolQuery();
                $ids = array();
                $names = array();
@@ -846,8 +859,11 @@
                if ( !$names ) {
                        return null;
                }
+               $field = $type === 'top category'
+                       ? 'top_category.lowercase_keyword'
+                       : 'category.lowercase_keyword';
                foreach( $names as $name ) {
-                       $filter->addShould( $this->matchPage( 
'category.lowercase_keyword', $name ) );
+                       $filter->addShould( $this->matchPage( $field, $name ) );
                }
                return $filter;
        }
diff --git a/includes/Util.php b/includes/Util.php
index 5faae73..ffc572b 100644
--- a/includes/Util.php
+++ b/includes/Util.php
@@ -46,6 +46,13 @@
        private static $defaultBoostTemplates = null;
 
        /**
+        * Cache getTopLevelCategories()
+        *
+        * @var array|null List of top level categories
+        */
+       private static $topLevelCategories = null;
+
+       /**
         * Get the textual representation of a namespace with underscores 
stripped, varying
         * by gender if need be (using Title::getNsText()).
         *
@@ -415,6 +422,24 @@
                return self::$defaultBoostTemplates;
        }
 
+       public static function getTopLevelCategories() {
+               if ( self::$topLevelCategories === null ) {
+                       $cache = \ObjectCache::getLocalServerInstance();
+                       self::$topLevelCategories = $cache->getWithSetCallback(
+                               $cache->makeKey( 
'cirrussearch-top-level-categories' ),
+                               6,
+                               function () {
+                                       $source = wfMessage( 
'cirrussearch-top-level-categories' )->inContentLanguage();
+                                       if ( $source->isDisabled() ) {
+                                               return array();
+                                       }
+                                       return Util::parseSettingsInMessage( 
$source->plain() );
+                               }
+                       );
+               }
+
+               return self::$topLevelCategories;
+       }
        /**
         * radius, if provided, must have either m or km suffix. Valid formats:
         *   <title>

-- 
To view, visit https://gerrit.wikimedia.org/r/299195
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I0ae9863bad985a1413451d8925e558d535225bfb
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/CirrusSearch
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to