Mattflaschen has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/146663

Change subject: Refactor GettingStartedGetPages to support 'morelike' 
suggestions
......................................................................

Refactor GettingStartedGetPages to support 'morelike' suggestions

* Factor out a PageSuggester interface
* Implement CategoryPageSuggester for existing Redis functionality
* Add MoreLikePageSuggester, which uses CirrusSearch's 'morelike'
  functionality

Change-Id: I90d8753efa87a4edc2d57cd67f64b8ec8d5a00ad
---
A CategoryPageSuggester.php
M GettingStarted.php
A MoreLikePageSuggester.php
A PageSuggester.php
A PageSuggesterFactory.php
M README
M api/ApiGettingStartedGetPages.php
7 files changed, 249 insertions(+), 64 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/GettingStarted 
refs/changes/63/146663/1

diff --git a/CategoryPageSuggester.php b/CategoryPageSuggester.php
new file mode 100644
index 0000000..a0302ae
--- /dev/null
+++ b/CategoryPageSuggester.php
@@ -0,0 +1,48 @@
+<?php
+
+namespace GettingStarted;
+
+// See PageSuggester for API documentation
+class CategoryPageSuggester implements PageSuggester {
+       /** @var \Category Category passed to RedisCategorySync::makeCategoryKey */
+       protected $category;
+
+       /**
+        * Constructs a CategoryPageSuggester that uses the given category
+        *
+        * @param \Category $category Category to use for suggestions
+        */
+       public function __construct( \Category $category ) {
+               $this->category = $category;
+       }
+
+       /**
+        * Gets up to $count articles by asking Redis for random members stored
+        * under this category's key.
+        *
+        * Logs to the 'GettingStarted' debug log and returns an empty array if no
+        * Redis client is available, if Redis throws, or if Redis returns a
+        * non-array value.
+        *
+        * @param int $count Number of article IDs to request from Redis
+        * @param int $offset Ignored, because it does not make sense when randomly
+        *  pulling articles out of Redis.
+        *
+        * @return array Array of Title objects built from the returned article IDs
+        */
+       public function getArticles( $count, $offset ) {
+               $key = RedisCategorySync::makeCategoryKey( $this->category );
+
+               $redis = RedisCategorySync::getClient();
+               if ( !$redis ) {
+                       wfDebugLog( 'GettingStarted', "Unable to acquire redis connection.  Returning early.\n" );
+                       return array();
+               }
+
+               try {
+                       $randomArticleIDs = $redis->sRandMember( $key, $count );
+               } catch ( \RedisException $e ) {
+                       // Must be fully qualified: inside this namespace an unqualified
+                       // RedisException resolves to GettingStarted\RedisException, so the
+                       // catch would never match the exception thrown by the Redis client
+                       // (presumably the phpredis extension's global class).
+                       wfDebugLog( 'GettingStarted', 'Redis exception: ' . $e->getMessage() . ".  Returning early.\n" );
+                       return array();
+               }
+
+               if ( !is_array( $randomArticleIDs ) ) {
+                       wfDebugLog( 'GettingStarted', 'Redis returned a non-array value, possibly an error.' );
+                       return array();
+               }
+
+               return \Title::newFromIDs( $randomArticleIDs );
+       }
+
+       /**
+        * @return bool Always true; each call draws a fresh random sample
+        */
+       public function isRandomized() {
+               return true;
+       }
+}
diff --git a/GettingStarted.php b/GettingStarted.php
index 3c4eea2..59f8670 100644
--- a/GettingStarted.php
+++ b/GettingStarted.php
@@ -81,6 +81,10 @@
        'GettingStarted\RedisCategorySync' => __DIR__ . 
'/RedisCategorySync.php',
        'GettingStarted\PageFilter' => __DIR__ . '/PageFilter.php',
        'GettingStarted\ApiGettingStartedGetPages' => __DIR__ . 
'/api/ApiGettingStartedGetPages.php',
+       'GettingStarted\PageSuggesterFactory' => __DIR__ . 
'/PageSuggesterFactory.php',
+       'GettingStarted\PageSuggester' => __DIR__ . '/PageSuggester.php',
+       'GettingStarted\CategoryPageSuggester' => __DIR__ . 
'/CategoryPageSuggester.php',
+       'GettingStarted\MoreLikePageSuggester' => __DIR__ . 
'/MoreLikePageSuggester.php',
 );
 
 $wgMessagesDirs['GettingStarted'] = __DIR__ . '/i18n';
diff --git a/MoreLikePageSuggester.php b/MoreLikePageSuggester.php
new file mode 100644
index 0000000..76688a2
--- /dev/null
+++ b/MoreLikePageSuggester.php
@@ -0,0 +1,69 @@
+<?php
+
+namespace GettingStarted;
+
+use Title;
+
+class MoreLikePageSuggester implements PageSuggester {
+       /** @var \WebRequest Request that the internal API call is derived from */
+       protected $request;
+
+       /** @var Title Title whose 'morelike:' search results are returned */
+       protected $baseTitle;
+
+       /**
+        * Constructs a MoreLikePageSuggester with the given
+        * request and base title
+        *
+        * @param \WebRequest $request Original web request
+        * @param Title $baseTitle Title to base suggestions on
+        */
+       public function __construct( \WebRequest $request, Title $baseTitle ) {
+               $this->request = $request;
+               $this->baseTitle = $baseTitle;
+       }
+
+       /**
+        * Gets suggestions by running an internal list=search API query for
+        * 'morelike:<base title>' in the main namespace.
+        *
+        * @param int $count Passed to the search API as srlimit
+        * @param int $offset Passed to the search API as sroffset
+        *
+        * @return array Array of Title objects; results that have no title or
+        *  whose title text cannot be turned into a Title are skipped
+        */
+       public function getArticles( $count, $offset ) {
+               global $wgSearchTypeAlternatives;
+
+               $query = 'morelike:' . $this->baseTitle->getPrefixedDBkey();
+               $params = array(
+                       'action' => 'query',
+                       'list' => 'search',
+                       'srnamespace' => NS_MAIN,
+                       'srlimit' => $count,
+                       'sroffset' => $offset,
+                       'srsearch' => $query,
+               );
+
+               // Only name the backend explicitly when alternative search backends
+               // are configured (presumably CirrusSearch may then not be the
+               // default one -- confirm against the wiki's search configuration).
+               if ( $wgSearchTypeAlternatives !== null &&
+                       count( $wgSearchTypeAlternatives ) > 0 ) {
+
+                       $params['srbackend'] = 'CirrusSearch';
+               }
+
+               $searchApiCall = new \ApiMain(
+                       new \DerivativeRequest(
+                               $this->request,
+                               $params,
+                               false // Not posted
+                       ),
+                       false // Don't enable write
+               );
+               $searchApiCall->execute();
+               $apiResult = $searchApiCall->getResultData();
+
+               $titles = array();
+               if ( !isset( $apiResult['query']['search'] ) || !is_array( $apiResult['query']['search'] ) ) {
+                       return $titles;
+               }
+
+               foreach ( $apiResult['query']['search'] as $searchResult ) {
+                       // Skip anything that is not a result row with a title
+                       if ( !isset( $searchResult['title'] ) ) {
+                               continue;
+                       }
+
+                       // newFromText returns null for text that is not a valid title;
+                       // callers are promised Title objects, so drop those.
+                       $title = Title::newFromText( $searchResult['title'] );
+                       if ( $title !== null ) {
+                               $titles[] = $title;
+                       }
+               }
+
+               return $titles;
+       }
+
+       /**
+        * @return bool Always false; results are paged with an offset instead
+        */
+       public function isRandomized() {
+               return false;
+       }
+}
diff --git a/PageSuggester.php b/PageSuggester.php
new file mode 100644
index 0000000..669498f
--- /dev/null
+++ b/PageSuggester.php
@@ -0,0 +1,26 @@
+<?php
+
+namespace GettingStarted;
+
+interface PageSuggester {
+       /**
+        * Gets suggested articles
+        *
+        * @param int $count Number of articles to attempt to get;
+        *  May get less than this.
+        * @param int $offset Offset in results to start from; pass zero for no
+        *  offset (the parameter has no default).  Only useful for non-randomized
+        *  suggesters.
+        *
+        * @return array Array of up to $count suggested articles, as Title objects
+        */
+       public function getArticles( $count, $offset );
+
+       /**
+        * Returns whether this PageSuggester is randomized.
+        *
+        * If it is randomized, retries will yield different results (and thus retrying can
+        * be useful if PageFilter rejects some the first time), and non-zero offsets for
+        * getArticles do not make sense.
+        *
+        * @return bool True if randomized, false otherwise
+        */
+       public function isRandomized();
+}
diff --git a/PageSuggesterFactory.php b/PageSuggesterFactory.php
new file mode 100644
index 0000000..7d9099b
--- /dev/null
+++ b/PageSuggesterFactory.php
@@ -0,0 +1,44 @@
+<?php
+
+namespace GettingStarted;
+
+use Category, Title;
+
+class PageSuggesterFactory {
+       /**
+        * Gets the PageSuggester object for a given type
+        *
+        * A task name configured in $wgGettingStartedCategoriesForTaskTypes takes
+        * precedence and yields a CategoryPageSuggester.  Otherwise, the task name
+        * 'morelike' yields a MoreLikePageSuggester, provided the CirrusSearch
+        * class exists and a source title was given.
+        *
+        * @param string $taskName Name of task type, such as 'copyedit' or 'morelike'.
+        *  Factory will determine which backend this corresponds to, such as
+        *  CategoryPageSuggester or MoreLikePageSuggester
+        * @param \WebRequest $request Request to use (needed for DerivativeRequest in some
+        *  cases)
+        * @param \Title|null $sourceTitle Title of page used as a base for suggestions;
+        *  Required only for MoreLikePageSuggester, otherwise optional.
+        *
+        * @return PageSuggester|null PageSuggester object, or null if no valid suggester
+        *  is found
+        */
+       public static function getPageSuggester( $taskName, \WebRequest $request, $sourceTitle = null ) {
+               global $wgGettingStartedCategoriesForTaskTypes;
+
+               if ( isset( $wgGettingStartedCategoriesForTaskTypes[$taskName] ) ) {
+                       $sanitizedTitle = \Title::newFromText( $wgGettingStartedCategoriesForTaskTypes[ $taskName ] );
+
+                       // The configured value must parse as a title in the Category namespace
+                       if ( !( $sanitizedTitle && $sanitizedTitle->inNamespace( NS_CATEGORY ) ) ) {
+                               return null;
+                       }
+
+                       return new CategoryPageSuggester( \Category::newFromTitle( $sanitizedTitle ) );
+               }
+
+               // instanceof (rather than !== null) so that a non-Title value yields
+               // null here instead of failing the Title type hint in the
+               // MoreLikePageSuggester constructor.
+               if ( class_exists( 'CirrusSearch' ) &&
+                       $taskName === 'morelike' &&
+                       $sourceTitle instanceof \Title ) {
+
+                       return new MoreLikePageSuggester( $request, $sourceTitle );
+               }
+
+               return null;
+       }
+}
diff --git a/README b/README
index 2b1c9ce..9b67931 100644
--- a/README
+++ b/README
@@ -23,6 +23,9 @@
 CentralAuth (https://www.mediawiki.org/wiki/Extension:CentralAuth) is 
supported,
 but not required.
 
+CirrusSearch is optional, but required if you use the 'morelike' suggestion 
type
+(pages similar to a given page based on text).
+
 Dependencies
 
 * https://www.mediawiki.org/wiki/Extension:EventLogging - Used for logging
diff --git a/api/ApiGettingStartedGetPages.php 
b/api/ApiGettingStartedGetPages.php
index bc53bb8..e91ebc1 100644
--- a/api/ApiGettingStartedGetPages.php
+++ b/api/ApiGettingStartedGetPages.php
@@ -5,11 +5,9 @@
 use ApiBase, Category, Title;
 
 class ApiGettingStartedGetPages extends ApiBase {
-       const MAX_ATTEMPTS = 100;
+       const MAX_SUGGESTER_CALLS = 10;
 
        public function execute() {
-               global $wgGettingStartedCategoriesForTaskTypes;
-
                $result = $this->getResult();
 
                // For PageFilter and specifically userCan( 'edit' )
@@ -22,21 +20,15 @@
                        'titles' => array()
                );
 
-               if ( isset( $wgGettingStartedCategoriesForTaskTypes[ $taskName 
] ) ) {
-                       $sanitizedTitle = Title::newFromText( 
$wgGettingStartedCategoriesForTaskTypes[ $taskName ] );
+               $suggester = PageSuggesterFactory::getPageSuggester( $taskName, 
$this->getRequest(), $excludedTitle );
+               if ( $suggester === null ) {
+                       $this->dieUsage( "Invalid 'taskname' parameter, or 
excludedtitle not provided when task requires it" );
+               }
+               $pageFilter = new PageFilter( $user, $excludedTitle );
+               $titles = self::getArticles( $count, $suggester, $pageFilter );
 
-                       if ( $sanitizedTitle && $sanitizedTitle->inNamespace( 
NS_CATEGORY ) ) {
-                               $category = Category::newFromTitle( 
$sanitizedTitle );
-                               $pageFilter = new PageFilter( $user, 
$excludedTitle );
-                               $titles = self::getRandomArticles( $count, 
$category, $pageFilter );
-
-                               foreach ( $titles as $title ) {
-                                       $data['titles'][] = 
$title->getPrefixedText();
-                               }
-                       }
-               } else {
-                       // TODO (phuedx 2014-02-05): This is technically a
-                       // failure and should be logged.
+               foreach ( $titles as $title ) {
+                       $data['titles'][] = $title->getPrefixedText();
                }
 
                $result->setIndexedTagName( $data['titles'], 'title' );
@@ -44,56 +36,55 @@
        }
 
        /**
-        * Get a random set of $numWanted unique pages in the
-        * category. If fewer than $numWanted pages exist in category,
-        * return as many as are available. It is up to the caller to decide
-        * how to handle the deficit.
+        * Get a set of $numWanted unique pages from the suggester, filtered by 
the
+        * PageFilter.
         *
-        * @param int $numWanted Number of unique pages to get.
-        * @param Category $category category to choose from
+        * If fewer than $numWanted acceptable suggestions are available, return as
+        * many as are available. It is up to the caller to decide how to handle the deficit.
+        *
+        * @param int $numWanted Number of unique pages to get
+        * @param PageSuggester $suggester PageSuggester to use
         * @param PageFilter $pageFilter filter than can approve or reject a 
page
-        * @return array Set of $numWanted unique Title objects (or however many
-        *   were available, if the desired count was not satisfiable).
+        * @return array Array of $numWanted unique Title objects (or however 
many were
+        *  available, if the desired count was not satisfiable).
         */
-       protected function getRandomArticles( $numWanted, Category $category, 
PageFilter $pageFilter ) {
-               $key = RedisCategorySync::makeCategoryKey( $category );
-
-               $redis = RedisCategorySync::getClient();
-               if ( !$redis ) {
-                       wfDebugLog( 'GettingStarted', "Unable to acquire redis 
connection.\n" );
-                       return array();
-               }
-
-               // Map article ID to Title.  At the end, we simply return a 
non-associative array of Titles.
-               // However, sRandMember can return the same ID more than once.  
This allows us to easily
-               // avoid these duplicates with array_key_exists.
-               $titles = array();
-
+       protected function getArticles( $numWanted, PageSuggester $suggester, 
PageFilter $pageFilter ) {
+               // We either retry or push the offset, depending on whether the 
suggester is randomized
+               $totalResultCount = 0;
                $attempts = 0;
-               while ( count( $titles ) < $numWanted ) {
-                       $attempts++;
-                       // Sanity check to prevent calling srand or filter too 
many times
-                       if ( $attempts >= self::MAX_ATTEMPTS ) {
-                               wfDebugLog( 'GettingStarted', 'Returning early 
after ' . self::MAX_ATTEMPTS . ".\n" );
-                               return array_values( $titles );
-                       }
-                       try {
-                               $randomArticleID = $redis->sRandMember( $key );
-                               // If it's not numeric, it's most likely false, 
meaning empty set or Redis failure.
-                               if ( is_numeric( $randomArticleID ) && 
!array_key_exists( $randomArticleID, $titles ) ) {
-                                       $title = Title::newFromID( 
$randomArticleID );
-                                       // Null means the title no longer 
exists, possibly due to bug 56044
-                                       if ( $title !== null && 
$pageFilter->isAllowedPage( $title ) ) {
-                                               $titles[$randomArticleID] = 
$title;
-                                       }
-                               }
-                       } catch ( RedisException $e ) {
-                               wfDebugLog( 'GettingStarted', 'Redis exception: 
' . $e->getMessage() . ".  Returning early.\n" );
-                               return array_values( $titles );
-                       }
-               }
+               $offset = 0;
+               $isRandomized = $suggester->isRandomized();
+               $filteredTitles = array();
 
-               return array_values( $titles );
+               do {
+                       $unfilteredTitles = $suggester->getArticles( $numWanted 
- $totalResultCount, $offset );
+
+                       $newFilteredTitles = array_filter( $unfilteredTitles, 
array( $pageFilter, 'isAllowedPage' ) );
+                       $newFilteredTitles = array_udiff( $newFilteredTitles, 
$filteredTitles, function ( $t1, $t2 ) {
+                               return $t1->getArticleID() - 
$t2->getArticleID();
+                       } );
+                       $filteredTitles = array_merge( $filteredTitles, 
$newFilteredTitles );
+
+                       $totalResultCount = count( $filteredTitles );
+
+                       if ( !$isRandomized ) {
+                               $numUnfilteredTitles = count( $unfilteredTitles 
);
+                               $prevOffset = $offset;
+                               $offset += $numUnfilteredTitles;
+                       }
+                       $attempts++;
+               } while (
+                       $totalResultCount < $numWanted &&
+                       $attempts < self::MAX_SUGGESTER_CALLS &&
+                       (
+                               $isRandomized ||
+
+                               // If it's not randomized, only continue if 
some progress is being made
+                               $offset !== $prevOffset
+                       )
+               );
+
+               return $filteredTitles;
        }
 
        public function getDescription() {
@@ -105,8 +96,8 @@
        public function getParamDescription() {
                return array(
                        'taskname' => 'Task name, for example, "copyedit"',
-                       'excludedtitle' => 'Full title of a page to exclude 
from the list',
-                       'count' => 'Requested count; will attempt to fetch this 
exact number, but may fetch fewer if no more are found after multiple attempts'
+                       'excludedtitle' => 'Full title of a page to exclude 
from the list; also used as the base title for recommendations based on a given 
page',
+                       'count' => 'Requested count; will attempt to fetch this 
exact number, but may fetch fewer if no more are found after multiple attempts',
                );
        }
 

-- 
To view, visit https://gerrit.wikimedia.org/r/146663
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I90d8753efa87a4edc2d57cd67f64b8ec8d5a00ad
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/GettingStarted
Gerrit-Branch: master
Gerrit-Owner: Mattflaschen <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to