Mattflaschen has uploaded a new change for review.
https://gerrit.wikimedia.org/r/146663
Change subject: Refactor GettingStartedGetPages to support 'morelike'
suggestions
......................................................................
Refactor GettingStartedGetPages to support 'morelike' suggestions
* Factor out a PageSuggester interface
* Implement CategoryPageSuggester for existing Redis functionality
* Add MoreLikePageSuggester, which uses CirrusSearch's 'morelike'
functionality
Change-Id: I90d8753efa87a4edc2d57cd67f64b8ec8d5a00ad
---
A CategoryPageSuggester.php
M GettingStarted.php
A MoreLikePageSuggester.php
A PageSuggester.php
A PageSuggesterFactory.php
M README
M api/ApiGettingStartedGetPages.php
7 files changed, 249 insertions(+), 64 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/GettingStarted
refs/changes/63/146663/1
diff --git a/CategoryPageSuggester.php b/CategoryPageSuggester.php
new file mode 100644
index 0000000..a0302ae
--- /dev/null
+++ b/CategoryPageSuggester.php
@@ -0,0 +1,48 @@
+<?php
+
+namespace GettingStarted;
+
+// See PageSuggester for API documentation
+class CategoryPageSuggester implements PageSuggester {
+ /** @var \Category */
+ protected $category;
+
+ /**
+ * Constructs a CategoryPageSuggester that uses the given category
+ *
+ * @param \Category $category Category to use for suggestions
+ */
+ public function __construct( \Category $category ) {
+ $this->category = $category;
+ }
+
+ // $offset is ignored because it does not make sense when randomly
pulling articles
+ // out of Redis.
+ public function getArticles( $count, $offset ) {
+ $key = RedisCategorySync::makeCategoryKey( $this->category );
+
+ $redis = RedisCategorySync::getClient();
+ if ( !$redis ) {
+ wfDebugLog( 'GettingStarted', "Unable to acquire redis
connection. Returning early.\n" );
+ return array();
+ }
+
+ try {
+ $randomArticleIDs = $redis->sRandMember( $key, $count );
+ } catch ( RedisException $e ) {
+ wfDebugLog( 'GettingStarted', 'Redis exception: ' .
$e->getMessage() . ". Returning early.\n" );
+ return array();
+ }
+
+ if ( is_array( $randomArticleIDs ) ) {
+ return \Title::newFromIDs( $randomArticleIDs );
+ } else {
+ wfDebugLog( 'GettingStarted', 'Redis returned a
non-array value, possibly an error.' );
+ return array();
+ }
+ }
+
+ public function isRandomized() {
+ return true;
+ }
+}
diff --git a/GettingStarted.php b/GettingStarted.php
index 3c4eea2..59f8670 100644
--- a/GettingStarted.php
+++ b/GettingStarted.php
@@ -81,6 +81,10 @@
'GettingStarted\RedisCategorySync' => __DIR__ .
'/RedisCategorySync.php',
'GettingStarted\PageFilter' => __DIR__ . '/PageFilter.php',
'GettingStarted\ApiGettingStartedGetPages' => __DIR__ .
'/api/ApiGettingStartedGetPages.php',
+ 'GettingStarted\PageSuggesterFactory' => __DIR__ .
'/PageSuggesterFactory.php',
+ 'GettingStarted\PageSuggester' => __DIR__ . '/PageSuggester.php',
+ 'GettingStarted\CategoryPageSuggester' => __DIR__ .
'/CategoryPageSuggester.php',
+ 'GettingStarted\MoreLikePageSuggester' => __DIR__ .
'/MoreLikePageSuggester.php',
);
$wgMessagesDirs['GettingStarted'] = __DIR__ . '/i18n';
diff --git a/MoreLikePageSuggester.php b/MoreLikePageSuggester.php
new file mode 100644
index 0000000..76688a2
--- /dev/null
+++ b/MoreLikePageSuggester.php
@@ -0,0 +1,69 @@
+<?php
+
+namespace GettingStarted;
+
+use Title;
+
+class MoreLikePageSuggester implements PageSuggester {
+ /** @var WebRequest */
+ protected $request;
+
+ /** @var Title */
+ protected $baseTitle;
+
+ /**
+ * Constructs a MoreLikePageSuggester with the given
+ * request and base title
+ *
+ * @param WebRequest $request Original web request
+ * @param Title $baseTitle Title to base suggestions on
+ */
+ public function __construct( \WebRequest $request, Title $baseTitle ) {
+ $this->request = $request;
+ $this->baseTitle = $baseTitle;
+ }
+
+ public function getArticles( $count, $offset ) {
+ global $wgSearchTypeAlternatives;
+
+ $query = 'morelike:' . $this->baseTitle->getPrefixedDBkey();
+ $params = array(
+ 'action' => 'query',
+ 'list' => 'search',
+ 'srnamespace' => NS_MAIN,
+ 'srlimit' => $count,
+ 'sroffset' => $offset,
+ 'srsearch' => $query,
+ );
+
+ if ( $wgSearchTypeAlternatives !== null &&
+ count( $wgSearchTypeAlternatives ) > 0 ) {
+
+ $params['srbackend'] = 'CirrusSearch';
+ }
+
+ $searchApiCall = new \ApiMain(
+ new \DerivativeRequest(
+ $this->request,
+ $params,
+ false // Not posted
+ ),
+ false // Don't enable write
+ );
+ $searchApiCall->execute();
+ $apiResult = $searchApiCall->getResultData();
+ $titles = array();
+ if ( isset( $apiResult['query']['search'] ) && is_array(
$apiResult['query']['search'] ) ) {
+ $searchResults = $apiResult['query']['search'];
+ foreach ( $searchResults as $searchResult ) {
+ $titles[] = Title::newFromText(
$searchResult['title'] );
+ }
+ }
+
+ return $titles;
+ }
+
+ public function isRandomized() {
+ return false;
+ }
+}
diff --git a/PageSuggester.php b/PageSuggester.php
new file mode 100644
index 0000000..669498f
--- /dev/null
+++ b/PageSuggester.php
@@ -0,0 +1,26 @@
+<?php
+
+namespace GettingStarted;
+
+interface PageSuggester {
+ /**
+ * Gets suggested articles
+ *
+ * @param int $count Number of articles to attempt to get;
+ * May get less than this.
+ * @param int $offset Offset in results to start from; zero means
no offset.
+ * Only useful for non-randomized suggesters
+ *
+ * @return array Array of up to $count suggested articles, as Title
objects
+ */
+ public function getArticles( $count, $offset );
+
+ /**
+ * Returns whether this PageSuggester is randomized.
+ *
+ * If it is randomized, retries will yield different results (and thus
retrying can
+ * be useful if PageFilter rejects some the first time), and non-zero
offsets for
+ * getArticles do not make sense.
+ */
+ public function isRandomized();
+}
diff --git a/PageSuggesterFactory.php b/PageSuggesterFactory.php
new file mode 100644
index 0000000..7d9099b
--- /dev/null
+++ b/PageSuggesterFactory.php
@@ -0,0 +1,44 @@
+<?php
+
+namespace GettingStarted;
+
+use Category, Title;
+
+class PageSuggesterFactory {
+ /**
+ * Gets the PageSuggester object for a given type
+ *
+ * @param string $taskName Name of task type, such as 'copyedit' or
'morelike'.
+ * Factory will determine which backend this corresponds to, such as
+ * CategoryPageSuggester or MoreLikePageSuggester
+ * @param WebRequest $request Request to use (needed for
DerivativeRequest in some
+ * cases)
+ * @param Title $sourceTitle Title of page used as a base for
suggestions;
+ * Required only for MoreLikePageSuggester, otherwise optional.
+ *
+ * @return PageSuggester|null PageSuggester object, or null if no
valid suggester
+ * is found
+ */
+ public static function getPageSuggester( $taskName, \WebRequest
$request, $sourceTitle ) {
+ global $wgGettingStartedCategoriesForTaskTypes;
+
+ if ( isset( $wgGettingStartedCategoriesForTaskTypes[$taskName]
) ) {
+ $sanitizedTitle = \Title::newFromText(
$wgGettingStartedCategoriesForTaskTypes[ $taskName ] );
+
+ if ( !( $sanitizedTitle &&
$sanitizedTitle->inNamespace( NS_CATEGORY ) ) ) {
+ return null;
+ }
+
+ $category = \Category::newFromTitle( $sanitizedTitle );
+
+ return new CategoryPageSuggester( $category );
+ } else if ( class_exists( 'CirrusSearch' ) &&
+ $taskName === 'morelike' &&
+ $sourceTitle !== null ) {
+
+ return new MoreLikePageSuggester( $request,
$sourceTitle );
+ } else {
+ return null;
+ }
+ }
+}
diff --git a/README b/README
index 2b1c9ce..9b67931 100644
--- a/README
+++ b/README
@@ -23,6 +23,9 @@
CentralAuth (https://www.mediawiki.org/wiki/Extension:CentralAuth) is
supported,
but not required.
+CirrusSearch is optional, but required if you use the 'morelike' suggestion
type
+(pages similar to a given page based on text).
+
Dependencies
* https://www.mediawiki.org/wiki/Extension:EventLogging - Used for logging
diff --git a/api/ApiGettingStartedGetPages.php
b/api/ApiGettingStartedGetPages.php
index bc53bb8..e91ebc1 100644
--- a/api/ApiGettingStartedGetPages.php
+++ b/api/ApiGettingStartedGetPages.php
@@ -5,11 +5,9 @@
use ApiBase, Category, Title;
class ApiGettingStartedGetPages extends ApiBase {
- const MAX_ATTEMPTS = 100;
+ const MAX_SUGGESTER_CALLS = 10;
public function execute() {
- global $wgGettingStartedCategoriesForTaskTypes;
-
$result = $this->getResult();
// For PageFilter and specifically userCan( 'edit' )
@@ -22,21 +20,15 @@
'titles' => array()
);
- if ( isset( $wgGettingStartedCategoriesForTaskTypes[ $taskName
] ) ) {
- $sanitizedTitle = Title::newFromText(
$wgGettingStartedCategoriesForTaskTypes[ $taskName ] );
+ $suggester = PageSuggesterFactory::getPageSuggester( $taskName,
$this->getRequest(), $excludedTitle );
+ if ( $suggester === null ) {
+ $this->dieUsage( "Invalid 'taskname' parameter, or
excludedtitle not provided when task requires it" );
+ }
+ $pageFilter = new PageFilter( $user, $excludedTitle );
+ $titles = self::getArticles( $count, $suggester, $pageFilter );
- if ( $sanitizedTitle && $sanitizedTitle->inNamespace(
NS_CATEGORY ) ) {
- $category = Category::newFromTitle(
$sanitizedTitle );
- $pageFilter = new PageFilter( $user,
$excludedTitle );
- $titles = self::getRandomArticles( $count,
$category, $pageFilter );
-
- foreach ( $titles as $title ) {
- $data['titles'][] =
$title->getPrefixedText();
- }
- }
- } else {
- // TODO (phuedx 2014-02-05): This is technically a
- // failure and should be logged.
+ foreach ( $titles as $title ) {
+ $data['titles'][] = $title->getPrefixedText();
}
$result->setIndexedTagName( $data['titles'], 'title' );
@@ -44,56 +36,55 @@
}
/**
- * Get a random set of $numWanted unique pages in the
- * category. If fewer than $numWanted pages exist in category,
- * return as many as are available. It is up to the caller to decide
- * how to handle the deficit.
+ * Get a set of $numWanted unique pages from the suggester, filtered by
the
+ * PageFilter.
*
- * @param int $numWanted Number of unique pages to get.
- * @param Category $category category to choose from
+ * If fewer than $numWanted acceptable suggestions are available,
return as
+ * many as are available. It is up to the caller to decide how to
handle the deficit.
+ *
+ * @param int $numWanted Number of unique pages to get
+ * @param PageSuggester $suggester PageSuggester to use
* @param PageFilter $pageFilter filter than can approve or reject a
page
- * @return array Set of $numWanted unique Title objects (or however many
- * were available, if the desired count was not satisfiable).
+ * @return array Array of $numWanted unique Title objects (or however
many were
+ * available, if the desired count was not satisfiable).
*/
- protected function getRandomArticles( $numWanted, Category $category,
PageFilter $pageFilter ) {
- $key = RedisCategorySync::makeCategoryKey( $category );
-
- $redis = RedisCategorySync::getClient();
- if ( !$redis ) {
- wfDebugLog( 'GettingStarted', "Unable to acquire redis
connection.\n" );
- return array();
- }
-
- // Map article ID to Title. At the end, we simply return a
non-associative array of Titles.
- // However, sRandMember can return the same ID more than once.
This allows us to easily
- // avoid these duplicates with array_key_exists.
- $titles = array();
-
+ protected function getArticles( $numWanted, PageSuggester $suggester,
PageFilter $pageFilter ) {
+ // We either retry or push the offset, depending on whether the
suggester is randomized
+ $totalResultCount = 0;
$attempts = 0;
- while ( count( $titles ) < $numWanted ) {
- $attempts++;
- // Sanity check to prevent calling srand or filter too
many times
- if ( $attempts >= self::MAX_ATTEMPTS ) {
- wfDebugLog( 'GettingStarted', 'Returning early
after ' . self::MAX_ATTEMPTS . ".\n" );
- return array_values( $titles );
- }
- try {
- $randomArticleID = $redis->sRandMember( $key );
- // If it's not numeric, it's most likely false,
meaning empty set or Redis failure.
- if ( is_numeric( $randomArticleID ) &&
!array_key_exists( $randomArticleID, $titles ) ) {
- $title = Title::newFromID(
$randomArticleID );
- // Null means the title no longer
exists, possibly due to bug 56044
- if ( $title !== null &&
$pageFilter->isAllowedPage( $title ) ) {
- $titles[$randomArticleID] =
$title;
- }
- }
- } catch ( RedisException $e ) {
- wfDebugLog( 'GettingStarted', 'Redis exception:
' . $e->getMessage() . ". Returning early.\n" );
- return array_values( $titles );
- }
- }
+ $offset = 0;
+ $isRandomized = $suggester->isRandomized();
+ $filteredTitles = array();
- return array_values( $titles );
+ do {
+ $unfilteredTitles = $suggester->getArticles( $numWanted
- $totalResultCount, $offset );
+
+ $newFilteredTitles = array_filter( $unfilteredTitles,
array( $pageFilter, 'isAllowedPage' ) );
+ $newFilteredTitles = array_udiff( $newFilteredTitles,
$filteredTitles, function ( $t1, $t2 ) {
+ return $t1->getArticleID() -
$t2->getArticleID();
+ } );
+ $filteredTitles = array_merge( $filteredTitles,
$newFilteredTitles );
+
+ $totalResultCount = count( $filteredTitles );
+
+ if ( !$isRandomized ) {
+ $numUnfilteredTitles = count( $unfilteredTitles
);
+ $prevOffset = $offset;
+ $offset += $numUnfilteredTitles;
+ }
+ $attempts++;
+ } while (
+ $totalResultCount < $numWanted &&
+ $attempts < self::MAX_SUGGESTER_CALLS &&
+ (
+ $isRandomized ||
+
+ // If it's not randomized, only continue if
some progress is being made
+ $offset !== $prevOffset
+ )
+ );
+
+ return $filteredTitles;
}
public function getDescription() {
@@ -105,8 +96,8 @@
public function getParamDescription() {
return array(
'taskname' => 'Task name, for example, "copyedit"',
- 'excludedtitle' => 'Full title of a page to exclude
from the list',
- 'count' => 'Requested count; will attempt to fetch this
exact number, but may fetch fewer if no more are found after multiple attempts'
+ 'excludedtitle' => 'Full title of a page to exclude
from the list; also used as the base title for recommendations based on a given
page',
+ 'count' => 'Requested count; will attempt to fetch this
exact number, but may fetch fewer if no more are found after multiple attempts',
);
}
--
To view, visit https://gerrit.wikimedia.org/r/146663
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I90d8753efa87a4edc2d57cd67f64b8ec8d5a00ad
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/GettingStarted
Gerrit-Branch: master
Gerrit-Owner: Mattflaschen <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits