Aaron Schulz has uploaded a new change for review.
https://gerrit.wikimedia.org/r/99033
Change subject: [WIP] Converted jobs to use BacklinkJobUtils
......................................................................
[WIP] Converted jobs to use BacklinkJobUtils
Change-Id: I5d5af179c342c7246055a114718eed29abc07ee0
---
M php/Parsoid.hooks.php
M php/ParsoidCacheUpdateJob.php
2 files changed, 81 insertions(+), 203 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Parsoid
refs/changes/33/99033/1
diff --git a/php/Parsoid.hooks.php b/php/Parsoid.hooks.php
index 5ddadbb..1d5bba7 100644
--- a/php/Parsoid.hooks.php
+++ b/php/Parsoid.hooks.php
@@ -18,6 +18,7 @@
$params = array( 'type' => $type );
if ( $type == 'OnDependencyChange' ) {
$params['table'] = $table;
+ $params['recursive'] = true;
return $params + Job::newRootJobParams(
"ParsoidCacheUpdateJob{$type}:{$title->getPrefixedText()}");
} else {
diff --git a/php/ParsoidCacheUpdateJob.php b/php/ParsoidCacheUpdateJob.php
index 7d7c73c..bce144c 100644
--- a/php/ParsoidCacheUpdateJob.php
+++ b/php/ParsoidCacheUpdateJob.php
@@ -2,37 +2,25 @@
/**
* HTML cache refreshing and -invalidation job for the Parsoid varnish caches.
+ *
+ * This job comes in a few variants:
+ * - a) Recursive jobs to purge caches for backlink pages for a given title.
+ * They have have
(type:OnDependencyChange,recursive:true,table:<table>) set.
+ * - b) Jobs to purge caches for a set of titles (the job title is ignored).
+ * They have have (type:OnDependencyChange,pages:(<page
ID>:(<namespace>,<title>),...) set.
+ * - c) Jobs to purge caches for a single page (the job title)
+ * They have (type:OnEdit) set.
+ *
* See
*
http://www.mediawiki.org/wiki/Parsoid/Minimal_performance_strategy_for_July_release
- * @TODO: This is mostly a copy of the HTMLCacheUpdate code. Eventually extend
- * some generic backlink job base class in core
*/
class ParsoidCacheUpdateJob extends Job {
- /** @var BacklinkCache */
- protected $blCache;
-
- protected $rowsPerJob;
-
- /**
- * Construct a job
- * @param $title Title: the title linked to
- * @param array $params job parameters (table, start and end page_ids)
- * @param $id Integer: job id
- */
function __construct( $title, $params, $id = 0 ) {
- wfDebug( "ParsoidCacheUpdateJob.__construct " . $title . "\n" );
- global $wgParsoidCacheUpdateTitlesPerJob;
-
// Map old jobs to new 'OnEdit' jobs
- if ( ! isset( $params['type'] ) ) {
- $params['type'] = 'OnEdit';
+ if ( !isset( $params['type'] ) ) {
+ $params['type'] = 'OnEdit'; // b/c
}
- parent::__construct( 'ParsoidCacheUpdateJob' . $params['type'],
- $title, $params, $id );
-
- $this->rowsPerJob = $wgParsoidCacheUpdateTitlesPerJob;
-
- $this->blCache = $title->getBacklinkCache();
+ parent::__construct( 'ParsoidCacheUpdateJob' . $params['type'],
$title, $params, $id );
if ( $params['type'] == 'OnEdit' ) {
// Simple duplicate removal for single-title jobs.
Other jobs are
@@ -41,162 +29,74 @@
}
}
- public function run() {
- if ( isset( $this->params['table'] ) ) {
- if ( isset( $this->params['start'] ) && isset(
$this->params['end'] ) ) {
- # This is a child job working on a sub-range of
a large number of
- # titles.
- return $this->doPartialUpdate();
- } else {
- # Update all pages depending on this resource
(transclusion or
- # file)
- return $this->doFullUpdate();
+ function run() {
+ global $wgParsoidCacheUpdateTitlesPerJob, $wgUpdateRowsPerJob,
$wgMaxBacklinksInvalidate;
+
+ if ( $this->params['type'] === 'OnEdit' ) {
+ $this->invalidateTitle( $this->title );
+ } elseif ( $this->params['type'] === 'OnDependencyChange' ) {
+ static $expected = array( 'recursive', 'pages' ); //
new jobs have one of these
+
+ $oldRangeJob = false;
+ if ( !array_intersect( array_keys( $this->params ),
$expected ) ) {
+ // B/C for older job params formats that lack
these fields:
+ // a) base jobs with just ("table") and b)
range jobs with ("table","start","end")
+ if ( isset( $this->params['start'] ) && isset(
$this->params['end'] ) ) {
+ $oldRangeJob = true;
+ } else {
+ $this->params['recursive'] = true; //
base job
+ }
}
- } else {
- # Refresh the Parsoid cache for the page itself
- return $this->invalidateTitle( $this->title );
- }
- }
- /**
- * Update all of the backlinks
- */
- protected function doFullUpdate() {
- global $wgParsoidMaxBacklinksInvalidate;
+ // Job to purge all (or a range of) backlink pages for
a page
+ if ( !empty( $this->params['recursive'] ) ) {
+ // @TODO: try to use delayed jobs if possible?
+ if ( !isset( $this->params['range'] ) &&
$wgMaxBacklinksInvalidate !== false ) {
+ $numRows =
$this->title->getBacklinkCache()->getNumLinks(
+ $this->params['table'],
$wgMaxBacklinksInvalidate );
+ if ( $numRows >
$wgMaxBacklinksInvalidate ) {
+ return true;
+ }
+ }
+ // Convert this into some title-batch jobs and
possibly a
+ // recursive ParsoidCacheUpdateJob job for the
rest of the backlinks
+ $jobs = BacklinkJobUtils::partitionBacklinkJob(
+ $this,
+ $wgUpdateRowsPerJob,
+ $wgParsoidCacheUpdateTitlesPerJob, //
jobs-per-title
+ // Carry over information for
de-duplication
+ array( 'params' =>
+ array( 'type' =>
'OnDependencyChange' ) + $this->getRootJobParams() )
+ );
+ JobQueueGroup::singleton()->push( $jobs );
+ // Job to purge pages for for a set of titles
+ } elseif ( isset( $this->params['pages'] ) ) {
+ $this->invalidateTitles( $this->params['pages']
);
+ // B/C for job to purge a range of backlink pages for a
given page
+ } elseif ( $oldRangeJob ) {
+ $titleArray =
$this->title->getBacklinkCache()->getLinks(
+ $this->params['table'],
$this->params['start'], $this->params['end'] );
- # Get an estimate of the number of rows from the BacklinkCache
- $max = max( $this->rowsPerJob * 2,
$wgParsoidMaxBacklinksInvalidate ) + 1;
- $numRows = $this->blCache->getNumLinks( $this->params['table'],
$max );
- if ( $wgParsoidMaxBacklinksInvalidate !== false
- && $numRows > $wgParsoidMaxBacklinksInvalidate ) {
- wfDebug( "Skipped HTML cache invalidation of
{$this->title->getPrefixedText()}." );
- return true;
- }
+ $pages = array(); // same format
BacklinkJobUtils uses
+ foreach ( $titleArray as $tl ) {
+ $pages[$tl->getArticleId()] = array(
$tl->getNamespace(), $tl->getDbKey() );
+ }
- if ( $numRows > $this->rowsPerJob * 2 ) {
- # Do fast cached partition
- $this->insertPartitionJobs();
- } else {
- # Get the links from the DB
- $titleArray = $this->blCache->getLinks(
$this->params['table'] );
- # Check if the row count estimate was correct
- if ( $titleArray->count() > $this->rowsPerJob * 2 ) {
- # Not correct, do accurate partition
- wfDebug( __METHOD__ . ": row count estimate was
incorrect, repartitioning\n" );
- $this->insertJobsFromTitles( $titleArray );
- } else {
- return $this->invalidateTitles( $titleArray );
// just do the query
+ $jobs = array();
+ foreach ( array_chunk(
$wgParsoidCacheUpdateTitlesPerJob, $pages ) as $pageChunk ) {
+ $jobs[] = new ParsoidCacheUpdateJob(
$this->title,
+ array(
+ 'type' =>
'OnDependencyChange',
+ 'table' => $this->table,
+ 'pages' => $pageChunk
+ ) + $this->getRootJobParams()
// carry over information for de-duplication
+ );
+ }
+ JobQueueGroup::singleton()->push( $jobs );
}
}
return true;
- }
-
- /**
- * Update some of the backlinks, defined by a page ID range
- */
- protected function doPartialUpdate() {
- $titleArray = $this->blCache->getLinks(
- $this->params['table'], $this->params['start'],
$this->params['end'] );
- if ( $titleArray->count() <= $this->rowsPerJob * 2 ) {
- # This partition is small enough, do the update
- return $this->invalidateTitles( $titleArray );
- } else {
- # Partitioning was excessively inaccurate. Divide the
job further.
- # This can occur when a large number of links are added
in a short
- # period of time, say by updating a heavily-used
template.
- $this->insertJobsFromTitles( $titleArray );
- return true;
- }
- }
-
- /**
- * Partition the current range given by $this->params['start'] and
$this->params['end'],
- * using a pre-calculated title array which gives the links in that
range.
- * Queue the resulting jobs.
- *
- * @param $titleArray array
- * @param $rootJobParams array
- * @return void
- */
- protected function insertJobsFromTitles( $titleArray, $rootJobParams =
array() ) {
- // Carry over any "root job" information
- $rootJobParams = $this->getRootJobParams();
- # We make subpartitions in the sense that the start of the
first job
- # will be the start of the parent partition, and the end of the
last
- # job will be the end of the parent partition.
- $jobs = array();
- $start = $this->params['start']; # start of the current job
- $numTitles = 0;
- foreach ( $titleArray as $title ) {
- $id = $title->getArticleID();
- # $numTitles is now the number of titles in the current
job not
- # including the current ID
- if ( $numTitles >= $this->rowsPerJob ) {
- # Add a job up to but not including the current
ID
- $jobs[] = new ParsoidCacheUpdateJob(
$this->title,
- array(
- 'table' =>
$this->params['table'],
- 'start' => $start,
- 'end' => $id - 1,
- 'type' => 'OnDependencyChange'
- ) + $rootJobParams // carry over
information for de-duplication
- );
- $start = $id;
- $numTitles = 0;
- }
- $numTitles++;
- }
- # Last job
- $jobs[] = new ParsoidCacheUpdateJob( $this->title,
- array(
- 'table' => $this->params['table'],
- 'start' => $start,
- 'end' => $this->params['end'],
- 'type' => 'OnDependencyChange'
- ) + $rootJobParams // carry over information for
de-duplication
- );
- wfDebug( __METHOD__ . ": repartitioning into " . count( $jobs )
. " jobs\n" );
-
- if ( count( $jobs ) < 2 ) {
- # I don't think this is possible at present, but
handling this case
- # makes the code a bit more robust against future code
updates and
- # avoids a potential infinite loop of repartitioning
- wfDebug( __METHOD__ . ": repartitioning failed!\n" );
- $this->invalidateTitles( $titleArray );
- } else {
- JobQueueGroup::singleton()->push( $jobs );
- }
- }
-
-
- /**
- * @param $rootJobParams array
- * @return void
- */
- protected function insertPartitionJobs( $rootJobParams = array() ) {
- // Carry over any "root job" information
- $rootJobParams = $this->getRootJobParams();
-
- $batches = $this->blCache->partition( $this->params['table'],
$this->rowsPerJob );
- if ( !count( $batches ) ) {
- return; // no jobs to insert
- }
-
- $jobs = array();
- foreach ( $batches as $batch ) {
- list( $start, $end ) = $batch;
- $jobs[] = new ParsoidCacheUpdateJob( $this->title,
- array(
- 'table' => $this->params['table'],
- 'start' => $start,
- 'end' => $end,
- 'type' => 'OnDependencyChange'
- ) + $rootJobParams // carry over information
for de-duplication
- );
- }
-
- JobQueueGroup::singleton()->push( $jobs );
}
/**
@@ -239,7 +139,7 @@
* Parsoid reuse transclusion and extension expansions.
* @param $title Title
*/
- protected function invalidateTitle( $title ) {
+ protected function invalidateTitle( Title $title ) {
global $wgParsoidCacheServers;
# First request the new version
@@ -258,7 +158,7 @@
'Cache-control: no-cache'
)
);
- };
+ }
wfDebug( "ParsoidCacheUpdateJob::invalidateTitle: " .
serialize( $requests ) . "\n" );
$this->checkCurlResults( CurlMultiClient::request( $requests )
);
@@ -271,7 +171,7 @@
$requests[] = array(
'url' => $this->getParsoidURL( $title, $server,
true )
);
- };
+ }
$options = CurlMultiClient::getDefaultOptions();
$options[CURLOPT_CUSTOMREQUEST] = "PURGE";
$this->checkCurlResults( CurlMultiClient::request( $requests,
$options ) );
@@ -283,10 +183,11 @@
* Invalidate an array (or iterator) of Title objects, right now. Send
* headers that signal Parsoid which of transclusions or extensions need
* to be updated.
- * @param $titleArray array
+ * @param $pages array (page ID => (namespace, DB key)) mapping
*/
- protected function invalidateTitles( $titleArray ) {
+ protected function invalidateTitles( array $pages ) {
global $wgParsoidCacheServers, $wgLanguageCode;
+
if ( !isset( $wgParsoidCacheServers ) ) {
$wgParsoidCacheServers = array( 'localhost' );
}
@@ -302,7 +203,8 @@
# Build an array of update requests
$requests = array();
foreach ( $wgParsoidCacheServers as $server ) {
- foreach ( $titleArray as $title ) {
+ foreach ( $pages as $id => $nsDbKey ) {
+ $title = Title::makeTitle( $nsDbKey[0],
$nsDbKey[1] );
# TODO, but low prio: if getLatestRevID returns
0, only purge title (deletion).
# Low prio because VE would normally refuse to
load the page
# anyway, and no private info is exposed.
@@ -329,30 +231,5 @@
serialize( $requests ) . "\n" );
return $this->getLastError() == null;
-
- /*
- # PURGE
- # Not needed with implicit updates (see above)
- # Build an array of purge requests
- $requests = array();
- foreach ( $wgParsoidCacheServers as $server ) {
- foreach ( $titleArray as $title ) {
- $url = $this->getParsoidURL( $title, $server, false );
-
- $requests[] = array(
- 'url' => $url
- );
- }
- }
-
- $options = CurlMultiClient::getDefaultOptions();
- $options[CURLOPT_CUSTOMREQUEST] = "PURGE";
- // Now send off all those purge requests
- CurlMultiClient::request( $requests, $options );
-
- wfDebug('ParsoidCacheUpdateJob::invalidateTitles purge: ' .
- serialize($requests) . "\n" );
- */
}
-
}
--
To view, visit https://gerrit.wikimedia.org/r/99033
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I5d5af179c342c7246055a114718eed29abc07ee0
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Parsoid
Gerrit-Branch: master
Gerrit-Owner: Aaron Schulz <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits