EBernhardson has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/325328

Change subject: Add job queue option for initImageData maintenance script
......................................................................

Add job queue option for initImageData maintenance script

Running this script in the cluster fatals out due to memory problems
somewhat regularly. The --start option helps to restart it where it
fell down, but when running against hundreds of wikis that is a
one-off solution that makes it a pain to ensure everything is
actually visited.

To try to isolate errors, add an option to push the parsing into the
job queue. It is still possible to miss pages, but job queue retries
should take care of us for the most part. The script attempts to keep
load down on the databases by making sure no more than a specified
number of jobs are queued or processing at any given time.

Bug: T152155
Change-Id: I3a4e3a415b2f03de0bb36ac0515241e950130fde
---
M extension.json
A includes/Job/InitImageDataJob.php
M maintenance/initImageData.php
3 files changed, 45 insertions(+), 5 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/PageImages 
refs/changes/28/325328/1

diff --git a/extension.json b/extension.json
index 4e74d66..f78c44a 100644
--- a/extension.json
+++ b/extension.json
@@ -18,7 +18,8 @@
                "ApiQueryPageImages": "includes/ApiQueryPageImages.php",
                "PageImages": "includes/PageImages.php",
                "PageImages\\Hooks\\LinksUpdateHookHandler": 
"includes/LinksUpdateHookHandler.php",
-               "PageImages\\Hooks\\ParserFileProcessingHookHandlers": 
"includes/ParserFileProcessingHookHandlers.php"
+               "PageImages\\Hooks\\ParserFileProcessingHookHandlers": 
"includes/ParserFileProcessingHookHandlers.php",
+               "PageImages\\Job\\InitImageDataJob": 
"includes/Job/InitImageDataJob.php"
        },
        "Hooks": {
                "ParserMakeImageParams": 
"PageImages\\Hooks\\ParserFileProcessingHookHandlers::onParserMakeImageParams",
@@ -29,6 +30,9 @@
                "AfterParserFetchFileAndTitle": 
"PageImages\\Hooks\\ParserFileProcessingHookHandlers::onAfterParserFetchFileAndTitle",
                "SpecialMobileEditWatchlist::images": 
"PageImages::onSpecialMobileEditWatchlist_images"
        },
+       "JobClasses": {
+               "InitImageDataJob": "PageImages\\Job\\InitImageDataJob"
+       },
        "config": {
                "PageImagesScores": {
                        "value": {
diff --git a/includes/Job/InitImageDataJob.php 
b/includes/Job/InitImageDataJob.php
new file mode 100644
index 0000000..c35adf8
--- /dev/null
+++ b/includes/Job/InitImageDataJob.php
@@ -0,0 +1,17 @@
+<?php
+
+namespace PageImages\Job;
+
+use MediaWiki\MediaWikiServices;
+use RefreshLinks;
+
+class InitImageDataJob extends Job {
+       public function run() {
+               $lbFactory = 
MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+
+               foreach ( $this->params['page_ids'] as $id ) {
+                       RefreshLinks::fixLinksFromArticle( $id );
+                       $lbFactory->waitForReplication();
+               }
+       }
+}
diff --git a/maintenance/initImageData.php b/maintenance/initImageData.php
index af3453d..0463056 100644
--- a/maintenance/initImageData.php
+++ b/maintenance/initImageData.php
@@ -7,6 +7,7 @@
 require_once ( "$IP/maintenance/Maintenance.php" );
 
 use MediaWiki\MediaWikiServices;
+use PageImages\Job\InitImageDataJob;
 
 /**
  * @license WTFPL 2.0
@@ -21,6 +22,7 @@
                $this->addOption( 'earlier-than',
                        'Run only on pages earlier than this timestamp', false, 
true );
                $this->addOption( 'start', 'Starting page ID', false, true );
+               $this->addOption( 'queue-pressure', 'Maximum number of jobs to 
enqueue at a time. If unprovided or 0 will be run in-process.', false, true );
                $this->setBatchSize( 100 );
        }
 
@@ -28,7 +30,11 @@
                global $wgPageImagesNamespaces;
 
                $id = $this->getOption( 'start', 0 );
-               $lbFactory = 
MediaWikiServices::getInstance()->getDBLoadBalancerFactory();
+               $queue = null;
+               $maxPressure = $this->getOption( 'queue-pressure', 0 );
+               if ( $maxPressure > 0 ) {
+                       $queue = JobQueueGroup::singleton();
+               }
 
                do {
                        $tables = [ 'page', 'imagelinks' ];
@@ -57,15 +63,28 @@
                                [ 'LIMIT' => $this->mBatchSize, 'ORDER_BY' => 
'page_id', 'GROUP BY' => 'page_id' ],
                                $joinConds
                        );
+                       $page_ids = [];
                        foreach ( $res as $row ) {
-                               $id = $row->page_id;
-                               RefreshLinks::fixLinksFromArticle( $id );
-                               $lbFactory->waitForReplication();
+                               $page_ids[] = $row->page_id;
+                       }
+                       $job = new InitImageDataJob( Title::newMainPage(), [ 
'page_ids' => $page_ids ] );
+                       if ( $queue === null ) {
+                               $job->run();
+                       } else {
+                               $queue->push( $job );
+                               do {
+                                       sleep(1);
+                               } while ( $this->getJobPressure( $queue ) >= 
$maxPressure );
                        }
                        $this->output( "$id\n" );
                } while ( $res->numRows() );
                $this->output( "done\n" );
        }
+
+       private function getJobPressure( $queue ) {
+               $group = $queue->get( 'InitImageDataJob' );
+               return $group->getSize() + $group->getAcquiredCount();
+       }
 }
 
 $maintClass = 'InitImageData';

-- 
To view, visit https://gerrit.wikimedia.org/r/325328
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I3a4e3a415b2f03de0bb36ac0515241e950130fde
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/PageImages
Gerrit-Branch: master
Gerrit-Owner: EBernhardson <ebernhard...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to