Daniel Kinzler has uploaded a new change for review.
https://gerrit.wikimedia.org/r/92644
Change subject: Make dumpJson.php robust against failures.
......................................................................
Make dumpJson.php robust against failures.
This introduces an ExceptionHandler interface to handle
exceptions in a way other than throwing them. This is used
to continue processing after logging errors that may occurr
when generating a (potentially very large) json dump.
Change-Id: I282d5304e584c48355950de7011c90dbe60f52e3
---
M lib/includes/Dumpers/JsonDumpGenerator.php
M lib/includes/IO/EntityIdReader.php
M lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
A lib/tests/phpunit/IO/EntityIdReaderTest.bad.txt
M lib/tests/phpunit/IO/EntityIdReaderTest.txt
M lib/tests/phpunit/IO/EntrityIdReaderTest.php
M repo/Wikibase.classes.php
M repo/maintenance/dumpJson.php
8 files changed, 285 insertions(+), 38 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase
refs/changes/44/92644/1
diff --git a/lib/includes/Dumpers/JsonDumpGenerator.php
b/lib/includes/Dumpers/JsonDumpGenerator.php
index 09c8439..49703b8 100644
--- a/lib/includes/Dumpers/JsonDumpGenerator.php
+++ b/lib/includes/Dumpers/JsonDumpGenerator.php
@@ -2,7 +2,11 @@
namespace Wikibase\Dumpers;
+use ExceptionHandler;
+use MessageReporter;
use MWException;
+use NullMessageReporter;
+use RethrowingExceptionHandler;
use Traversable;
use Wikibase\DataModel\Entity\EntityId;
use Wikibase\EntityLookup;
@@ -23,6 +27,11 @@
* @var int flags to use with json_encode as a bit field, see PHP's
JSON_XXX constants.
*/
public $jsonFlags = 0;
+
+ /**
+ * @var int interval at which to output progress messages.
+ */
+ public $progressInterval = 100;
/**
* @var resource File handle for output
@@ -55,6 +64,16 @@
protected $entityType = null;
/**
+ * @var MessageReporter
+ */
+ protected $progressReporter;
+
+ /**
+ * @var ExceptionHandler
+ */
+ protected $exceptionHandler;
+
+ /**
* @param resource $out
* @param \Wikibase\EntityLookup $lookup
* @param Serializer $entitySerializer
@@ -69,6 +88,55 @@
$this->out = $out;
$this->entitySerializer = $entitySerializer;
$this->entityLookup = $lookup;
+
+ $this->progressReporter = new NullMessageReporter();
+ $this->exceptionHandler = new RethrowingExceptionHandler();
+ }
+
+ /**
+ * Sets the interval for progress reporting
+ *
+ * @param int $progressInterval
+ */
+ public function setProgressInterval( $progressInterval ) {
+ $this->progressInterval = $progressInterval;
+ }
+
+ /**
+ * Returns the interval for progress reporting
+ *
+ * @return int
+ */
+ public function getProgressInterval() {
+ return $this->progressInterval;
+ }
+
+ /**
+ * @param \MessageReporter $progressReporter
+ */
+ public function setProgressReporter( MessageReporter $progressReporter
) {
+ $this->progressReporter = $progressReporter;
+ }
+
+ /**
+ * @return \MessageReporter
+ */
+ public function getProgressReporter() {
+ return $this->progressReporter;
+ }
+
+ /**
+ * @param \ExceptionHandler $exceptionHandler
+ */
+ public function setExceptionHandler( ExceptionHandler $exceptionHandler
) {
+ $this->exceptionHandler = $exceptionHandler;
+ }
+
+ /**
+ * @return \ExceptionHandler
+ */
+ public function getExceptionHandler() {
+ return $this->exceptionHandler;
}
/**
@@ -134,15 +202,25 @@
}
$entity = $this->entityLookup->getEntity( $id );
+
+ if ( !$entity ) {
+ throw new StorageException( 'Entity not
found: ' . $id->getSerialization() );
+ }
+
$data = $this->entitySerializer->getSerialized(
$entity );
$json = $this->encode( $data );
+
$this->writeToDump( $json );
} catch ( StorageException $ex ) {
- $this->handleStorageException( $ex );
+ $this->exceptionHandler->handleException( $ex,
'failed-to-dump', 'Failed to dump '. $id );
}
- //TODO: use callback for reporting progress
+ if ( $this->progressInterval > 0 && ( $i %
$this->progressInterval ) === 0 ) {
+ $this->progressReporter->reportMessage(
'Processed ' . $i . ' entities.' );
+ }
}
+
+ $this->progressReporter->reportMessage( 'Processed ' . $i . '
entities.' );
$json = "\n]\n"; //TODO: make optional
$this->writeToDump( $json );
@@ -167,15 +245,6 @@
}
/**
- * @param $ex
- * @throws StorageException
- */
- private function handleStorageException( $ex ) {
- //TODO: optionally, log & ignore.
- throw $ex;
- }
-
- /**
* Encodes the given data as JSON
*
* @param $data
@@ -187,8 +256,7 @@
$json = json_encode( $data, $this->jsonFlags );
if ( $json === false ) {
- // TODO: optionally catch & skip this
- throw new MWException( 'Failed to encode data
structure.' );
+ throw new StorageException( 'Failed to encode data
structure.' );
}
return $json;
diff --git a/lib/includes/IO/EntityIdReader.php
b/lib/includes/IO/EntityIdReader.php
index b4b62dc..85e2e21 100644
--- a/lib/includes/IO/EntityIdReader.php
+++ b/lib/includes/IO/EntityIdReader.php
@@ -3,7 +3,9 @@
namespace Wikibase\IO;
use Disposable;
+use ExceptionHandler;
use Iterator;
+use ValueParsers\ParseException;
use Wikibase\DataModel\Entity\EntityId;
use Wikibase\Lib\EntityIdParser;
@@ -21,6 +23,16 @@
protected $reader;
/**
+ * @var ExceptionHandler
+ */
+ protected $exceptionHandler;
+
+ /**
+ * @var EntityId|null
+ */
+ protected $current = null;
+
+ /**
* @param resource $fileHandle The file to read from.
* @param bool $canClose Whether calling dispose() should close the
fine handle.
* @param bool $autoDispose Whether to automatically call dispose()
when reaching EOF.
@@ -30,24 +42,67 @@
public function __construct( $fileHandle, $canClose = true,
$autoDispose = false ) {
$this->reader = new LineReader( $fileHandle, $canClose,
$autoDispose );
$this->parser = new EntityIdParser(); //TODO: inject?
+
+ $this->exceptionHandler = new \RethrowingExceptionHandler();
+ }
+
+ /**
+ * @param \ExceptionHandler $exceptionHandler
+ */
+ public function setExceptionHandler( $exceptionHandler ) {
+ $this->exceptionHandler = $exceptionHandler;
+ }
+
+ /**
+ * @return \ExceptionHandler
+ */
+ public function getExceptionHandler() {
+ return $this->exceptionHandler;
}
/**
* @param string $line
- * @return EntityId
+ * @return EntityId|null
*/
protected function lineToId( $line ) {
$line = trim( $line );
- $id = $this->parser->parse( $line );
- //TODO: optionally catch, log & ignore ParseException
+
+ try {
+ $id = $this->parser->parse( $line );
+ } catch ( ParseException $ex ) {
+ $this->exceptionHandler->handleException( $ex,
'bad-entity-id', "Failed to parse Entity ID $line" );
+ $id = null;
+ }
return $id;
}
/**
+ * Closes the underlying input stream
*/
public function dispose() {
$this->reader->dispose();
+ }
+
+ /**
+ * Returns the current ID, or, if that has been consumed, finds the
next ID on
+ * the input stream and return it.
+ *
+ * @return EntityId|null
+ */
+ protected function fill() {
+ while ( $this->current === null && $this->reader->valid() ) {
+ $line = trim( $this->reader->current() );
+ $this->reader->next();
+
+ if ( $line === '' ) {
+ continue;
+ }
+
+ $this->current = $this->lineToId( $line );
+ };
+
+ return $this->current;
}
/**
@@ -57,8 +112,8 @@
* @return EntityId
*/
public function current() {
- $line = $this->reader->current();
- return $this->lineToId( $line );
+ $id = $this->fill();
+ return $id;
}
/**
@@ -67,9 +122,8 @@
* @see LineReader::next()
*/
public function next() {
- do {
- $this->reader->next();
- } while ( $this->reader->valid() && trim(
$this->reader->current() ) === '' );
+ $this->current = null; // consume current
+ $this->fill();
}
/**
@@ -85,13 +139,14 @@
* @return boolean
*/
public function valid() {
- return $this->reader->valid();
+ return $this->fill() !== null;
}
/**
* @see LineReader::rewind()
*/
public function rewind() {
+ $this->current = null;
$this->reader->rewind();
}
diff --git a/lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
b/lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
index 6805f6f..b52a964 100644
--- a/lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
+++ b/lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
@@ -12,6 +12,7 @@
use Wikibase\EntityFactory;
use Wikibase\Item;
use Wikibase\Property;
+use Wikibase\StorageException;
/**
* @covers Wikibase\Dumpers\JsonDumpGenerator
@@ -49,7 +50,7 @@
*
* @return JsonDumpGenerator
*/
- protected function newDumpGenerator( array $ids = array() ) {
+ protected function newDumpGenerator( array $ids = array(), array
$missingIds = array() ) {
$out = fopen( 'php://output', 'w' );
$serializer = $this->getMock(
'Wikibase\Lib\Serializers\Serializer' );
@@ -67,7 +68,11 @@
$entityLookup = $this->getMock( 'Wikibase\EntityLookup' );
$entityLookup->expects( $this->any() )
->method( 'getEntity' )
- ->will( $this->returnCallback( function ( EntityId $id
) use ( $entities ) {
+ ->will( $this->returnCallback( function ( EntityId $id
) use ( $entities, $missingIds ) {
+ if ( in_array( $id, $missingIds ) ) {
+ return null;
+ }
+
$key = $id->getPrefixedId();
return $entities[$key];
}
@@ -248,4 +253,53 @@
);
}
+ public function testExceptionHandler() {
+ $ids = array();
+ $missingIds = array();
+
+ for ( $i = 1; $i<=100; $i++) {
+ $id = new ItemId( "Q$i" );
+ $ids[] = $id;
+
+ if ( ( $i % 10 ) === 0 ) {
+ $missingIds[] = $id;
+ }
+ }
+
+ $dumper = $this->newDumpGenerator( $ids, $missingIds );
+ $idList = new ArrayObject( $ids );
+
+ $exceptionHandler = $this->getMock( 'ExceptionHandler' );
+ $exceptionHandler->expects( $this->exactly( count( $missingIds
) ) )
+ ->method( 'handleException' );
+
+ $dumper->setExceptionHandler( $exceptionHandler );
+
+ ob_start();
+ $dumper->generateDump( $idList );
+ ob_end_clean();
+ }
+
+ public function testProgressReporter() {
+ $ids = array();
+
+ for ( $i = 1; $i<=100; $i++) {
+ $id = new ItemId( "Q$i" );
+ $ids[] = $id;
+ }
+
+ $dumper = $this->newDumpGenerator( $ids );
+ $idList = new ArrayObject( $ids );
+
+ $progressReporter = $this->getMock( 'MessageReporter' );
+ $progressReporter->expects( $this->exactly( ( count( $ids ) /
10 ) +1 ) )
+ ->method( 'reportMessage' );
+
+ $dumper->setProgressInterval( 10 );
+ $dumper->setProgressReporter( $progressReporter );
+
+ ob_start();
+ $dumper->generateDump( $idList );
+ ob_end_clean();
+ }
}
diff --git a/lib/tests/phpunit/IO/EntityIdReaderTest.bad.txt
b/lib/tests/phpunit/IO/EntityIdReaderTest.bad.txt
new file mode 100644
index 0000000..9c03cdf
--- /dev/null
+++ b/lib/tests/phpunit/IO/EntityIdReaderTest.bad.txt
@@ -0,0 +1,4 @@
+Q23x
+Q23
+asdf
+p42
\ No newline at end of file
diff --git a/lib/tests/phpunit/IO/EntityIdReaderTest.txt
b/lib/tests/phpunit/IO/EntityIdReaderTest.txt
index 8eef7b3..b70a1c3 100644
--- a/lib/tests/phpunit/IO/EntityIdReaderTest.txt
+++ b/lib/tests/phpunit/IO/EntityIdReaderTest.txt
@@ -3,4 +3,4 @@
q3
- p4
\ No newline at end of file
+ p4
\ No newline at end of file
diff --git a/lib/tests/phpunit/IO/EntrityIdReaderTest.php
b/lib/tests/phpunit/IO/EntrityIdReaderTest.php
index 608f4fe..d5f4c7e 100644
--- a/lib/tests/phpunit/IO/EntrityIdReaderTest.php
+++ b/lib/tests/phpunit/IO/EntrityIdReaderTest.php
@@ -23,7 +23,8 @@
}
protected function openIdReader( $file ) {
- $handle = fopen( $file, 'r' );
+ $path = __DIR__ . '/' . $file;
+ $handle = fopen( $path, 'r' );
return new EntityIdReader( $handle );
}
@@ -35,8 +36,26 @@
new PropertyId( 'P4' ),
);
- $file = $this->getTestFile();
- $reader = $this->openIdReader( $file );
+ $reader = $this->openIdReader( 'EntityIdReaderTest.txt' );
+ $actual = iterator_to_array( $reader );
+ $reader->dispose();
+
+ $this->assertEmpty( array_diff( $expected, $actual ),
"Different IDs" );
+ }
+
+ public function testIterationWithErrors() {
+ $expected = array(
+ new ItemId( 'Q23' ),
+ new PropertyId( 'P42' ),
+ );
+
+ $exceptionHandler = $this->getMock( 'ExceptionHandler' );
+ $exceptionHandler->expects( $this->exactly( 2 ) ) //two bad
lines in EntityIdReaderTest.bad.txt
+ ->method( 'handleException' );
+
+ $reader = $this->openIdReader( 'EntityIdReaderTest.bad.txt' );
+ $reader->setExceptionHandler( $exceptionHandler );
+
$actual = iterator_to_array( $reader );
$reader->dispose();
diff --git a/repo/Wikibase.classes.php b/repo/Wikibase.classes.php
index 9d7058b..1786ffe 100644
--- a/repo/Wikibase.classes.php
+++ b/repo/Wikibase.classes.php
@@ -197,6 +197,10 @@
'ObservableMessageReporter' => 'includes/MessageReporter.php',
'NullMessageReporter' => 'includes/MessageReporter.php',
+ 'ExceptionHandler' => 'includes/ExceptionHandler.php',
+ 'ReportingExceptionHandler' => 'includes/ExceptionHandler.php',
+ 'RethrowingExceptionHandler' => 'includes/ExceptionHandler.php',
+
'Wikibase\Test\EntityDataSerializationServiceTest' =>
'tests/phpunit/includes/LinkedData/EntityDataSerializationServiceTest.php',
'Wikibase\Test\EntityDataRequestHandlerTest' =>
'tests/phpunit/includes/LinkedData/EntityDataRequestHandlerTest.php',
'Wikibase\Test\EntityDataTestProvider' =>
'tests/phpunit/includes/LinkedData/EntityDataTestProvider.php',
diff --git a/repo/maintenance/dumpJson.php b/repo/maintenance/dumpJson.php
index 5c52d2f..2adbd17 100644
--- a/repo/maintenance/dumpJson.php
+++ b/repo/maintenance/dumpJson.php
@@ -3,6 +3,7 @@
namespace Wikibase;
use Disposable;
+use ExceptionHandler;
use Iterator;
use Maintenance;
use Traversable;
@@ -46,6 +47,8 @@
*/
public $entityPerPage;
+ public $quiet = true;
+
public function __construct() {
parent::__construct();
@@ -55,6 +58,8 @@
$this->addOption( 'entity-type', "Only dump this kind of
entitiy, e.g. `item` or `property`.", false, true );
$this->addOption( 'sharding-factor', "The number of shards
(must be >= 1)", false, true );
$this->addOption( 'shard', "A the shard to output (must be lett
than the sharding-factor) ", false, true );
+ $this->addOption( 'output', "Output file (default is stdout) ",
false, true );
+ $this->addOption( 'quiet', "Disable progress reporting", false,
false );
}
public function initServices() {
@@ -69,10 +74,12 @@
/**
* Outputs a message vis the output() method.
*
- * @param $msg
+ * @see MessageReporter::reportMessage()
+ *
+ * @param string $message
*/
- public function report( $msg ) {
- $this->output( "$msg\n" );
+ public function reportMessage( $message ) {
+ $this->output( "$message\n" );
}
/**
@@ -85,17 +92,48 @@
$entityType = $this->getOption( 'entity-type' );
$shardingFactor = (int)$this->getOption( 'sharding-factor', 1 );
$shard = (int)$this->getOption( 'shard', 0 );
- //TODO: echo filter options to reporter
- $output = fopen( 'php://stdout', 'wa' ); //TODO: Allow
injection of an OutputStream
+ $outFile = $this->getOption( 'output', 'php://stdout' );
+
+ if ( $outFile === '-' ) {
+ $outFile = 'php://stdout';
+ }
+
+ $output = fopen( $outFile, 'wa' ); //TODO: Allow injection of
an OutputStream
+
+ if ( !$output ) {
+ throw new \MWException( 'Failed to open ' . $outFile .
'!' );
+ }
+
+ if ( $entityType ) {
+ $this->reportMessage( "Dumping entities of type
$entityType" );
+ }
+
+ if ( $shardingFactor && $shard ) {
+ $this->reportMessage( "Dumping shard
$shard/$shardingFactor" );
+ }
+
$dumper = new JsonDumpGenerator( $output, $this->entityLookup,
$this->entitySerializer );
+
+ if ( $outFile !== 'php://stdout' && $outFile !== 'php://output'
) {
+ $progressReporter = new \ObservableMessageReporter();
+ $progressReporter->registerReporterCallback( array(
$this, 'reportMessage' ) );
+
+ $dumper->setProgressReporter( $progressReporter );
+ } else {
+ $progressReporter = new \NullMessageReporter();
+ $this->mQuiet = true; // suppress in-band output
+ }
+
+ $exceptionReporter = new \ReportingExceptionHandler(
$progressReporter );
+ $dumper->setExceptionHandler( $exceptionReporter );
//NOTE: we filter for $entityType twice: filtering in the DB is
efficient,
// but filtering in the dumper is needed when working from
a list file.
$dumper->setShardingFilter( $shardingFactor, $shard );
$dumper->setEntityTypeFilter( $entityType );
- $idStream = $this->makeIdStream( $entityType );
+ $idStream = $this->makeIdStream( $entityType,
$exceptionReporter );
$dumper->generateDump( $idStream );
if ( $idStream instanceof Disposable ) {
@@ -106,14 +144,15 @@
/**
* @param string|null $entityType
+ * @param \ExceptionHandler $exceptionReporter
*
* @return Iterator a stream of EntityId objects
*/
- public function makeIdStream( $entityType = null ) {
+ public function makeIdStream( $entityType = null, ExceptionHandler
$exceptionReporter = null ) {
$listFile = $this->getOption( 'list-file' );
if ( $listFile !== null ) {
- $stream = $this->makeIdFileStream( $listFile );
+ $stream = $this->makeIdFileStream( $listFile,
$exceptionReporter );
} else {
$stream = $this->entityPerPage->getEntities(
$entityType );
}
@@ -123,11 +162,12 @@
/**
* @param $listFile
+ * @param \ExceptionHandler $exceptionReporter
*
- * @return Traversable
* @throws \MWException
+ * @return Traversable
*/
- protected function makeIdFileStream( $listFile ) {
+ protected function makeIdFileStream( $listFile, ExceptionHandler
$exceptionReporter = null ) {
$input = fopen( $listFile, 'r' );
if ( !$input ) {
@@ -135,8 +175,11 @@
}
$stream = new EntityIdReader( $input );
+ $stream->setExceptionHandler( $exceptionReporter );
+
return $stream;
}
+
}
$maintClass = 'Wikibase\DumpJson';
--
To view, visit https://gerrit.wikimedia.org/r/92644
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I282d5304e584c48355950de7011c90dbe60f52e3
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits