Daniel Kinzler has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/92644


Change subject: Make dumpJson.php robust against failures.
......................................................................

Make dumpJson.php robust against failures.

This introduces an ExceptionHandler interface to handle
exceptions in a way other than throwing them. This is used
to continue processing after logging errors that may occurr
when generating a (potentially very large) json dump.

Change-Id: I282d5304e584c48355950de7011c90dbe60f52e3
---
M lib/includes/Dumpers/JsonDumpGenerator.php
M lib/includes/IO/EntityIdReader.php
M lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
A lib/tests/phpunit/IO/EntityIdReaderTest.bad.txt
M lib/tests/phpunit/IO/EntityIdReaderTest.txt
M lib/tests/phpunit/IO/EntrityIdReaderTest.php
M repo/Wikibase.classes.php
M repo/maintenance/dumpJson.php
8 files changed, 285 insertions(+), 38 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase 
refs/changes/44/92644/1

diff --git a/lib/includes/Dumpers/JsonDumpGenerator.php 
b/lib/includes/Dumpers/JsonDumpGenerator.php
index 09c8439..49703b8 100644
--- a/lib/includes/Dumpers/JsonDumpGenerator.php
+++ b/lib/includes/Dumpers/JsonDumpGenerator.php
@@ -2,7 +2,11 @@
 
 namespace Wikibase\Dumpers;
 
+use ExceptionHandler;
+use MessageReporter;
 use MWException;
+use NullMessageReporter;
+use RethrowingExceptionHandler;
 use Traversable;
 use Wikibase\DataModel\Entity\EntityId;
 use Wikibase\EntityLookup;
@@ -23,6 +27,11 @@
         * @var int flags to use with json_encode as a bit field, see PHP's 
JSON_XXX constants.
         */
        public $jsonFlags = 0;
+
+       /**
+        * @var int interval at which to output progress messages.
+        */
+       public $progressInterval = 100;
 
        /**
         * @var resource File handle for output
@@ -55,6 +64,16 @@
        protected $entityType = null;
 
        /**
+        * @var MessageReporter
+        */
+       protected $progressReporter;
+
+       /**
+        * @var ExceptionHandler
+        */
+       protected $exceptionHandler;
+
+       /**
         * @param resource $out
         * @param \Wikibase\EntityLookup $lookup
         * @param Serializer $entitySerializer
@@ -69,6 +88,55 @@
                $this->out = $out;
                $this->entitySerializer = $entitySerializer;
                $this->entityLookup = $lookup;
+
+               $this->progressReporter = new NullMessageReporter();
+               $this->exceptionHandler = new RethrowingExceptionHandler();
+       }
+
+       /**
+        * Sets the interval for progress reporting
+        *
+        * @param int $progressInterval
+        */
+       public function setProgressInterval( $progressInterval ) {
+               $this->progressInterval = $progressInterval;
+       }
+
+       /**
+        * Returns the interval for progress reporting
+        *
+        * @return int
+        */
+       public function getProgressInterval() {
+               return $this->progressInterval;
+       }
+
+       /**
+        * @param \MessageReporter $progressReporter
+        */
+       public function setProgressReporter( MessageReporter $progressReporter 
) {
+               $this->progressReporter = $progressReporter;
+       }
+
+       /**
+        * @return \MessageReporter
+        */
+       public function getProgressReporter() {
+               return $this->progressReporter;
+       }
+
+       /**
+        * @param \ExceptionHandler $exceptionHandler
+        */
+       public function setExceptionHandler( ExceptionHandler $exceptionHandler 
) {
+               $this->exceptionHandler = $exceptionHandler;
+       }
+
+       /**
+        * @return \ExceptionHandler
+        */
+       public function getExceptionHandler() {
+               return $this->exceptionHandler;
        }
 
        /**
@@ -134,15 +202,25 @@
                                }
 
                                $entity = $this->entityLookup->getEntity( $id );
+
+                               if ( !$entity ) {
+                                       throw new StorageException( 'Entity not 
found: ' . $id->getSerialization() );
+                               }
+
                                $data = $this->entitySerializer->getSerialized( 
$entity );
                                $json = $this->encode( $data );
+
                                $this->writeToDump( $json );
                        } catch ( StorageException $ex ) {
-                               $this->handleStorageException( $ex );
+                               $this->exceptionHandler->handleException( $ex, 
'failed-to-dump', 'Failed to dump '. $id );
                        }
 
-                       //TODO: use callback for reporting progress
+                       if ( $this->progressInterval > 0 && ( $i % 
$this->progressInterval ) === 0 ) {
+                               $this->progressReporter->reportMessage( 
'Processed ' . $i . ' entities.' );
+                       }
                }
+
+               $this->progressReporter->reportMessage( 'Processed ' . $i . ' 
entities.' );
 
                $json = "\n]\n"; //TODO: make optional
                $this->writeToDump( $json );
@@ -167,15 +245,6 @@
        }
 
        /**
-        * @param $ex
-        * @throws StorageException
-        */
-       private function handleStorageException( $ex ) {
-               //TODO: optionally, log & ignore.
-               throw $ex;
-       }
-
-       /**
         * Encodes the given data as JSON
         *
         * @param $data
@@ -187,8 +256,7 @@
                $json = json_encode( $data, $this->jsonFlags );
 
                if ( $json === false ) {
-                       // TODO: optionally catch & skip this
-                       throw new MWException( 'Failed to encode data 
structure.' );
+                       throw new StorageException( 'Failed to encode data 
structure.' );
                }
 
                return $json;
diff --git a/lib/includes/IO/EntityIdReader.php 
b/lib/includes/IO/EntityIdReader.php
index b4b62dc..85e2e21 100644
--- a/lib/includes/IO/EntityIdReader.php
+++ b/lib/includes/IO/EntityIdReader.php
@@ -3,7 +3,9 @@
 namespace Wikibase\IO;
 
 use Disposable;
+use ExceptionHandler;
 use Iterator;
+use ValueParsers\ParseException;
 use Wikibase\DataModel\Entity\EntityId;
 use Wikibase\Lib\EntityIdParser;
 
@@ -21,6 +23,16 @@
        protected $reader;
 
        /**
+        * @var ExceptionHandler
+        */
+       protected $exceptionHandler;
+
+       /**
+        * @var EntityId|null
+        */
+       protected $current = null;
+
+       /**
         * @param resource $fileHandle The file to read from.
         * @param bool $canClose Whether calling dispose() should close the 
fine handle.
         * @param bool $autoDispose Whether to automatically call dispose() 
when reaching EOF.
@@ -30,24 +42,67 @@
        public function __construct( $fileHandle, $canClose = true, 
$autoDispose = false ) {
                $this->reader = new LineReader( $fileHandle, $canClose, 
$autoDispose );
                $this->parser = new EntityIdParser(); //TODO: inject?
+
+               $this->exceptionHandler = new \RethrowingExceptionHandler();
+       }
+
+       /**
+        * @param \ExceptionHandler $exceptionHandler
+        */
+       public function setExceptionHandler( $exceptionHandler ) {
+               $this->exceptionHandler = $exceptionHandler;
+       }
+
+       /**
+        * @return \ExceptionHandler
+        */
+       public function getExceptionHandler() {
+               return $this->exceptionHandler;
        }
 
        /**
         * @param string $line
-        * @return EntityId
+        * @return EntityId|null
         */
        protected function lineToId( $line ) {
                $line = trim( $line );
-               $id = $this->parser->parse( $line );
-               //TODO: optionally catch, log & ignore ParseException
+
+               try {
+                       $id = $this->parser->parse( $line );
+               } catch ( ParseException $ex ) {
+                       $this->exceptionHandler->handleException( $ex, 
'bad-entity-id', "Failed to parse Entity ID $line" );
+                       $id = null;
+               }
 
                return $id;
        }
 
        /**
+        * Closes the underlying input stream
         */
        public function dispose() {
                $this->reader->dispose();
+       }
+
+       /**
+        * Returns the current ID, or, if that has been consumed, finds the 
next ID on
+        * the input stream and return it.
+        *
+        * @return EntityId|null
+        */
+       protected function fill() {
+               while ( $this->current === null && $this->reader->valid() ) {
+                       $line = trim( $this->reader->current() );
+                       $this->reader->next();
+
+                       if ( $line === '' ) {
+                               continue;
+                       }
+
+                       $this->current = $this->lineToId( $line );
+               };
+
+               return $this->current;
        }
 
        /**
@@ -57,8 +112,8 @@
         * @return EntityId
         */
        public function current() {
-               $line = $this->reader->current();
-               return $this->lineToId( $line );
+               $id = $this->fill();
+               return $id;
        }
 
        /**
@@ -67,9 +122,8 @@
         * @see LineReader::next()
         */
        public function next() {
-               do {
-                       $this->reader->next();
-               } while ( $this->reader->valid() && trim( 
$this->reader->current() ) === '' );
+               $this->current = null; // consume current
+               $this->fill();
        }
 
        /**
@@ -85,13 +139,14 @@
         * @return boolean
         */
        public function valid() {
-               return $this->reader->valid();
+               return $this->fill() !== null;
        }
 
        /**
         * @see LineReader::rewind()
         */
        public function rewind() {
+               $this->current = null;
                $this->reader->rewind();
        }
 
diff --git a/lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php 
b/lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
index 6805f6f..b52a964 100644
--- a/lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
+++ b/lib/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
@@ -12,6 +12,7 @@
 use Wikibase\EntityFactory;
 use Wikibase\Item;
 use Wikibase\Property;
+use Wikibase\StorageException;
 
 /**
  * @covers Wikibase\Dumpers\JsonDumpGenerator
@@ -49,7 +50,7 @@
         *
         * @return JsonDumpGenerator
         */
-       protected function newDumpGenerator( array $ids = array() ) {
+       protected function newDumpGenerator( array $ids = array(), array 
$missingIds = array() ) {
                $out = fopen( 'php://output', 'w' );
 
                $serializer = $this->getMock( 
'Wikibase\Lib\Serializers\Serializer' );
@@ -67,7 +68,11 @@
                $entityLookup = $this->getMock( 'Wikibase\EntityLookup' );
                $entityLookup->expects( $this->any() )
                        ->method( 'getEntity' )
-                       ->will( $this->returnCallback( function ( EntityId $id 
) use ( $entities ) {
+                       ->will( $this->returnCallback( function ( EntityId $id 
) use ( $entities, $missingIds ) {
+                                       if ( in_array( $id, $missingIds ) ) {
+                                               return null;
+                                       }
+
                                        $key = $id->getPrefixedId();
                                        return $entities[$key];
                                }
@@ -248,4 +253,53 @@
                );
        }
 
+       public function testExceptionHandler() {
+               $ids = array();
+               $missingIds = array();
+
+               for ( $i = 1; $i<=100; $i++) {
+                       $id = new ItemId( "Q$i" );
+                       $ids[] = $id;
+
+                       if ( ( $i % 10 ) === 0 ) {
+                               $missingIds[] = $id;
+                       }
+               }
+
+               $dumper = $this->newDumpGenerator( $ids, $missingIds );
+               $idList = new ArrayObject( $ids );
+
+               $exceptionHandler = $this->getMock( 'ExceptionHandler' );
+               $exceptionHandler->expects( $this->exactly( count( $missingIds 
) ) )
+                       ->method( 'handleException' );
+
+               $dumper->setExceptionHandler( $exceptionHandler );
+
+               ob_start();
+               $dumper->generateDump( $idList );
+               ob_end_clean();
+       }
+
+       public function testProgressReporter() {
+               $ids = array();
+
+               for ( $i = 1; $i<=100; $i++) {
+                       $id = new ItemId( "Q$i" );
+                       $ids[] = $id;
+               }
+
+               $dumper = $this->newDumpGenerator( $ids );
+               $idList = new ArrayObject( $ids );
+
+               $progressReporter = $this->getMock( 'MessageReporter' );
+               $progressReporter->expects( $this->exactly( ( count( $ids ) / 
10 ) +1 ) )
+                       ->method( 'reportMessage' );
+
+               $dumper->setProgressInterval( 10 );
+               $dumper->setProgressReporter( $progressReporter );
+
+               ob_start();
+               $dumper->generateDump( $idList );
+               ob_end_clean();
+       }
 }
diff --git a/lib/tests/phpunit/IO/EntityIdReaderTest.bad.txt 
b/lib/tests/phpunit/IO/EntityIdReaderTest.bad.txt
new file mode 100644
index 0000000..9c03cdf
--- /dev/null
+++ b/lib/tests/phpunit/IO/EntityIdReaderTest.bad.txt
@@ -0,0 +1,4 @@
+Q23x
+Q23
+asdf
+p42
\ No newline at end of file
diff --git a/lib/tests/phpunit/IO/EntityIdReaderTest.txt 
b/lib/tests/phpunit/IO/EntityIdReaderTest.txt
index 8eef7b3..b70a1c3 100644
--- a/lib/tests/phpunit/IO/EntityIdReaderTest.txt
+++ b/lib/tests/phpunit/IO/EntityIdReaderTest.txt
@@ -3,4 +3,4 @@
 
   q3
 
- p4
\ No newline at end of file
+       p4
\ No newline at end of file
diff --git a/lib/tests/phpunit/IO/EntrityIdReaderTest.php 
b/lib/tests/phpunit/IO/EntrityIdReaderTest.php
index 608f4fe..d5f4c7e 100644
--- a/lib/tests/phpunit/IO/EntrityIdReaderTest.php
+++ b/lib/tests/phpunit/IO/EntrityIdReaderTest.php
@@ -23,7 +23,8 @@
        }
 
        protected function openIdReader( $file ) {
-               $handle = fopen( $file, 'r' );
+               $path = __DIR__ . '/' . $file;
+               $handle = fopen( $path, 'r' );
                return new EntityIdReader( $handle );
        }
 
@@ -35,8 +36,26 @@
                        new PropertyId( 'P4' ),
                );
 
-               $file = $this->getTestFile();
-               $reader = $this->openIdReader( $file );
+               $reader = $this->openIdReader( 'EntityIdReaderTest.txt' );
+               $actual = iterator_to_array( $reader );
+               $reader->dispose();
+
+               $this->assertEmpty( array_diff( $expected, $actual ), 
"Different IDs" );
+       }
+
+       public function testIterationWithErrors() {
+               $expected = array(
+                       new ItemId( 'Q23' ),
+                       new PropertyId( 'P42' ),
+               );
+
+               $exceptionHandler = $this->getMock( 'ExceptionHandler' );
+               $exceptionHandler->expects( $this->exactly( 2 ) ) //two bad 
lines in EntityIdReaderTest.bad.txt
+                       ->method( 'handleException' );
+
+               $reader = $this->openIdReader( 'EntityIdReaderTest.bad.txt' );
+               $reader->setExceptionHandler( $exceptionHandler );
+
                $actual = iterator_to_array( $reader );
                $reader->dispose();
 
diff --git a/repo/Wikibase.classes.php b/repo/Wikibase.classes.php
index 9d7058b..1786ffe 100644
--- a/repo/Wikibase.classes.php
+++ b/repo/Wikibase.classes.php
@@ -197,6 +197,10 @@
                'ObservableMessageReporter' => 'includes/MessageReporter.php',
                'NullMessageReporter' => 'includes/MessageReporter.php',
 
+               'ExceptionHandler' => 'includes/ExceptionHandler.php',
+               'ReportingExceptionHandler' => 'includes/ExceptionHandler.php',
+               'RethrowingExceptionHandler' => 'includes/ExceptionHandler.php',
+
                'Wikibase\Test\EntityDataSerializationServiceTest' => 
'tests/phpunit/includes/LinkedData/EntityDataSerializationServiceTest.php',
                'Wikibase\Test\EntityDataRequestHandlerTest' => 
'tests/phpunit/includes/LinkedData/EntityDataRequestHandlerTest.php',
                'Wikibase\Test\EntityDataTestProvider' => 
'tests/phpunit/includes/LinkedData/EntityDataTestProvider.php',
diff --git a/repo/maintenance/dumpJson.php b/repo/maintenance/dumpJson.php
index 5c52d2f..2adbd17 100644
--- a/repo/maintenance/dumpJson.php
+++ b/repo/maintenance/dumpJson.php
@@ -3,6 +3,7 @@
 namespace Wikibase;
 
 use Disposable;
+use ExceptionHandler;
 use Iterator;
 use Maintenance;
 use Traversable;
@@ -46,6 +47,8 @@
         */
        public $entityPerPage;
 
+       public $quiet = true;
+
        public function __construct() {
                parent::__construct();
 
@@ -55,6 +58,8 @@
                $this->addOption( 'entity-type', "Only dump this kind of 
entitiy, e.g. `item` or `property`.", false, true );
                $this->addOption( 'sharding-factor', "The number of shards 
(must be >= 1)", false, true );
                $this->addOption( 'shard', "A the shard to output (must be lett 
than the sharding-factor) ", false, true );
+               $this->addOption( 'output', "Output file (default is stdout) ", 
false, true );
+               $this->addOption( 'quiet', "Disable progress reporting", false, 
false );
        }
 
        public function initServices() {
@@ -69,10 +74,12 @@
        /**
         * Outputs a message vis the output() method.
         *
-        * @param $msg
+        * @see MessageReporter::reportMessage()
+        *
+        * @param string $message
         */
-       public function report( $msg ) {
-               $this->output( "$msg\n" );
+       public function reportMessage( $message ) {
+                       $this->output( "$message\n" );
        }
 
        /**
@@ -85,17 +92,48 @@
                $entityType = $this->getOption( 'entity-type' );
                $shardingFactor = (int)$this->getOption( 'sharding-factor', 1 );
                $shard = (int)$this->getOption( 'shard', 0 );
-               //TODO: echo filter options to reporter
 
-               $output = fopen( 'php://stdout', 'wa' ); //TODO: Allow 
injection of an OutputStream
+               $outFile = $this->getOption( 'output', 'php://stdout' );
+
+               if ( $outFile === '-' ) {
+                       $outFile = 'php://stdout';
+               }
+
+               $output = fopen( $outFile, 'wa' ); //TODO: Allow injection of 
an OutputStream
+
+               if ( !$output ) {
+                       throw new \MWException( 'Failed to open ' . $outFile . 
'!' );
+               }
+
+               if ( $entityType ) {
+                       $this->reportMessage( "Dumping entities of type 
$entityType" );
+               }
+
+               if ( $shardingFactor && $shard ) {
+                       $this->reportMessage( "Dumping shard 
$shard/$shardingFactor" );
+               }
+
                $dumper = new JsonDumpGenerator( $output, $this->entityLookup, 
$this->entitySerializer );
+
+               if ( $outFile !== 'php://stdout' && $outFile !== 'php://output' 
) {
+                       $progressReporter = new \ObservableMessageReporter();
+                       $progressReporter->registerReporterCallback( array( 
$this, 'reportMessage' ) );
+
+                       $dumper->setProgressReporter( $progressReporter );
+               } else {
+                       $progressReporter = new \NullMessageReporter();
+                       $this->mQuiet = true; // suppress in-band output
+               }
+
+               $exceptionReporter = new \ReportingExceptionHandler( 
$progressReporter );
+               $dumper->setExceptionHandler( $exceptionReporter );
 
                //NOTE: we filter for $entityType twice: filtering in the DB is 
efficient,
                //      but filtering in the dumper is needed when working from 
a list file.
                $dumper->setShardingFilter( $shardingFactor, $shard );
                $dumper->setEntityTypeFilter( $entityType );
 
-               $idStream = $this->makeIdStream( $entityType );
+               $idStream = $this->makeIdStream( $entityType, 
$exceptionReporter );
                $dumper->generateDump( $idStream );
 
                if ( $idStream instanceof Disposable ) {
@@ -106,14 +144,15 @@
 
        /**
         * @param string|null $entityType
+        * @param \ExceptionHandler $exceptionReporter
         *
         * @return Iterator a stream of EntityId objects
         */
-       public function makeIdStream( $entityType = null ) {
+       public function makeIdStream( $entityType = null, ExceptionHandler 
$exceptionReporter = null ) {
                $listFile = $this->getOption( 'list-file' );
 
                if ( $listFile !== null ) {
-                       $stream = $this->makeIdFileStream( $listFile );
+                       $stream = $this->makeIdFileStream( $listFile, 
$exceptionReporter );
                } else {
                        $stream = $this->entityPerPage->getEntities( 
$entityType );
                }
@@ -123,11 +162,12 @@
 
        /**
         * @param $listFile
+        * @param \ExceptionHandler $exceptionReporter
         *
-        * @return Traversable
         * @throws \MWException
+        * @return Traversable
         */
-       protected function makeIdFileStream( $listFile ) {
+       protected function makeIdFileStream( $listFile, ExceptionHandler 
$exceptionReporter = null ) {
                $input = fopen( $listFile, 'r' );
 
                if ( !$input ) {
@@ -135,8 +175,11 @@
                }
 
                $stream = new EntityIdReader( $input );
+               $stream->setExceptionHandler( $exceptionReporter );
+
                return $stream;
        }
+
 }
 
 $maintClass = 'Wikibase\DumpJson';

-- 
To view, visit https://gerrit.wikimedia.org/r/92644
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I282d5304e584c48355950de7011c90dbe60f52e3
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to