jenkins-bot has submitted this change and it was merged. Change subject: (bug 52799) Dump JSON of entities listed in file. ......................................................................
(bug 52799) Dump JSON of entities listed in file. This allows dumpJson.php to dump just those entities given in a given list file. Change-Id: Ie101cc58b0218d623e73b832a4fc9d45f1d1ac4a --- M lib/WikibaseLib.classes.php A lib/includes/Disposable.php M lib/includes/Dumpers/JsonDumpGenerator.php A lib/includes/IO/EntityIdReader.php A lib/includes/IO/LineReader.php A lib/tests/phpunit/IO/EntityIdReaderTest.txt A lib/tests/phpunit/IO/EntrityIdReaderTest.php A lib/tests/phpunit/IO/LineReaderTest.php A lib/tests/phpunit/IO/LineReaderTest.txt M repo/includes/store/sql/ConvertingResultWrapper.php M repo/maintenance/dumpJson.php 11 files changed, 426 insertions(+), 6 deletions(-) Approvals: Addshore: Looks good to me, approved jenkins-bot: Verified diff --git a/lib/WikibaseLib.classes.php b/lib/WikibaseLib.classes.php index c6120b8..fbde8a6 100644 --- a/lib/WikibaseLib.classes.php +++ b/lib/WikibaseLib.classes.php @@ -32,6 +32,9 @@ // Autoloading 'Wikibase\LibHooks' => 'WikibaseLib.hooks.php', + // generic things that could be factored out + 'Disposable' => 'includes/Disposable.php', + // includes 'Wikibase\ChangeNotifier' => 'includes/ChangeNotifier.php', 'Wikibase\ChangeNotificationJob' => 'includes/ChangeNotificationJob.php', @@ -100,6 +103,10 @@ 'Wikibase\Lib\SnakFormatterFactory' => 'includes/formatters/SnakFormatterFactory.php', 'Wikibase\Lib\WikibaseSnakFormatterBuilders' => 'includes/formatters/WikibaseSnakFormatterBuilders.php', + // includes/IO + 'Wikibase\IO\LineReader' => 'includes/IO/LineReader.php', + 'Wikibase\IO\EntityIdReader' => 'includes/IO/EntityIdReader.php', + // includes/modules 'Wikibase\RepoAccessModule' => 'includes/modules/RepoAccessModule.php', 'Wikibase\SitesModule' => 'includes/modules/SitesModule.php', diff --git a/lib/includes/Disposable.php b/lib/includes/Disposable.php new file mode 100644 index 0000000..a362c4b --- /dev/null +++ b/lib/includes/Disposable.php @@ -0,0 +1,24 @@ +<?php + +/** + * An interface for objects that support explicit 
disposal. + * + * @license GPL 2+ + * @author Daniel Kinzler + * + * @todo make this reusable outside Wikibase + */ +interface Disposable { + + /** + * Releases any system (or other) resources held by this object. + * + * It is safe to call dispose() multiple times. + * The behavior of all other methods of this object becomes undefined after calling dispose() + * for the first time. + * + * Implementing classes may choose to implement the __destruct() method to call dispose(). + */ + public function dispose(); + +} diff --git a/lib/includes/Dumpers/JsonDumpGenerator.php b/lib/includes/Dumpers/JsonDumpGenerator.php index 20f007f..efad245 100644 --- a/lib/includes/Dumpers/JsonDumpGenerator.php +++ b/lib/includes/Dumpers/JsonDumpGenerator.php @@ -83,7 +83,7 @@ } } - $json = "]\n"; //TODO: make optional + $json = "\n]\n"; //TODO: make optional $this->writeToDump( $json ); } diff --git a/lib/includes/IO/EntityIdReader.php b/lib/includes/IO/EntityIdReader.php new file mode 100644 index 0000000..b4b62dc --- /dev/null +++ b/lib/includes/IO/EntityIdReader.php @@ -0,0 +1,98 @@ +<?php + +namespace Wikibase\IO; + +use Disposable; +use Iterator; +use Wikibase\DataModel\Entity\EntityId; +use Wikibase\Lib\EntityIdParser; + +/** + * EntityIdReader reads entity IDs from a file, one per line. + * + * @license GPL 2+ + * @author Daniel Kinzler + */ +class EntityIdReader implements Iterator, Disposable { + + /** + * @var LineReader + */ + protected $reader; + + /** + * @param resource $fileHandle The file to read from. + * @param bool $canClose Whether calling dispose() should close the file handle. + * @param bool $autoDispose Whether to automatically call dispose() when reaching EOF. + * + * @throws \InvalidArgumentException + */ + public function __construct( $fileHandle, $canClose = true, $autoDispose = false ) { + $this->reader = new LineReader( $fileHandle, $canClose, $autoDispose ); + $this->parser = new EntityIdParser(); //TODO: inject? 
+ } + + /** + * @param string $line + * @return EntityId + */ + protected function lineToId( $line ) { + $line = trim( $line ); + $id = $this->parser->parse( $line ); + //TODO: optionally catch, log & ignore ParseException + + return $id; + } + + /** + */ + public function dispose() { + $this->reader->dispose(); + } + + /** + * Returns the current ID. + * + * @link http://php.net/manual/en/iterator.current.php + * @return EntityId + */ + public function current() { + $line = $this->reader->current(); + return $this->lineToId( $line ); + } + + /** + * Advance to next ID. Blank lines are skipped. + * + * @see LineReader::next() + */ + public function next() { + do { + $this->reader->next(); + } while ( $this->reader->valid() && trim( $this->reader->current() ) === '' ); + } + + /** + * @see LineReader::key() + * @return int + */ + public function key() { + return $this->reader->key(); + } + + /** + * @see LineReader::valid() + * @return boolean + */ + public function valid() { + return $this->reader->valid(); + } + + /** + * @see LineReader::rewind() + */ + public function rewind() { + $this->reader->rewind(); + } + +} \ No newline at end of file diff --git a/lib/includes/IO/LineReader.php b/lib/includes/IO/LineReader.php new file mode 100644 index 0000000..ed2e054 --- /dev/null +++ b/lib/includes/IO/LineReader.php @@ -0,0 +1,155 @@ +<?php + +namespace Wikibase\IO; + +use Disposable; +use Iterator; + +/** + * LineReader allows iterating over the lines of a file. + * Each line returned will contain the line separator character(s) and all whitespace. + * Concatenating all lines returned by the reader should result in the original file. + * + * @license GPL 2+ + * @author Daniel Kinzler + */ +class LineReader implements Iterator, Disposable { + + /** + * @var resource + */ + protected $fileHandle; + + /** + * Whether dispose() will close the file handle. 
+ * + * @var bool + */ + protected $canClose; + + /** + * Whether dispose() is called automatically when the end of file is reached. + * + * @var bool + */ + protected $autoDispose; + + /** + * @var string + */ + protected $current = null; + + /** + * @var int + */ + protected $line = 0; + + /** + * @param resource $fileHandle The file to read from. + * @param bool $canClose Whether calling dispose() should close the file handle. + * @param bool $autoDispose Whether to automatically call dispose() when reaching EOF + * or when this reader is destructed. + * + * @throws \InvalidArgumentException + */ + public function __construct( $fileHandle, $canClose = true, $autoDispose = false ) { + if ( !is_resource( $fileHandle ) ) { + throw new \InvalidArgumentException( '$fileHandle must be a file resource.' ); + } + + if ( !is_bool( $canClose ) ) { + throw new \InvalidArgumentException( '$canClose must be a boolean.' ); + } + + if ( !is_bool( $autoDispose ) ) { + throw new \InvalidArgumentException( '$autoDispose must be a boolean.' ); + } + + $this->fileHandle = $fileHandle; + + $this->canClose = $canClose; + $this->autoDispose = $autoDispose; + } + + /** + * Closes the underlying file handle if the $canClose parameter was given as + * true (the default) in the constructor. + */ + public function dispose() { + if ( $this->fileHandle && $this->canClose ) { + fclose( $this->fileHandle ); + } + + $this->fileHandle = false; + } + + /** + * Destructor, calls dispose() if $autoDispose was set in the constructor. + */ + public function __destruct() { + if ( $this->autoDispose ) { + $this->dispose(); + } + } + + /** + * Return the current line. + * + * @link http://php.net/manual/en/iterator.current.php + * @return string + */ + public function current() { + return $this->current; + } + + /** + * Reads the next line. Use current() to get the line's content. 
+ * + * @link http://php.net/manual/en/iterator.next.php + */ + public function next() { + $this->current = fgets( $this->fileHandle ); + + if ( $this->valid() ) { + $this->line++; + } elseif ( $this->autoDispose ) { + $this->dispose(); + } + } + + /** + * Return the current line number. + * @link http://php.net/manual/en/iterator.key.php + * @return int + */ + public function key() { + return $this->line; + } + + /** + * Checks if current position is valid. Returns true if and only if + * next() has been called at least once and the end of file has not yet been reached. + * + * @link http://php.net/manual/en/iterator.valid.php + * @return boolean whether there is a current line + */ + public function valid() { + return is_string( $this->current ); + } + + /** + * Sets the file pointer to the beginning of the file, if supported. + * Has no effect if this LineReader has already been disposed. + * + * @link http://php.net/manual/en/iterator.rewind.php + * @return void Any returned value is ignored. 
+ */ + public function rewind() { + if ( $this->fileHandle ) { + fseek( $this->fileHandle, 0 ); + $this->current = null; + + $this->next(); + } + } +} diff --git a/lib/tests/phpunit/IO/EntityIdReaderTest.txt b/lib/tests/phpunit/IO/EntityIdReaderTest.txt new file mode 100644 index 0000000..8eef7b3 --- /dev/null +++ b/lib/tests/phpunit/IO/EntityIdReaderTest.txt @@ -0,0 +1,6 @@ +Q1 +P2 + + q3 + + p4 \ No newline at end of file diff --git a/lib/tests/phpunit/IO/EntrityIdReaderTest.php b/lib/tests/phpunit/IO/EntrityIdReaderTest.php new file mode 100644 index 0000000..7c652d1 --- /dev/null +++ b/lib/tests/phpunit/IO/EntrityIdReaderTest.php @@ -0,0 +1,49 @@ +<?php + +namespace Wikibase\Test\IO; +use PHPUnit_Framework_TestCase; +use Wikibase\DataModel\Entity\ItemId; +use Wikibase\DataModel\Entity\PropertyId; +use Wikibase\IO\EntityIdReader; + +/** + * @covers Wikibase\IO\EntityIdReader + * + * @ingroup WikibaseLib + * @ingroup Test + * + * @group Wikibase + * @group WikibaseLib + * @group WikibaseIO + * + * @license GPL 2+ + * @author Daniel Kinzler + */ +class EntityIdReaderTest extends PHPUnit_Framework_TestCase { + + protected function getTestFile() { + return __DIR__ . 
'/EntityIdReaderTest.txt'; + } + + protected function openIdReader( $file ) { + $handle = fopen( $file, 'r' ); + return new EntityIdReader( $handle ); + } + + public function testIteration() { + $expected = array( + new ItemId( 'Q1' ), + new PropertyId( 'P2' ), + new ItemId( 'Q3' ), + new PropertyId( 'P4' ), + ); + + $file = $this->getTestFile(); + $reader = $this->openIdReader( $file ); + $actual = iterator_to_array( $reader ); + $reader->dispose(); + + $this->assertEmpty( array_diff( $expected, $actual ), "Different IDs" ); + } + +} diff --git a/lib/tests/phpunit/IO/LineReaderTest.php b/lib/tests/phpunit/IO/LineReaderTest.php new file mode 100644 index 0000000..51f9525 --- /dev/null +++ b/lib/tests/phpunit/IO/LineReaderTest.php @@ -0,0 +1,43 @@ +<?php + +namespace Wikibase\Test\IO; +use PHPUnit_Framework_TestCase; +use Wikibase\IO\LineReader; + +/** + * @covers Wikibase\IO\LineReader + * + * @ingroup WikibaseLib + * @ingroup Test + * + * @group Wikibase + * @group WikibaseLib + * @group WikibaseIO + * + * @license GPL 2+ + * @author Daniel Kinzler + */ +class LineReaderTest extends PHPUnit_Framework_TestCase { + + protected function getTestFile() { + return __DIR__ . '/LineReaderTest.txt'; + } + + protected function openLineReader( $file ) { + $handle = fopen( $file, 'r' ); + return new LineReader( $handle ); + } + + public function testIteration() { + $file = $this->getTestFile(); + + $expected = file( $file ); + + $reader = $this->openLineReader( $file ); + $actual = iterator_to_array( $reader ); + $reader->dispose(); + + $this->assertEmpty( array_diff( $expected, $actual ), "Different Lines" ); + } + +} diff --git a/lib/tests/phpunit/IO/LineReaderTest.txt b/lib/tests/phpunit/IO/LineReaderTest.txt new file mode 100644 index 0000000..360a4b0 --- /dev/null +++ b/lib/tests/phpunit/IO/LineReaderTest.txt @@ -0,0 +1,4 @@ +Hello +World! + +The End. 
diff --git a/repo/includes/store/sql/ConvertingResultWrapper.php b/repo/includes/store/sql/ConvertingResultWrapper.php index ab5a9f0..daef172 100644 --- a/repo/includes/store/sql/ConvertingResultWrapper.php +++ b/repo/includes/store/sql/ConvertingResultWrapper.php @@ -13,6 +13,9 @@ * * @licence GNU GPL v2+ * @author Daniel Kinzler + * + * @todo: this should implement Disposable and know a LoadBalancer instance, so + * we can recycle the DB connection when done. */ abstract class ConvertingResultWrapper implements Iterator { diff --git a/repo/maintenance/dumpJson.php b/repo/maintenance/dumpJson.php index 5463f36..3e6ab1a 100644 --- a/repo/maintenance/dumpJson.php +++ b/repo/maintenance/dumpJson.php @@ -1,10 +1,13 @@ <?php namespace Wikibase; +use Disposable; use Iterator; use Maintenance; +use Traversable; use ValueFormatters\FormatterOptions; use Wikibase\Dumpers\JsonDumpGenerator; +use Wikibase\IO\EntityIdReader; use Wikibase\Lib\EntityIdFormatter; use Wikibase\Lib\Serializers\EntitySerializationOptions; use Wikibase\Lib\Serializers\EntitySerializer; @@ -47,11 +50,12 @@ $this->mDescription = 'Generate a JSON dump from entities in the repository.'; - //TODO: read list of IDs from file //TODO: filter by entity type + //TODO: shard by id congruence class ( id % n == m ) //$this->addOption( 'rebuild-all', "Update property info for all properties (per default, only missing entries are created)" ); //$this->addOption( 'start-row', "The ID of the first row to update (useful for continuing aborted runs)", false, true ); - //$this->addOption( 'batch-size', "Number of rows to update per database transaction (100 per default)", false, true ); + + $this->addOption( 'list-file', "A file containing one entity ID per line", false, true ); } public function initServices() { @@ -83,16 +87,43 @@ $idStream = $this->makeIdStream(); $dumper->generateDump( $idStream ); + + if ( $idStream instanceof Disposable ) { + // close stream / free resources + $idStream->dispose(); + } } /** * 
@return Iterator a stream of EntityId objects */ public function makeIdStream() { - //TODO: provide list/filter of entities - //TODO: allow ids to be read from a file + $listFile = $this->getOption( 'list-file' ); - $stream = $this->entityPerPage->getEntities(); + if ( $listFile !== null ) { + //TODO: allow filtering by entity type, id congruence class ( id % n == m ), etc. + $stream = $this->makeIdFileStream( $listFile ); + } else { + $stream = $this->entityPerPage->getEntities(); + } + + return $stream; + } + + /** + * @param $listFile + * + * @return Traversable + * @throws \MWException + */ + protected function makeIdFileStream( $listFile ) { + $input = fopen( $listFile, 'r' ); + + if ( !$input ) { + throw new \MWException( "Failed to open ID file: $input" ); + } + + $stream = new EntityIdReader( $input ); return $stream; } } -- To view, visit https://gerrit.wikimedia.org/r/84000 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: merged Gerrit-Change-Id: Ie101cc58b0218d623e73b832a4fc9d45f1d1ac4a Gerrit-PatchSet: 9 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Daniel Kinzler <daniel.kinz...@wikimedia.de> Gerrit-Reviewer: Addshore <addshorew...@gmail.com> Gerrit-Reviewer: Aude <aude.w...@gmail.com> Gerrit-Reviewer: Denny Vrandecic <denny.vrande...@wikimedia.de> Gerrit-Reviewer: Henning Snater <henning.sna...@wikimedia.de> Gerrit-Reviewer: Jeroen De Dauw <jeroended...@gmail.com> Gerrit-Reviewer: Tobias Gritschacher <tobias.gritschac...@wikimedia.de> Gerrit-Reviewer: jenkins-bot _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits