Daniel Kinzler has uploaded a new change for review.
https://gerrit.wikimedia.org/r/84000
Change subject: (bug 52799) Dump JSON of entities listed in file.
......................................................................
(bug 52799) Dump JSON of entities listed in file.
This allows dumpJson.php to dump just those entities listed in
a given list file.
Change-Id: Ie101cc58b0218d623e73b832a4fc9d45f1d1ac4a
---
M lib/WikibaseLib.classes.php
A lib/includes/Disposable.php
M lib/includes/Dumpers/JsonDumpGenerator.php
A lib/includes/IO/EntityIdReader.php
A lib/includes/IO/LineReader.php
A lib/tests/phpunit/IO/EntityIdReaderTest.txt
A lib/tests/phpunit/IO/EntrityIdReaderTest.php
A lib/tests/phpunit/IO/LineReaderTest.php
A lib/tests/phpunit/IO/LineReaderTest.txt
M repo/includes/store/sql/ConvertingResultWrapper.php
M repo/maintenance/dumpJson.php
11 files changed, 426 insertions(+), 6 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase
refs/changes/00/84000/1
diff --git a/lib/WikibaseLib.classes.php b/lib/WikibaseLib.classes.php
index af37853..526c258 100644
--- a/lib/WikibaseLib.classes.php
+++ b/lib/WikibaseLib.classes.php
@@ -32,6 +32,9 @@
// Autoloading
'Wikibase\LibHooks' => 'WikibaseLib.hooks.php',
+ // generic things that could be factored out
+ 'Disposable' => 'includes/Disposable.php',
+
// includes
'Wikibase\ChangeNotifier' => 'includes/ChangeNotifier.php',
'Wikibase\ChangeNotificationJob' =>
'includes/ChangeNotificationJob.php',
@@ -93,6 +96,10 @@
'Wikibase\Lib\EntityIdLabelFormatter' =>
'includes/formatters/EntityIdLabelFormatter.php',
'Wikibase\Lib\MwTimeIsoFormatter' =>
'includes/formatters/MwTimeIsoFormatter.php',
+ // includes/IO
+ 'Wikibase\IO\LineReader' => 'includes/IO/LineReader.php',
+ 'Wikibase\IO\EntityIdReader' =>
'includes/IO/EntityIdReader.php',
+
// includes/modules
'Wikibase\RepoAccessModule' =>
'includes/modules/RepoAccessModule.php',
'Wikibase\SitesModule' => 'includes/modules/SitesModule.php',
diff --git a/lib/includes/Disposable.php b/lib/includes/Disposable.php
new file mode 100644
index 0000000..a362c4b
--- /dev/null
+++ b/lib/includes/Disposable.php
@@ -0,0 +1,24 @@
+<?php
+
+/**
+ * An interface for objects that support explicit disposal.
+ *
+ * @license GPL 2+
+ * @author Daniel Kinzler
+ *
+ * @todo make this reusable outside Wikibase
+ */
+interface Disposable {
+
+ /**
+ * Releases any system (or other) resources held by this object.
+ *
+ * It is safe to call dispose() multiple times.
+ * The behavior of all other methods of this object becomes undefined
after calling dispose()
+ * for the first time.
+ *
+ * Implementing classes may choose to implement the __destruct() method
to call dispose().
+ */
+ public function dispose();
+
+}
diff --git a/lib/includes/Dumpers/JsonDumpGenerator.php
b/lib/includes/Dumpers/JsonDumpGenerator.php
index 20f007f..efad245 100644
--- a/lib/includes/Dumpers/JsonDumpGenerator.php
+++ b/lib/includes/Dumpers/JsonDumpGenerator.php
@@ -83,7 +83,7 @@
}
}
- $json = "]\n"; //TODO: make optional
+ $json = "\n]\n"; //TODO: make optional
$this->writeToDump( $json );
}
diff --git a/lib/includes/IO/EntityIdReader.php
b/lib/includes/IO/EntityIdReader.php
new file mode 100644
index 0000000..b4b62dc
--- /dev/null
+++ b/lib/includes/IO/EntityIdReader.php
@@ -0,0 +1,98 @@
+<?php
+
+namespace Wikibase\IO;
+
+use Disposable;
+use Iterator;
+use Wikibase\DataModel\Entity\EntityId;
+use Wikibase\Lib\EntityIdParser;
+
+/**
+ * EntityIdReader reads entity IDs from a file, one per line.
+ *
+ * @license GPL 2+
+ * @author Daniel Kinzler
+ */
+class EntityIdReader implements Iterator, Disposable {
+
+ /**
+ * @var LineReader
+ */
+ protected $reader;
+
+ /**
+ * @param resource $fileHandle The file to read from.
+ * @param bool $canClose Whether calling dispose() should close the
file handle.
+ * @param bool $autoDispose Whether to automatically call dispose()
when reaching EOF.
+ *
+ * @throws \InvalidArgumentException
+ */
+ public function __construct( $fileHandle, $canClose = true,
$autoDispose = false ) {
+ $this->reader = new LineReader( $fileHandle, $canClose,
$autoDispose );
+ $this->parser = new EntityIdParser(); //TODO: inject?
+ }
+
+ /**
+ * @param string $line
+ * @return EntityId
+ */
+ protected function lineToId( $line ) {
+ $line = trim( $line );
+ $id = $this->parser->parse( $line );
+ //TODO: optionally catch, log & ignore ParseException
+
+ return $id;
+ }
+
+ /**
+ */
+ public function dispose() {
+ $this->reader->dispose();
+ }
+
+ /**
+ * Returns the current ID.
+ *
+ * @link http://php.net/manual/en/iterator.current.php
+ * @return EntityId
+ */
+ public function current() {
+ $line = $this->reader->current();
+ return $this->lineToId( $line );
+ }
+
+ /**
+ * Advance to next ID. Blank lines are skipped.
+ *
+ * @see LineReader::next()
+ */
+ public function next() {
+ do {
+ $this->reader->next();
+ } while ( $this->reader->valid() && trim(
$this->reader->current() ) === '' );
+ }
+
+ /**
+ * @see LineReader::key()
+ * @return int
+ */
+ public function key() {
+ return $this->reader->key();
+ }
+
+ /**
+ * @see LineReader::valid()
+ * @return boolean
+ */
+ public function valid() {
+ return $this->reader->valid();
+ }
+
+ /**
+ * @see LineReader::rewind()
+ */
+ public function rewind() {
+ $this->reader->rewind();
+ }
+
+}
\ No newline at end of file
diff --git a/lib/includes/IO/LineReader.php b/lib/includes/IO/LineReader.php
new file mode 100644
index 0000000..ed2e054
--- /dev/null
+++ b/lib/includes/IO/LineReader.php
@@ -0,0 +1,155 @@
+<?php
+
+namespace Wikibase\IO;
+
+use Disposable;
+use Iterator;
+
+/**
+ * LineReader allows iterating over the lines of a file.
+ * Each line returned will contain the line separator character(s) and all
whitespace.
+ * Concatenating all lines returned by the reader should result in the
original file.
+ *
+ * @license GPL 2+
+ * @author Daniel Kinzler
+ */
+class LineReader implements Iterator, Disposable {
+
+ /**
+ * @var resource
+ */
+ protected $fileHandle;
+
+ /**
+ * Whether dispose() will close the file handle.
+ *
+ * @var bool
+ */
+ protected $canClose;
+
+ /**
+ * Whether dispose() is called automatically when the end of file is
reached.
+ *
+ * @var bool
+ */
+ protected $autoDispose;
+
+ /**
+ * @var string
+ */
+ protected $current = null;
+
+ /**
+ * @var int
+ */
+ protected $line = 0;
+
+ /**
+ * @param resource $fileHandle The file to read from.
+ * @param bool $canClose Whether calling dispose() should close the
file handle.
+ * @param bool $autoDispose Whether to automatically call dispose()
when reaching EOF
+ * or when this reader is destructed.
+ *
+ * @throws \InvalidArgumentException
+ */
+ public function __construct( $fileHandle, $canClose = true,
$autoDispose = false ) {
+ if ( !is_resource( $fileHandle ) ) {
+ throw new \InvalidArgumentException( '$fileHandle must
be a file resource.' );
+ }
+
+ if ( !is_bool( $canClose ) ) {
+ throw new \InvalidArgumentException( '$canClose must be
a boolean.' );
+ }
+
+ if ( !is_bool( $autoDispose ) ) {
+ throw new \InvalidArgumentException( '$autoDispose must
be a boolean.' );
+ }
+
+ $this->fileHandle = $fileHandle;
+
+ $this->canClose = $canClose;
+ $this->autoDispose = $autoDispose;
+ }
+
+ /**
+ * Closes the underlying file handle if the $canClose parameter was
given as
+ * true (the default) in the constructor.
+ */
+ public function dispose() {
+ if ( $this->fileHandle && $this->canClose ) {
+ fclose( $this->fileHandle );
+ }
+
+ $this->fileHandle = false;
+ }
+
+ /**
+ * Destructor, calls dispose() if $autoDispose was set in the
constructor.
+ */
+ public function __destruct() {
+ if ( $this->autoDispose ) {
+ $this->dispose();
+ }
+ }
+
+ /**
+ * Return the current line.
+ *
+ * @link http://php.net/manual/en/iterator.current.php
+ * @return string
+ */
+ public function current() {
+ return $this->current;
+ }
+
+ /**
+ * Reads the next line. Use current() to get the line's content.
+ *
+ * @link http://php.net/manual/en/iterator.next.php
+ */
+ public function next() {
+ $this->current = fgets( $this->fileHandle );
+
+ if ( $this->valid() ) {
+ $this->line++;
+ } elseif ( $this->autoDispose ) {
+ $this->dispose();
+ }
+ }
+
+ /**
+ * Return the current line number.
+ * @link http://php.net/manual/en/iterator.key.php
+ * @return int
+ */
+ public function key() {
+ return $this->line;
+ }
+
+ /**
+ * Checks if current position is valid. Returns true if and only if
+ * next() has been called at least once and the end of file has not yet
been reached.
+ *
+ * @link http://php.net/manual/en/iterator.valid.php
+ * @return boolean whether there is a current line
+ */
+ public function valid() {
+ return is_string( $this->current );
+ }
+
+ /**
+ * Sets the file pointer to the beginning of the file, if supported.
+ * Has no effect if this LineReader has already been disposed.
+ *
+ * @link http://php.net/manual/en/iterator.rewind.php
+ * @return void Any returned value is ignored.
+ */
+ public function rewind() {
+ if ( $this->fileHandle ) {
+ fseek( $this->fileHandle, 0 );
+ $this->current = null;
+
+ $this->next();
+ }
+ }
+}
diff --git a/lib/tests/phpunit/IO/EntityIdReaderTest.txt
b/lib/tests/phpunit/IO/EntityIdReaderTest.txt
new file mode 100644
index 0000000..8eef7b3
--- /dev/null
+++ b/lib/tests/phpunit/IO/EntityIdReaderTest.txt
@@ -0,0 +1,6 @@
+Q1
+P2
+
+ q3
+
+ p4
\ No newline at end of file
diff --git a/lib/tests/phpunit/IO/EntrityIdReaderTest.php
b/lib/tests/phpunit/IO/EntrityIdReaderTest.php
new file mode 100644
index 0000000..7c652d1
--- /dev/null
+++ b/lib/tests/phpunit/IO/EntrityIdReaderTest.php
@@ -0,0 +1,49 @@
+<?php
+
+namespace Wikibase\Test\IO;
+use PHPUnit_Framework_TestCase;
+use Wikibase\DataModel\Entity\ItemId;
+use Wikibase\DataModel\Entity\PropertyId;
+use Wikibase\IO\EntityIdReader;
+
+/**
+ * @covers Wikibase\IO\EntityIdReader
+ *
+ * @ingroup WikibaseLib
+ * @ingroup Test
+ *
+ * @group Wikibase
+ * @group WikibaseLib
+ * @group WikibaseIO
+ *
+ * @license GPL 2+
+ * @author Daniel Kinzler
+ */
+class EntityIdReaderTest extends PHPUnit_Framework_TestCase {
+
+ protected function getTestFile() {
+ return __DIR__ . '/EntityIdReaderTest.txt';
+ }
+
+ protected function openIdReader( $file ) {
+ $handle = fopen( $file, 'r' );
+ return new EntityIdReader( $handle );
+ }
+
+ public function testIteration() {
+ $expected = array(
+ new ItemId( 'Q1' ),
+ new PropertyId( 'P2' ),
+ new ItemId( 'Q3' ),
+ new PropertyId( 'P4' ),
+ );
+
+ $file = $this->getTestFile();
+ $reader = $this->openIdReader( $file );
+ $actual = iterator_to_array( $reader );
+ $reader->dispose();
+
+ $this->assertEmpty( array_diff( $expected, $actual ),
"Different IDs" );
+ }
+
+}
diff --git a/lib/tests/phpunit/IO/LineReaderTest.php
b/lib/tests/phpunit/IO/LineReaderTest.php
new file mode 100644
index 0000000..51f9525
--- /dev/null
+++ b/lib/tests/phpunit/IO/LineReaderTest.php
@@ -0,0 +1,43 @@
+<?php
+
+namespace Wikibase\Test\IO;
+use PHPUnit_Framework_TestCase;
+use Wikibase\IO\LineReader;
+
+/**
+ * @covers Wikibase\IO\LineReader
+ *
+ * @ingroup WikibaseLib
+ * @ingroup Test
+ *
+ * @group Wikibase
+ * @group WikibaseLib
+ * @group WikibaseIO
+ *
+ * @license GPL 2+
+ * @author Daniel Kinzler
+ */
+class LineReaderTest extends PHPUnit_Framework_TestCase {
+
+ protected function getTestFile() {
+ return __DIR__ . '/LineReaderTest.txt';
+ }
+
+ protected function openLineReader( $file ) {
+ $handle = fopen( $file, 'r' );
+ return new LineReader( $handle );
+ }
+
+ public function testIteration() {
+ $file = $this->getTestFile();
+
+ $expected = file( $file );
+
+ $reader = $this->openLineReader( $file );
+ $actual = iterator_to_array( $reader );
+ $reader->dispose();
+
+ $this->assertEmpty( array_diff( $expected, $actual ),
"Different Lines" );
+ }
+
+}
diff --git a/lib/tests/phpunit/IO/LineReaderTest.txt
b/lib/tests/phpunit/IO/LineReaderTest.txt
new file mode 100644
index 0000000..360a4b0
--- /dev/null
+++ b/lib/tests/phpunit/IO/LineReaderTest.txt
@@ -0,0 +1,4 @@
+Hello
+World!
+
+The End.
diff --git a/repo/includes/store/sql/ConvertingResultWrapper.php
b/repo/includes/store/sql/ConvertingResultWrapper.php
index ab5a9f0..daef172 100644
--- a/repo/includes/store/sql/ConvertingResultWrapper.php
+++ b/repo/includes/store/sql/ConvertingResultWrapper.php
@@ -13,6 +13,9 @@
*
* @licence GNU GPL v2+
* @author Daniel Kinzler
+ *
+ * @todo: this should implement Disposable and know a LoadBalancer instance, so
+ * we can recycle the DB connection when done.
*/
abstract class ConvertingResultWrapper implements Iterator {
diff --git a/repo/maintenance/dumpJson.php b/repo/maintenance/dumpJson.php
index 5519ca8..116add7 100644
--- a/repo/maintenance/dumpJson.php
+++ b/repo/maintenance/dumpJson.php
@@ -1,10 +1,13 @@
<?php
namespace Wikibase;
+use Disposable;
use Iterator;
use Maintenance;
+use Traversable;
use ValueFormatters\FormatterOptions;
use Wikibase\Dumpers\JsonDumpGenerator;
+use Wikibase\IO\EntityIdReader;
use Wikibase\Lib\EntityIdFormatter;
use Wikibase\Lib\Serializers\EntitySerializationOptions;
use Wikibase\Lib\Serializers\EntitySerializer;
@@ -47,11 +50,12 @@
$this->mDescription = 'Generate a JSON dump from entities in
the repository.';
- //TODO: read list of IDs from file
//TODO: filter by entity type
+ //TODO: shard by id congruence class ( id % n == m )
//$this->addOption( 'rebuild-all', "Update property info for
all properties (per default, only missing entries are created)" );
//$this->addOption( 'start-row', "The ID of the first row to
update (useful for continuing aborted runs)", false, true );
- //$this->addOption( 'batch-size', "Number of rows to update per
database transaction (100 per default)", false, true );
+
+ $this->addOption( 'list-file', "A file containing one entity ID
per line", false, true );
}
public function finalSetup() {
@@ -85,16 +89,43 @@
$idStream = $this->makeIdStream();
$dumper->generateDump( $idStream );
+
+ if ( $idStream instanceof Disposable ) {
+ // close stream / free resources
+ $idStream->dispose();
+ }
}
/**
* @return Iterator a stream of EntityId objects
*/
public function makeIdStream() {
- //TODO: provide list/filter of entities
- //TODO: allow ids to be read from a file
+ $listFile = $this->getOption( 'list-file' );
- $stream = $this->entityPerPage->getEntities();
+ if ( $listFile !== null ) {
+ //TODO: allow filtering by entity type, id congruence
class ( id % n == m ), etc.
+ $stream = $this->makeIdFileStream( $listFile );
+ } else {
+ $stream = $this->entityPerPage->getEntities();
+ }
+
+ return $stream;
+ }
+
+ /**
+ * @param $listFile
+ *
+ * @return Traversable
+ * @throws \MWException
+ */
+ protected function makeIdFileStream( $listFile ) {
+ $input = fopen( $listFile, 'r' );
+
+ if ( !$input ) {
+ throw new \MWException( "Failed to open ID file:
$input" );
+ }
+
+ $stream = new EntityIdReader( $input );
return $stream;
}
}
--
To view, visit https://gerrit.wikimedia.org/r/84000
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie101cc58b0218d623e73b832a4fc9d45f1d1ac4a
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits