Daniel Kinzler has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/118277

Change subject: (bug 62381) Chunked ID query for JSON dumper.
......................................................................

(bug 62381) Chunked ID query for JSON dumper.

When listing entities, don't list all IDs in one huge result set.

Change-Id: I842b1ef3dcc07ff437112b86aab5a5a0e64198d5
---
M lib/includes/IO/EntityIdReader.php
M lib/includes/IO/LineReader.php
M lib/tests/phpunit/IO/EntrityIdReaderTest.php
M repo/includes/Dumpers/JsonDumpGenerator.php
A repo/includes/store/EntityIdPager.php
M repo/includes/store/EntityPerPage.php
D repo/includes/store/sql/ConvertingResultWrapper.php
D repo/includes/store/sql/DatabaseRowEntityIdIterator.php
M repo/includes/store/sql/EntityPerPageTable.php
M repo/maintenance/dumpJson.php
M repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
M repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php
12 files changed, 454 insertions(+), 312 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase 
refs/changes/77/118277/1

diff --git a/lib/includes/IO/EntityIdReader.php 
b/lib/includes/IO/EntityIdReader.php
index 1fab66d..82f9d62 100644
--- a/lib/includes/IO/EntityIdReader.php
+++ b/lib/includes/IO/EntityIdReader.php
@@ -4,10 +4,10 @@
 
 use Disposable;
 use ExceptionHandler;
-use Iterator;
-use Wikibase\DataModel\Entity\BasicEntityIdParser;
 use Wikibase\DataModel\Entity\EntityId;
+use Wikibase\DataModel\Entity\EntityIdParser;
 use Wikibase\DataModel\Entity\EntityIdParsingException;
+use Wikibase\EntityIdPager;
 
 /**
  * EntityIdReader reads entity IDs from a file, one per line.
@@ -15,7 +15,7 @@
  * @license GPL 2+
  * @author Daniel Kinzler
  */
-class EntityIdReader implements Iterator, Disposable {
+class EntityIdReader implements EntityIdPager, Disposable {
 
        /**
         * @var LineReader
@@ -33,15 +33,12 @@
        protected $current = null;
 
        /**
-        * @param resource $fileHandle The file to read from.
-        * @param bool $canClose Whether calling dispose() should close the 
fine handle.
-        * @param bool $autoDispose Whether to automatically call dispose() 
when reaching EOF.
-        *
-        * @throws \InvalidArgumentException
+        * @param LineReader $reader
+        * @param \Wikibase\DataModel\Entity\EntityIdParser $parser
         */
-       public function __construct( $fileHandle, $canClose = true, 
$autoDispose = false ) {
-               $this->reader = new LineReader( $fileHandle, $canClose, 
$autoDispose );
-               $this->parser = new BasicEntityIdParser(); //FIXME: inject!
+       public function __construct( LineReader $reader, EntityIdParser $parser 
) {
+               $this->reader = $reader;
+               $this->parser = $parser;
 
                $this->exceptionHandler = new \RethrowingExceptionHandler();
        }
@@ -85,13 +82,13 @@
        }
 
        /**
-        * Returns the current ID, or, if that has been consumed, finds the 
next ID on
-        * the input stream and return it.
+        * Returns the next ID (or null if there are no more ids).
         *
         * @return EntityId|null
         */
-       protected function fill() {
-               while ( $this->current === null && $this->reader->valid() ) {
+       protected function next() {
+               $id = null;
+               while ( $id === null && $this->reader->valid() ) {
                        $line = trim( $this->reader->current() );
                        $this->reader->next();
 
@@ -99,55 +96,55 @@
                                continue;
                        }
 
-                       $this->current = $this->lineToId( $line );
+                       $id = $this->lineToId( $line );
                };
 
-               return $this->current;
-       }
-
-       /**
-        * Returns the current ID.
-        *
-        * @link http://php.net/manual/en/iterator.current.php
-        * @return EntityId
-        */
-       public function current() {
-               $id = $this->fill();
                return $id;
        }
 
        /**
-        * Advance to next ID. Blank lines are skipped.
+        * Lists the IDs of entities of the given type.
         *
-        * @see LineReader::next()
+        * This supports paging via the $offset parameter: to get the next 
batch of IDs,
+        * call listEntities() again with the $offset provided by the previous 
call
+        * to listEntities().
+        *
+        * @since 0.5
+        *
+        * @param string|null $entityType The type of entity to return, or null 
for any type.
+        * @param int $limit The maximum number of IDs to return.
+        * @param mixed &$offset A position marker representing the position to 
start listing from;
+        * Will be updated to represent the position for the next batch of IDs.
+        * Callers should make no assumptions about the type or content of 
$offset.
+        * Use null (the default) to start with the first ID.
+        *
+        * @return \Wikibase\EntityId[] A list of EntityIds matching the given 
parameters. Will
+        * be empty if there are no more entities to list from the given offset.
         */
-       public function next() {
-               $this->current = null; // consume current
-               $this->fill();
-       }
+       public function listEntities( $entityType, $limit, &$offset = null ) {
+               if ( $offset === null ) {
+                       $offset = $this->reader->getPosition();
+               }
 
-       /**
-        * @see LineReader::key()
-        * @return int
-        */
-       public function key() {
-               return $this->reader->key();
-       }
+               $this->reader->setPosition( $offset );
 
-       /**
-        * @see LineReader::valid()
-        * @return boolean
-        */
-       public function valid() {
-               return $this->fill() !== null;
-       }
+               $ids = array();
+               while ( $limit > 0 ) {
+                       $id = $this->next();
 
-       /**
-        * @see LineReader::rewind()
-        */
-       public function rewind() {
-               $this->current = null;
-               $this->reader->rewind();
-       }
+                       if ( $id === null ) {
+                               break;
+                       }
 
+                       if ( $entityType !== null && $id->getEntityType() !== 
$entityType ) {
+                               continue;
+                       }
+
+                       $ids[] = $id;
+                       $limit--;
+               }
+
+               $offset = $this->reader->getPosition();
+               return $ids;
+       }
 }
\ No newline at end of file
diff --git a/lib/includes/IO/LineReader.php b/lib/includes/IO/LineReader.php
index ed2e054..e5c5db1 100644
--- a/lib/includes/IO/LineReader.php
+++ b/lib/includes/IO/LineReader.php
@@ -145,11 +145,35 @@
         * @return void Any returned value is ignored.
         */
        public function rewind() {
-               if ( $this->fileHandle ) {
-                       fseek( $this->fileHandle, 0 );
-                       $this->current = null;
+               $this->setPosition( 0 );
+       }
 
+       /**
+        * Set the underlying file pointer to the given position (using fseek).
+        * For use with getPosition().
+        *
+        * @param int $offset
+        */
+       public function setPosition( $offset ) {
+               if ( $this->fileHandle ) {
+                       fseek( $this->fileHandle, $offset );
+
+                       $this->current = null;
                        $this->next();
                }
        }
+
+       /**
+        * Get the position of the underlying file pointer (using ftell).
+        * For use with setPosition().
+        *
+        * @return int|bool The position of the file pointer, or false upon 
error.
+        */
+       public function getPosition() {
+               if ( $this->fileHandle ) {
+                       return ftell( $this->fileHandle );
+               } else {
+                       return false;
+               }
+       }
 }
diff --git a/lib/tests/phpunit/IO/EntrityIdReaderTest.php 
b/lib/tests/phpunit/IO/EntrityIdReaderTest.php
index d5f4c7e..6491f2c 100644
--- a/lib/tests/phpunit/IO/EntrityIdReaderTest.php
+++ b/lib/tests/phpunit/IO/EntrityIdReaderTest.php
@@ -2,9 +2,14 @@
 
 namespace Wikibase\Test\IO;
 
+use Wikibase\DataModel\Entity\BasicEntityIdParser;
+use Wikibase\DataModel\Entity\EntityId;
+use Wikibase\DataModel\Entity\Item;
 use Wikibase\DataModel\Entity\ItemId;
+use Wikibase\DataModel\Entity\Property;
 use Wikibase\DataModel\Entity\PropertyId;
 use Wikibase\IO\EntityIdReader;
+use Wikibase\IO\LineReader;
 
 /**
  * @covers Wikibase\IO\EntityIdReader
@@ -25,25 +30,103 @@
        protected function openIdReader( $file ) {
                $path = __DIR__ . '/' . $file;
                $handle = fopen( $path, 'r' );
-               return new EntityIdReader( $handle );
+               return new EntityIdReader( new LineReader( $handle ), new 
BasicEntityIdParser() );
        }
 
-       public function testIteration() {
-               $expected = array(
-                       new ItemId( 'Q1' ),
-                       new PropertyId( 'P2' ),
-                       new ItemId( 'Q3' ),
-                       new PropertyId( 'P4' ),
+
+       protected function getIdStrings( array $entityIds ) {
+               $ids = array_map( function ( EntityId $entityId ) {
+                       return $entityId->getSerialization();
+               }, $entityIds );
+
+               return $ids;
+       }
+
+       protected function assertEqualIds( array $expected,array $actual, $msg 
= null ) {
+               $expectedIds = array_values( $this->getIdStrings( $expected ) );
+               $actualIds = array_values( $this->getIdStrings( $actual ) );
+
+               sort( $expectedIds );
+               sort( $actualIds );
+               $this->assertEquals( $expectedIds, $actualIds, $msg );
+       }
+
+       /**
+        * @dataProvider listEntitiesProvider
+        */
+       public function testListEntities( $file, $type, $limit, array $expected 
) {
+               $reader = $this->openIdReader( $file );
+
+               $actual = $reader->listEntities( $type, $limit );
+
+               $this->assertEqualIds( $expected, $actual );
+       }
+
+       public static function listEntitiesProvider() {
+               $q1 = new ItemId( 'Q1' );
+               $p2 = new PropertyId( 'P2' );
+               $q3 = new ItemId( 'Q3' );
+               $p4 = new PropertyId( 'P4' );
+
+               return array(
+                       'all' => array(
+                               'EntityIdReaderTest.txt', null, 100, array( 
$q1, $p2, $q3, $p4 )
+                       ),
+                       'just properties' => array(
+                               'EntityIdReaderTest.txt', 
Property::ENTITY_TYPE, 100, array( $p2, $p4 )
+                       ),
+                       'limit' => array(
+                               'EntityIdReaderTest.txt', null, 2, array( $q1, 
$p2 )
+                       ),
+                       'limit and filter' => array(
+                               'EntityIdReaderTest.txt', Item::ENTITY_TYPE, 1, 
array( $q1 )
+                       ),
                );
+       }
+       /**
+        * @dataProvider listEntitiesProvider_paging
+        */
+       public function testListEntities_paging( $file, $type, $limit, array 
$expectedChunks ) {
+               $reader = $this->openIdReader( $file );
 
-               $reader = $this->openIdReader( 'EntityIdReaderTest.txt' );
-               $actual = iterator_to_array( $reader );
-               $reader->dispose();
+               foreach ( $expectedChunks as $expected ) {
+                       $actual = $reader->listEntities( $type, $limit, $offset 
);
 
-               $this->assertEmpty( array_diff( $expected, $actual ), 
"Different IDs" );
+                       $this->assertEqualIds( $expected, $actual );
+               }
        }
 
-       public function testIterationWithErrors() {
+       public static function listEntitiesProvider_paging() {
+               $q1 = new ItemId( 'Q1' );
+               $p2 = new PropertyId( 'P2' );
+               $q3 = new ItemId( 'Q3' );
+               $p4 = new PropertyId( 'P4' );
+
+               return array(
+                       'limit' => array(
+                               'EntityIdReaderTest.txt',
+                               null,
+                               2,
+                               array (
+                                       array( $q1, $p2 ),
+                                       array( $q3, $p4 ),
+                                       array(),
+                               )
+                       ),
+                       'limit and filter' => array(
+                               'EntityIdReaderTest.txt',
+                               Item::ENTITY_TYPE,
+                               1,
+                               array(
+                                       array( $q1 ),
+                                       array( $q3 ),
+                                       array(),
+                               )
+                       )
+               );
+       }
+
+       public function testErrorHandler() {
                $expected = array(
                        new ItemId( 'Q23' ),
                        new PropertyId( 'P42' ),
@@ -56,10 +139,10 @@
                $reader = $this->openIdReader( 'EntityIdReaderTest.bad.txt' );
                $reader->setExceptionHandler( $exceptionHandler );
 
-               $actual = iterator_to_array( $reader );
+               $actual = $reader->listEntities( null, 100 );
                $reader->dispose();
 
-               $this->assertEmpty( array_diff( $expected, $actual ), 
"Different IDs" );
+               $this->assertEqualIds( $expected, $actual );
        }
 
 }
diff --git a/repo/includes/Dumpers/JsonDumpGenerator.php 
b/repo/includes/Dumpers/JsonDumpGenerator.php
index 8c50232..81c615e 100644
--- a/repo/includes/Dumpers/JsonDumpGenerator.php
+++ b/repo/includes/Dumpers/JsonDumpGenerator.php
@@ -8,8 +8,8 @@
 use MWException;
 use NullMessageReporter;
 use RethrowingExceptionHandler;
-use Traversable;
 use Wikibase\DataModel\Entity\EntityId;
+use Wikibase\EntityIdPager;
 use Wikibase\EntityLookup;
 use Wikibase\Lib\Serializers\Serializer;
 use Wikibase\StorageException;
@@ -32,7 +32,7 @@
        /**
         * @var int interval at which to output progress messages.
         */
-       public $progressInterval = 100;
+       public $batchSize = 100;
 
        /**
         * @var resource File handle for output
@@ -95,12 +95,20 @@
        }
 
        /**
-        * Sets the interval for progress reporting
+        * Sets the batch size for processing. The batch size is used as the 
limit
+        * when listing IDs via the EntityIdPager::listEntities() method, and
+        * also controls the interval of progress reports.
         *
-        * @param int $progressInterval
+        * @param int $batchSize
+        *
+        * @throws \InvalidArgumentException
         */
-       public function setProgressInterval( $progressInterval ) {
-               $this->progressInterval = $progressInterval;
+       public function setBatchSize( $batchSize ) {
+               if ( !is_int( $batchSize ) || $batchSize < 1 ) {
+                       throw new InvalidArgumentException( '$batchSize must be 
a positive integer' );
+               }
+
+               $this->batchSize = $batchSize;
        }
 
        /**
@@ -108,8 +116,8 @@
         *
         * @return int
         */
-       public function getProgressInterval() {
-               return $this->progressInterval;
+       public function getBatchSize() {
+               return $this->batchSize;
        }
 
        /**
@@ -182,24 +190,43 @@
        /**
         * Generates a JSON dump, writing to the file handle provided to the 
constructor.
         *
-        * @param Traversable $idStream an Iterator that returns EntityId 
instances
+        * @param EntityIdPager $idPager an Iterator that returns EntityId 
instances
         */
-       public function generateDump( Traversable $idStream ) {
+       public function generateDump( EntityIdPager $idPager ) {
 
                $json = "[\n"; //TODO: make optional
                $this->writeToDump( $json );
 
-               $i = 0;
-               $wantComma = false;
+               $dumpCount = 0;
 
+               while ( true ) {
+                       $ids = $idPager->listEntities( $this->entityType, 
$this->batchSize, $offset );
+
+                       if ( empty( $ids ) ) {
+                               break;
+                       }
+
+                       $this->dumpEntities( $ids, $dumpCount );
+
+                       $this->progressReporter->reportMessage( 'Processed ' . 
$dumpCount . ' entities.' );
+               };
+
+               $json = "\n]\n"; //TODO: make optional
+               $this->writeToDump( $json );
+       }
+
+       /**
+        * @param array $ids
+        * @param int &$dumpCount The number of entities already dumped (will 
be updated).
+        */
+       protected function dumpEntities( array $ids, &$dumpCount ) {
                /* @var EntityId $id */
-               foreach ( $idStream as $id ) {
+               foreach ( $ids as $id ) {
                        if ( !$this->idMatchesFilters( $id ) ) {
                                continue;
                        }
 
                        try {
-                               $i++;
                                $entity = $this->entityLookup->getEntity( $id );
 
                                if ( !$entity ) {
@@ -209,26 +236,16 @@
                                $data = $this->entitySerializer->getSerialized( 
$entity );
                                $json = $this->encode( $data );
 
-                               if ( $wantComma ) {
+                               if ( $dumpCount > 0 ) {
                                        $this->writeToDump( ",\n" );
-                                       $wantComma = false;
                                }
 
                                $this->writeToDump( $json );
-                               $wantComma = true;
+                               $dumpCount++;
                        } catch ( StorageException $ex ) {
                                $this->exceptionHandler->handleException( $ex, 
'failed-to-dump', 'Failed to dump '. $id );
                        }
-
-                       if ( $this->progressInterval > 0 && ( $i % 
$this->progressInterval ) === 0 ) {
-                               $this->progressReporter->reportMessage( 
'Processed ' . $i . ' entities.' );
-                       }
                }
-
-               $this->progressReporter->reportMessage( 'Processed ' . $i . ' 
entities.' );
-
-               $json = "\n]\n"; //TODO: make optional
-               $this->writeToDump( $json );
        }
 
        private function idMatchesFilters( EntityId $id ) {
diff --git a/repo/includes/store/EntityIdPager.php 
b/repo/includes/store/EntityIdPager.php
new file mode 100644
index 0000000..b2c8d0f
--- /dev/null
+++ b/repo/includes/store/EntityIdPager.php
@@ -0,0 +1,35 @@
+<?php
+
+namespace Wikibase;
+
+/**
+ * A service for paging through EntityIds.
+ *
+ * @since 0.5
+ *
+ * @licence GNU GPL v2+
+ * @author Daniel Kinzler
+ */
+interface EntityIdPager {
+
+       /**
+        * Lists the IDs of entities of the given type.
+        *
+        * This supports paging via the $offset parameter: to get the next 
batch of IDs,
+        * call listEntities() again with the $offset provided by the previous 
call
+        * to listEntities().
+        *
+        * @since 0.5
+        *
+        * @param string|null $entityType The type of entity to return, or null 
for any type.
+        * @param int $limit The maximum number of IDs to return.
+        * @param mixed &$offset A position marker representing the position to 
start listing from;
+        * Will be updated to represent the position for the next batch of IDs.
+        * Callers should make no assumptions about the type or content of 
$offset.
+        * Use null (the default) to start with the first ID.
+        *
+        * @return EntityId[] A list of EntityIds matching the given 
parameters. Will
+        * be empty if there are no more entities to list from the given offset.
+        */
+       public function listEntities( $entityType, $limit, &$offset = null );
+}
diff --git a/repo/includes/store/EntityPerPage.php 
b/repo/includes/store/EntityPerPage.php
index 3d976d2..17b6844 100644
--- a/repo/includes/store/EntityPerPage.php
+++ b/repo/includes/store/EntityPerPage.php
@@ -13,7 +13,7 @@
  * @licence GNU GPL v2+
  * @author Thomas Pellissier Tanon
  */
-interface EntityPerPage {
+interface EntityPerPage extends EntityIdPager {
 
        /**
         * Adds a new link between an entity and a page
@@ -100,13 +100,4 @@
         * @return EntityId[]
         */
        public function getItemsWithoutSitelinks( $siteId = null, $limit = 50, 
$offset = 0 );
-
-       /**
-        * Returns an iterator providing an EntityId object for each entity.
-        *
-        * @param string $entityType The type of entity to return, or null for 
any type.
-        *
-        * @return Iterator
-        */
-       public function getEntities( $entityType = null );
 }
diff --git a/repo/includes/store/sql/ConvertingResultWrapper.php 
b/repo/includes/store/sql/ConvertingResultWrapper.php
deleted file mode 100644
index 7dd8fd2..0000000
--- a/repo/includes/store/sql/ConvertingResultWrapper.php
+++ /dev/null
@@ -1,85 +0,0 @@
-<?php
-
-namespace Wikibase;
-
-use Iterator;
-use ResultWrapper;
-
-/**
- * Base class for iterators that convert each row of a database result into an 
appropriate object.
- *
- * @since 0.5
- *
- * @licence GNU GPL v2+
- * @author Daniel Kinzler
- *
- * @todo: this should implement Disposable know a LoadBalancer instance, so
- *        we can recycle the DB connection when done.
- */
-abstract class ConvertingResultWrapper implements Iterator {
-
-       /**
-        * @var \ResultWrapper
-        */
-       protected $rows;
-
-       /**
-        * @param ResultWrapper $rows
-        */
-       public function __construct( ResultWrapper $rows ) {
-               $this->rows = $rows;
-       }
-
-       /**
-        * @see Iterator::current()
-        * @see ResultWrapper::current()
-        */
-       public function current() {
-               $current = $this->rows->current();
-               $current = $this->convert( $current );
-               return $current;
-       }
-
-       /**
-        * @see Iterator::next()
-        * @see ResultWrapper::next()
-        */
-       public function next() {
-               $this->rows->next();
-       }
-
-       /**
-        * @see Iterator::key()
-        * @see ResultWrapper::key()
-        * @return mixed scalar or null
-        */
-       public function key() {
-               return $this->rows->key();
-       }
-
-       /**
-        * @see Iterator::valid()
-        * @see ResultWrapper::valid()
-        * @return bool
-        */
-       public function valid() {
-               return $this->rows->valid();
-       }
-
-       /**
-        * @see Iterator::rewind()
-        * @see ResultWrapper::rewind()
-        */
-       public function rewind() {
-               $this->rows->rewind();
-       }
-
-       /**
-        * Converts a database row into the desired representation.
-        *
-        * @param object $row An object representing the raw database row, as 
returned by ResultWrapper::current().
-        *
-        * @return mixed
-        */
-       protected abstract function convert( $row );
-}
diff --git a/repo/includes/store/sql/DatabaseRowEntityIdIterator.php 
b/repo/includes/store/sql/DatabaseRowEntityIdIterator.php
deleted file mode 100644
index 49ae2da..0000000
--- a/repo/includes/store/sql/DatabaseRowEntityIdIterator.php
+++ /dev/null
@@ -1,65 +0,0 @@
-<?php
-
-namespace Wikibase;
-
-use ResultWrapper;
-use Wikibase\DataModel\Entity\ItemId;
-use Wikibase\DataModel\Entity\PropertyId;
-
-/**
- * Allows a database result set containing entity IDs to be iterated as 
EntityId objects.
- *
- * @since 0.5
- *
- * @licence GNU GPL v2+
- * @author Daniel Kinzler
- */
-class DatabaseRowEntityIdIterator extends ConvertingResultWrapper {
-
-       /**
-        * @var string
-        */
-       protected $idField;
-
-       /**
-        * @var string
-        */
-       protected $typeField;
-
-       public function __construct( ResultWrapper $rows, $typeField, $idField 
) {
-               parent::__construct( $rows );
-
-               $this->idField = $idField;
-               $this->typeField = $typeField;
-       }
-
-       /**
-        * Converts a database row into the desired representation.
-        *
-        * @param object $row An object representing the raw database row, as 
returned by ResultWrapper::current().
-        *
-        * @throws \RuntimeException
-        * @return EntityId
-        */
-       protected function convert( $row ) {
-               $idField = $this->idField; //PHP fails
-               $typeField = $this->typeField; //PHP fails
-
-               $id = (int)$row->$idField;
-               $type = $row->$typeField;
-
-               //TODO: use an EntityIdFactory here
-               switch ( $type ) {
-                       case 'item':
-                               $entityId = new ItemId( "Q$id" );
-                               break;
-                       case 'property':
-                               $entityId = new PropertyId( "P$id" );
-                               break;
-                       default:
-                               throw new \RuntimeException( "Unknown entity 
type: $type" );
-               }
-
-               return $entityId;
-       }
-}
diff --git a/repo/includes/store/sql/EntityPerPageTable.php 
b/repo/includes/store/sql/EntityPerPageTable.php
index 19e260b..6b3e37e 100644
--- a/repo/includes/store/sql/EntityPerPageTable.php
+++ b/repo/includes/store/sql/EntityPerPageTable.php
@@ -2,6 +2,7 @@
 
 namespace Wikibase;
 
+use InvalidArgumentException;
 use Iterator;
 use Wikibase\DataModel\Entity\BasicEntityIdParser;
 use Wikibase\DataModel\Entity\ItemId;
@@ -11,6 +12,9 @@
  * Corresponds to the wb_entities_per_page table.
  *
  * @since 0.2
+ *
+ * @todo: make this extend DBAccessBase, so it can be used to access the 
repo's EPP table
+ * from a client!
  *
  * @licence GNU GPL v2+
  * @author Thomas Pellissier Tanon
@@ -220,32 +224,64 @@
        }
 
        /**
-        * Returns an iterator providing an EntityId object for each entity.
+        * @see EntityIdPager::listEntities
         *
-        * @see EntityPerPage::getEntities
         * @param null|string $entityType
+        * @param int $limit
+        * @param mixed &$offset
         *
-        * @return Iterator
+        * @throws \InvalidArgumentException
+        * @return EntityId[]
         */
-       public function getEntities( $entityType = null ) {
-               //XXX: Would be nice to get the DBR from a load balancer and 
allow access to foreign wikis.
-               // But since we return a ResultWrapper, we don't know when we 
can release the connection for re-use.
-
-               if ( $entityType !== null ) {
-                       $where = array( 'epp_entity_type' => $entityType );
-               } else {
+       public function listEntities( $entityType, $limit, &$offset = null ) {
+               if ( $entityType == null  ) {
                        $where = array();
+               } elseif ( !is_string( $entityType ) ) {
+                       throw new InvalidArgumentException( '$entityType must 
be a string (or null)' );
+               } else {
+                       $where = array( 'epp_entity_type' => $entityType );
+               }
+
+               if ( !is_int( $limit ) || $limit < 1 ) {
+                       throw new InvalidArgumentException( '$limit must be a 
positive integer' );
                }
 
                $dbr = wfGetDB( DB_SLAVE );
+
+               if ( $offset ) {
+                       if ( !$offset instanceof EntityId ) {
+                               throw new InvalidArgumentException( 'Bad 
$offset, use an offset supplied be an earlier call of listEntities() (or use 
null).' );
+                       }
+
+                       /* @var EntityId $offset */
+                       if ( $entityType === null ) {
+                               // Ugly. About time we switch to qualified, 
string based IDs!
+                               $where[] = '( ( epp_entity_type = ' . 
$dbr->addQuotes( $offset->getEntityType() ) .
+                                               'AND epp_entity_id > ' . 
$offset->getNumericId() . ' ) ' .
+                                               ' OR epp_entity_type > ' . 
$dbr->addQuotes( $offset->getEntityType() ) . ' )';
+                       } else {
+                               $where[] = 'epp_entity_id > ' . 
$offset->getNumericId();
+                       }
+               }
+
                $rows = $dbr->select(
                        'wb_entity_per_page',
-                       array( 'epp_entity_id', 'epp_entity_type' ),
+                       array( 'entity_type' => 'epp_entity_type', 'entity_id' 
=> 'epp_entity_id' ),
                        $where,
-                       __METHOD__
+                       __METHOD__,
+                       array(
+                               'ORDER BY' => array( 'epp_entity_type', 
'epp_entity_id' ),
+                               'LIMIT' => $limit
+                       )
                );
 
-               return new DatabaseRowEntityIdIterator( $rows, 
'epp_entity_type', 'epp_entity_id' );
+               $ids = self::getEntityIdsFromRows( $rows );
+
+               if ( !empty( $ids ) ) {
+                       $offset = $ids[ count( $ids ) - 1 ];
+               }
+
+               return $ids;
        }
 
 }
diff --git a/repo/maintenance/dumpJson.php b/repo/maintenance/dumpJson.php
index 68c9ed7..d2dee7d 100644
--- a/repo/maintenance/dumpJson.php
+++ b/repo/maintenance/dumpJson.php
@@ -4,12 +4,12 @@
 
 use Disposable;
 use ExceptionHandler;
-use Iterator;
 use Maintenance;
 use Traversable;
-use ValueFormatters\FormatterOptions;
+use Wikibase\DataModel\Entity\BasicEntityIdParser;
 use Wikibase\Dumpers\JsonDumpGenerator;
 use Wikibase\IO\EntityIdReader;
+use Wikibase\IO\LineReader;
 use Wikibase\Lib\Serializers\DispatchingEntitySerializer;
 use Wikibase\Lib\Serializers\SerializationOptions;
 use Wikibase\Lib\Serializers\Serializer;
@@ -58,7 +58,8 @@
                $this->addOption( 'list-file', "A file containing one entity ID 
per line.", false, true );
                $this->addOption( 'entity-type', "Only dump this kind of 
entity, e.g. `item` or `property`.", false, true );
                $this->addOption( 'sharding-factor', "The number of shards 
(must be >= 1)", false, true );
-               $this->addOption( 'shard', "The shard to output (must be less 
than the sharding-factor) ", false, true );
+               $this->addOption( 'shard', "The shard to output (must be less 
than the sharding-factor)", false, true );
+               $this->addOption( 'batch-size', "The number of entities per 
processing batch", false, true );
                $this->addOption( 'output', "Output file (default is stdout). 
Will be overwritten.", false, true );
                $this->addOption( 'log', "Log file (default is stderr). Will be 
appended.", false, true );
                $this->addOption( 'quiet', "Disable progress reporting", false, 
false );
@@ -145,6 +146,7 @@
                $entityType = $this->getOption( 'entity-type' );
                $shardingFactor = (int)$this->getOption( 'sharding-factor', 1 );
                $shard = (int)$this->getOption( 'shard', 0 );
+               $batchSize = (int)$this->getOption( 'batch-size', 100 );
 
                //TODO: Allow injection of an OutputStream for logging
                $this->openLogFile( $this->getOption( 'log', 'php://stderr' ) );
@@ -186,8 +188,9 @@
                //      but filtering in the dumper is needed when working from 
a list file.
                $dumper->setShardingFilter( $shardingFactor, $shard );
                $dumper->setEntityTypeFilter( $entityType );
+               $dumper->setBatchSize( $batchSize );
 
-               $idStream = $this->makeIdStream( $entityType, 
$exceptionReporter );
+               $idStream = $this->makeIdStream( $exceptionReporter );
                $dumper->generateDump( $idStream );
 
                if ( $idStream instanceof Disposable ) {
@@ -199,18 +202,17 @@
        }
 
        /**
-        * @param string|null $entityType
         * @param \ExceptionHandler $exceptionReporter
         *
-        * @return Iterator a stream of EntityId objects
+        * @return EntityIdPager a stream of EntityId objects
         */
-       public function makeIdStream( $entityType = null, ExceptionHandler 
$exceptionReporter = null ) {
+       public function makeIdStream( ExceptionHandler $exceptionReporter = 
null ) {
                $listFile = $this->getOption( 'list-file' );
 
                if ( $listFile !== null ) {
                        $stream = $this->makeIdFileStream( $listFile, 
$exceptionReporter );
                } else {
-                       $stream = $this->entityPerPage->getEntities( 
$entityType );
+                       $stream = $this->entityPerPage;
                }
 
                return $stream;
@@ -230,7 +232,7 @@
                        throw new \MWException( "Failed to open ID file: 
$input" );
                }
 
-               $stream = new EntityIdReader( $input );
+               $stream = new EntityIdReader( new LineReader( $input ), new 
BasicEntityIdParser() );
                $stream->setExceptionHandler( $exceptionReporter );
 
                return $stream;
diff --git a/repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php 
b/repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
index c09d84a..dd8899c 100644
--- a/repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
+++ b/repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php
@@ -2,19 +2,19 @@
 
 namespace Wikibase\Test\Dumpers;
 
-use ArrayObject;
+use Wikibase\DataModel\Entity\Entity;
 use Wikibase\DataModel\Entity\EntityId;
 use Wikibase\DataModel\Entity\ItemId;
+use Wikibase\DataModel\Entity\Property;
 use Wikibase\DataModel\Entity\PropertyId;
-use Wikibase\DataModel\SimpleSiteLink;
+use Wikibase\DataModel\SiteLink;
 use Wikibase\Dumpers\JsonDumpGenerator;
-use Wikibase\Entity;
 use Wikibase\EntityFactory;
+use Wikibase\EntityIdPager;
 use Wikibase\Item;
 use Wikibase\Lib\Serializers\DispatchingEntitySerializer;
 use Wikibase\Lib\Serializers\SerializationOptions;
 use Wikibase\Lib\Serializers\SerializerFactory;
-use Wikibase\Property;
 
 /**
  * @covers Wikibase\Dumpers\JsonDumpGenerator
@@ -79,7 +79,7 @@
                }
 
                if ( $entity instanceof Item ) {
-                       $entity->addSiteLink( new SimpleSiteLink( 'test', 'Foo' 
) );
+                       $entity->addSiteLink( new SiteLink( 'test', 'Foo' ) );
                }
 
                return $entity;
@@ -115,6 +115,58 @@
        }
 
        /**
+        * Callback for providing dummy entity lists for the EntityIdPager mock.
+        *
+        * @param array $ids
+        * @param $entityType
+        * @param $limit
+        * @param null $offset
+        *
+        * @return array
+        */
+       protected function listEntities ( array $ids, $entityType, $limit, 
&$offset = null ) {
+               $result = array();
+               $size = count( $ids );
+
+               if ( $offset === null ) {
+                       $offset = 0;
+               }
+
+               for ( ; $offset < $size && count( $result ) < $limit; $offset++ 
) {
+                       /* @var EntityId $id */
+                       $id = $ids[ $offset ];
+
+                       if ( $entityType !== null && $entityType !== 
$id->getEntityType() ) {
+                               continue;
+                       }
+
+                       $result[] = $id;
+               }
+
+               return $result;
+       }
+
+       /**
+        * @param array $ids
+        *
+        * @return EntityIdPager
+        */
+       protected function makeIdPager( array $ids ) {
+               $pager = $this->getMock( 'Wikibase\EntityIdPager' );
+
+               $this_ = $this;
+               $pager->expects( $this->any() )
+                       ->method( 'listEntities' )
+                       ->will( $this->returnCallback(
+                               function ( $entityType, $limit, &$offset = null 
) use ( $ids, $this_ ) {
+                                       return $this_->listEntities( $ids, 
$entityType, $limit, $offset );
+                               }
+                       ) );
+
+               return $pager;
+       }
+
+       /**
         * @dataProvider idProvider
         */
        public function testGenerateDump( array $ids ) {
@@ -136,12 +188,12 @@
         */
        public function testTypeFilterDump( array $ids, $type, $expectedIds ) {
                $dumper = $this->newDumpGenerator( $ids );
-               $idList = new ArrayObject( $ids );
+               $pager = $this->makeIdPager( $ids );
 
                $dumper->setEntityTypeFilter( $type );
 
                ob_start();
-               $dumper->generateDump( $idList );
+               $dumper->generateDump( $pager );
                $json = ob_get_clean();
 
                // check that the resulting json contains all the ids we asked 
for.
@@ -191,7 +243,7 @@
         */
        public function testSharding( array $ids, $shardingFactor ) {
                $dumper = $this->newDumpGenerator( $ids );
-               $idList = new ArrayObject( $ids );
+               $pager = $this->makeIdPager( $ids );
 
                $actualIds = array();
 
@@ -202,7 +254,7 @@
 
                        // Generate the dump and grab the output
                        ob_start();
-                       $dumper->generateDump( $idList );
+                       $dumper->generateDump( $pager );
                        $json = ob_get_clean();
 
                        // check that the resulting json contains all the ids 
we asked for.
@@ -318,7 +370,7 @@
                }
 
                $dumper = $this->newDumpGenerator( $ids, $missingIds );
-               $idList = new ArrayObject( $ids );
+               $pager = $this->makeIdPager( $ids );
 
                $exceptionHandler = $this->getMock( 'ExceptionHandler' );
                $exceptionHandler->expects( $this->exactly( count( $missingIds 
) ) )
@@ -327,7 +379,7 @@
                $dumper->setExceptionHandler( $exceptionHandler );
 
                ob_start();
-               $dumper->generateDump( $idList );
+               $dumper->generateDump( $pager );
                $json = ob_get_clean();
 
                // make sure we get valid json even if there were exceptions.
@@ -344,17 +396,17 @@
                }
 
                $dumper = $this->newDumpGenerator( $ids );
-               $idList = new ArrayObject( $ids );
+               $pager = $this->makeIdPager( $ids );
 
                $progressReporter = $this->getMock( 'MessageReporter' );
                $progressReporter->expects( $this->exactly( ( count( $ids ) / 
10 ) +1 ) )
                        ->method( 'reportMessage' );
 
-               $dumper->setProgressInterval( 10 );
+               $dumper->setBatchSize( 10 );
                $dumper->setProgressReporter( $progressReporter );
 
                ob_start();
-               $dumper->generateDump( $idList );
+               $dumper->generateDump( $pager );
                ob_end_clean();
        }
 }
diff --git a/repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php 
b/repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php
index 1e90552..786d6d7 100644
--- a/repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php
+++ b/repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php
@@ -76,7 +76,7 @@
                $this->markTestIncomplete( "test me!" );
        }
 
-       public function testGetEntitiesWithoutTerm( /* $termType, $language = 
null, $entityType = null, $limit = 50, $offset = 0 */ ) {
+       public function testListEntitiesWithoutTerm( /* $termType, $language = 
null, $entityType = null, $limit = 50, $offset = 0 */ ) {
                $this->markTestIncomplete( "test me!" );
        }
 
@@ -84,39 +84,94 @@
                $this->markTestIncomplete( "test me!" );
        }
 
-       /**
-        * @dataProvider getEntitiesProvider
-        */
-       public function testGetEntities( $entities, $type, $expected ) {
-               $table = $this->newEntityPerPageTable( $entities );
+       protected function getIdStrings( array $entities ) {
+               $ids = array_map( function ( $entity ) {
+                       if ( $entity instanceof Entity ) {
+                               $entity = $entity->getId();
+                       }
+                       return $entity->getSerialization();
+               }, $entities );
 
-               $iterator = $table->getEntities( $type );
-               $actual = iterator_to_array( $iterator );
-
-               $expectedIds = array();
-               foreach( $expected as $entity ) {
-                       $expectedIds[] = $entity->getId();
-               }
-               $this->assertArrayEquals( $expectedIds, $actual );
+               return $ids;
        }
 
-       public static function getEntitiesProvider() {
+       protected function assertEqualIds( array $expected,array $actual, $msg 
= null ) {
+               $expectedIds = $this->getIdStrings( $expected );
+               $actualIds = $this->getIdStrings( $actual );
+
+               $this->assertArrayEquals( $expectedIds, $actualIds, $msg );
+       }
+
+       /**
+        * @dataProvider listEntitiesProvider
+        */
+       public function testListEntities( array $entities, $type, $limit, array 
$expected ) {
+               $table = $this->newEntityPerPageTable( $entities );
+
+               $actual = $table->listEntities( $type, $limit );
+
+               $this->assertEqualIds( $expected, $actual );
+       }
+
+       public static function listEntitiesProvider() {
                $property = Property::newEmpty();
                $item = Item::newEmpty();
 
                return array(
                        'empty' => array(
-                               array(), null, array()
+                               array(), null, 100, array()
                        ),
                        'some entities' => array(
-                               array( $property, $item ), null, array( 
$property, $item )
+                               array( $item, $property ), null, 100, array( 
$property, $item )
                        ),
                        'just properties' => array(
-                               array( $property, $item ), 
Property::ENTITY_TYPE, array( $property )
+                               array( $item, $property ), 
Property::ENTITY_TYPE, 100, array( $property )
                        ),
                        'no matches' => array(
-                               array( $property ), Item::ENTITY_TYPE, array()
+                               array( $property ), Item::ENTITY_TYPE, 100, 
array()
                        ),
                );
        }
+       /**
+        * @dataProvider listEntitiesProvider_paging
+        */
+       public function testListEntities_paging( array $entities, $type, 
$limit, array $expectedChunks ) {
+               $table = $this->newEntityPerPageTable( $entities );
+
+               foreach ( $expectedChunks as $expected ) {
+                       $actual = $table->listEntities( $type, $limit, $offset 
);
+
+                       $this->assertEqualIds( $expected, $actual );
+               }
+       }
+
+       public static function listEntitiesProvider_paging() {
+               $property = Property::newEmpty();
+               $item = Item::newEmpty();
+               $item2 = Item::newEmpty();
+
+               return array(
+                       'limit' => array(
+                               // note: "item" sorted before "property".
+                               array( $item, $item2, $property ),
+                               null,
+                               2,
+                               array (
+                                       array( $item, $item2 ),
+                                       array( $property ),
+                                       array(),
+                               )
+                       ),
+                       'limit and filter' => array(
+                               array( $item, $item2, $property ),
+                               Item::ENTITY_TYPE,
+                               1,
+                               array(
+                                       array( $item ),
+                                       array( $item2 ),
+                                       array(),
+                               )
+                       )
+               );
+       }
 }

-- 
To view, visit https://gerrit.wikimedia.org/r/118277
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I842b1ef3dcc07ff437112b86aab5a5a0e64198d5
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <daniel.kinz...@wikimedia.de>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to