Daniel Kinzler has uploaded a new change for review. https://gerrit.wikimedia.org/r/118277
Change subject: (bug 62381) Chunked ID query for JSON dumper. ...................................................................... (bug 62381) Chunked ID query for JSON dumper. When listing entities, don't list all IDs in one huge result set. Change-Id: I842b1ef3dcc07ff437112b86aab5a5a0e64198d5 --- M lib/includes/IO/EntityIdReader.php M lib/includes/IO/LineReader.php M lib/tests/phpunit/IO/EntrityIdReaderTest.php M repo/includes/Dumpers/JsonDumpGenerator.php A repo/includes/store/EntityIdPager.php M repo/includes/store/EntityPerPage.php D repo/includes/store/sql/ConvertingResultWrapper.php D repo/includes/store/sql/DatabaseRowEntityIdIterator.php M repo/includes/store/sql/EntityPerPageTable.php M repo/maintenance/dumpJson.php M repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php M repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php 12 files changed, 454 insertions(+), 312 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase refs/changes/77/118277/1 diff --git a/lib/includes/IO/EntityIdReader.php b/lib/includes/IO/EntityIdReader.php index 1fab66d..82f9d62 100644 --- a/lib/includes/IO/EntityIdReader.php +++ b/lib/includes/IO/EntityIdReader.php @@ -4,10 +4,10 @@ use Disposable; use ExceptionHandler; -use Iterator; -use Wikibase\DataModel\Entity\BasicEntityIdParser; use Wikibase\DataModel\Entity\EntityId; +use Wikibase\DataModel\Entity\EntityIdParser; use Wikibase\DataModel\Entity\EntityIdParsingException; +use Wikibase\EntityIdPager; /** * EntityIdReader reads entity IDs from a file, one per line. @@ -15,7 +15,7 @@ * @license GPL 2+ * @author Daniel Kinzler */ -class EntityIdReader implements Iterator, Disposable { +class EntityIdReader implements EntityIdPager, Disposable { /** * @var LineReader @@ -33,15 +33,12 @@ protected $current = null; /** - * @param resource $fileHandle The file to read from. - * @param bool $canClose Whether calling dispose() should close the fine handle. - * @param bool $autoDispose Whether to automatically call dispose() when reaching EOF. - * - * @throws \InvalidArgumentException + * @param LineReader $reader + * @param \Wikibase\DataModel\Entity\EntityIdParser $parser */ - public function __construct( $fileHandle, $canClose = true, $autoDispose = false ) { - $this->reader = new LineReader( $fileHandle, $canClose, $autoDispose ); - $this->parser = new BasicEntityIdParser(); //FIXME: inject! + public function __construct( LineReader $reader, EntityIdParser $parser ) { + $this->reader = $reader; + $this->parser = $parser; $this->exceptionHandler = new \RethrowingExceptionHandler(); } @@ -85,13 +82,13 @@ } /** - * Returns the current ID, or, if that has been consumed, finds the next ID on - * the input stream and return it. + * Returns the next ID (or null if there are no more ids). * * @return EntityId|null */ - protected function fill() { - while ( $this->current === null && $this->reader->valid() ) { + protected function next() { + $id = null; + while ( $id === null && $this->reader->valid() ) { $line = trim( $this->reader->current() ); $this->reader->next(); @@ -99,55 +96,55 @@ continue; } - $this->current = $this->lineToId( $line ); + $id = $this->lineToId( $line ); }; - return $this->current; - } - - /** - * Returns the current ID. - * - * @link http://php.net/manual/en/iterator.current.php - * @return EntityId - */ - public function current() { - $id = $this->fill(); return $id; } /** - * Advance to next ID. Blank lines are skipped. + * Lists the IDs of entities of the given type. * - * @see LineReader::next() + * This supports paging via the $offset parameter: to get the next batch of IDs, + * call listEntities() again with the $offset provided by the previous call + * to listEntities(). + * + * @since 0.5 + * + * @param string|null $entityType The type of entity to return, or null for any type. + * @param int $limit The maximum number of IDs to return. + * @param mixed &$offset A position marker representing the position to start listing from; + * Will be updated to represent the position for the next batch of IDs. + * Callers should make no assumptions about the type or content of $offset. + * Use null (the default) to start with the first ID. + * + * @return \Wikibase\EntityId[] A list of EntityIds matching the given parameters. Will + * be empty if there are no more entities to list from the given offset. */ - public function next() { - $this->current = null; // consume current - $this->fill(); - } + public function listEntities( $entityType, $limit, &$offset = null ) { + if ( $offset === null ) { + $offset = $this->reader->getPosition(); + } - /** - * @see LineReader::key() - * @return int - */ - public function key() { - return $this->reader->key(); - } + $this->reader->setPosition( $offset ); - /** - * @see LineReader::valid() - * @return boolean - */ - public function valid() { - return $this->fill() !== null; - } + $ids = array(); + while ( $limit > 0 ) { + $id = $this->next(); - /** - * @see LineReader::rewind() - */ - public function rewind() { - $this->current = null; - $this->reader->rewind(); - } + if ( $id === null ) { + break; + } + if ( $entityType !== null && $id->getEntityType() !== $entityType ) { + continue; + } + + $ids[] = $id; + $limit--; + } + + $offset = $this->reader->getPosition(); + return $ids; + } } \ No newline at end of file diff --git a/lib/includes/IO/LineReader.php b/lib/includes/IO/LineReader.php index ed2e054..e5c5db1 100644 --- a/lib/includes/IO/LineReader.php +++ b/lib/includes/IO/LineReader.php @@ -145,11 +145,35 @@ * @return void Any returned value is ignored. */ public function rewind() { - if ( $this->fileHandle ) { - fseek( $this->fileHandle, 0 ); - $this->current = null; + $this->setPosition( 0 ); + } + /** + * Set the underlying file pointer to the given position (using fseek). + * For use with getPosition(). + * + * @param int $offset + */ + public function setPosition( $offset ) { + if ( $this->fileHandle ) { + fseek( $this->fileHandle, $offset ); + + $this->current = null; $this->next(); } } + + /** + * Get the position of the underlying file pointer (using ftell). + * For use with setPosition(). + * + * @return int|bool The position of the file pointer, or false upon error. + */ + public function getPosition() { + if ( $this->fileHandle ) { + return ftell( $this->fileHandle ); + } else { + return false; + } + } } diff --git a/lib/tests/phpunit/IO/EntrityIdReaderTest.php b/lib/tests/phpunit/IO/EntrityIdReaderTest.php index d5f4c7e..6491f2c 100644 --- a/lib/tests/phpunit/IO/EntrityIdReaderTest.php +++ b/lib/tests/phpunit/IO/EntrityIdReaderTest.php @@ -2,9 +2,14 @@ namespace Wikibase\Test\IO; +use Wikibase\DataModel\Entity\BasicEntityIdParser; +use Wikibase\DataModel\Entity\EntityId; +use Wikibase\DataModel\Entity\Item; use Wikibase\DataModel\Entity\ItemId; +use Wikibase\DataModel\Entity\Property; use Wikibase\DataModel\Entity\PropertyId; use Wikibase\IO\EntityIdReader; +use Wikibase\IO\LineReader; /** * @covers Wikibase\IO\EntityIdReader @@ -25,25 +30,103 @@ protected function openIdReader( $file ) { $path = __DIR__ . '/' . $file; $handle = fopen( $path, 'r' ); - return new EntityIdReader( $handle ); + return new EntityIdReader( new LineReader( $handle ), new BasicEntityIdParser() ); } - public function testIteration() { - $expected = array( - new ItemId( 'Q1' ), - new PropertyId( 'P2' ), - new ItemId( 'Q3' ), - new PropertyId( 'P4' ), + + protected function getIdStrings( array $entityIds ) { + $ids = array_map( function ( EntityId $entityId ) { + return $entityId->getSerialization(); + }, $entityIds ); + + return $ids; + } + + protected function assertEqualIds( array $expected,array $actual, $msg = null ) { + $expectedIds = array_values( $this->getIdStrings( $expected ) ); + $actualIds = array_values( $this->getIdStrings( $actual ) ); + + sort( $expectedIds ); + sort( $actualIds ); + $this->assertEquals( $expectedIds, $actualIds, $msg ); + } + + /** + * @dataProvider listEntitiesProvider + */ + public function testListEntities( $file, $type, $limit, array $expected ) { + $reader = $this->openIdReader( $file ); + + $actual = $reader->listEntities( $type, $limit ); + + $this->assertEqualIds( $expected, $actual ); + } + + public static function listEntitiesProvider() { + $q1 = new ItemId( 'Q1' ); + $p2 = new PropertyId( 'P2' ); + $q3 = new ItemId( 'Q3' ); + $p4 = new PropertyId( 'P4' ); + + return array( + 'all' => array( + 'EntityIdReaderTest.txt', null, 100, array( $q1, $p2, $q3, $p4 ) + ), + 'just properties' => array( + 'EntityIdReaderTest.txt', Property::ENTITY_TYPE, 100, array( $p2, $p4 ) + ), + 'limit' => array( + 'EntityIdReaderTest.txt', null, 2, array( $q1, $p2 ) + ), + 'limit and filter' => array( + 'EntityIdReaderTest.txt', Item::ENTITY_TYPE, 1, array( $q1 ) + ), ); + } + /** + * @dataProvider listEntitiesProvider_paging + */ + public function testListEntities_paging( $file, $type, $limit, array $expectedChunks ) { + $reader = $this->openIdReader( $file ); - $reader = $this->openIdReader( 'EntityIdReaderTest.txt' ); - $actual = iterator_to_array( $reader ); - $reader->dispose(); + foreach ( $expectedChunks as $expected ) { + $actual = $reader->listEntities( $type, $limit, $offset ); - $this->assertEmpty( array_diff( $expected, $actual ), "Different IDs" ); + $this->assertEqualIds( $expected, $actual ); + } } - public function testIterationWithErrors() { + public static function listEntitiesProvider_paging() { + $q1 = new ItemId( 'Q1' ); + $p2 = new PropertyId( 'P2' ); + $q3 = new ItemId( 'Q3' ); + $p4 = new PropertyId( 'P4' ); + + return array( + 'limit' => array( + 'EntityIdReaderTest.txt', + null, + 2, + array ( + array( $q1, $p2 ), + array( $q3, $p4 ), + array(), + ) + ), + 'limit and filter' => array( + 'EntityIdReaderTest.txt', + Item::ENTITY_TYPE, + 1, + array( + array( $q1 ), + array( $q3 ), + array(), + ) + ) + ); + } + + public function testErrorHandler() { $expected = array( new ItemId( 'Q23' ), new PropertyId( 'P42' ), @@ -56,10 +139,10 @@ $reader = $this->openIdReader( 'EntityIdReaderTest.bad.txt' ); $reader->setExceptionHandler( $exceptionHandler ); - $actual = iterator_to_array( $reader ); + $actual = $reader->listEntities( null, 100 ); $reader->dispose(); - $this->assertEmpty( array_diff( $expected, $actual ), "Different IDs" ); + $this->assertEqualIds( $expected, $actual ); } } diff --git a/repo/includes/Dumpers/JsonDumpGenerator.php b/repo/includes/Dumpers/JsonDumpGenerator.php index 8c50232..81c615e 100644 --- a/repo/includes/Dumpers/JsonDumpGenerator.php +++ b/repo/includes/Dumpers/JsonDumpGenerator.php @@ -8,8 +8,8 @@ use MWException; use NullMessageReporter; use RethrowingExceptionHandler; -use Traversable; use Wikibase\DataModel\Entity\EntityId; +use Wikibase\EntityIdPager; use Wikibase\EntityLookup; use Wikibase\Lib\Serializers\Serializer; use Wikibase\StorageException; @@ -32,7 +32,7 @@ /** * @var int interval at which to output progress messages. */ - public $progressInterval = 100; + public $batchSize = 100; /** * @var resource File handle for output @@ -95,12 +95,20 @@ } /** - * Sets the interval for progress reporting + * Sets the batch size for processing. The batch size is used as the limit + * when listing IDs via the EntityIdPager::listEntities() method, and + * also controls the interval of progress reports. * - * @param int $progressInterval + * @param int $batchSize + * + * @throws \InvalidArgumentException */ - public function setProgressInterval( $progressInterval ) { - $this->progressInterval = $progressInterval; + public function setBatchSize( $batchSize ) { + if ( !is_int( $batchSize ) || $batchSize < 1 ) { + throw new InvalidArgumentException( '$batchSize must be a positive integer' ); + } + + $this->batchSize = $batchSize; } /** @@ -108,8 +116,8 @@ * * @return int */ - public function getProgressInterval() { - return $this->progressInterval; + public function getBatchSize() { + return $this->batchSize; } /** @@ -182,24 +190,43 @@ /** * Generates a JSON dump, writing to the file handle provided to the constructor. * - * @param Traversable $idStream an Iterator that returns EntityId instances + * @param EntityIdPager $idPager an Iterator that returns EntityId instances */ - public function generateDump( Traversable $idStream ) { + public function generateDump( EntityIdPager $idPager ) { $json = "[\n"; //TODO: make optional $this->writeToDump( $json ); - $i = 0; - $wantComma = false; + $dumpCount = 0; + while ( true ) { + $ids = $idPager->listEntities( $this->entityType, $this->batchSize, $offset ); + + if ( empty( $ids ) ) { + break; + } + + $this->dumpEntities( $ids, $dumpCount ); + + $this->progressReporter->reportMessage( 'Processed ' . $dumpCount . ' entities.' ); + }; + + $json = "\n]\n"; //TODO: make optional + $this->writeToDump( $json ); + } + + /** + * @param array $ids + * @param int &$dumpCount The number of entities already dumped (will be updated). + */ + protected function dumpEntities( array $ids, &$dumpCount ) { /* @var EntityId $id */ - foreach ( $idStream as $id ) { + foreach ( $ids as $id ) { if ( !$this->idMatchesFilters( $id ) ) { continue; } try { - $i++; $entity = $this->entityLookup->getEntity( $id ); if ( !$entity ) { @@ -209,26 +236,16 @@ $data = $this->entitySerializer->getSerialized( $entity ); $json = $this->encode( $data ); - if ( $wantComma ) { + if ( $dumpCount > 0 ) { $this->writeToDump( ",\n" ); - $wantComma = false; } $this->writeToDump( $json ); - $wantComma = true; + $dumpCount++; } catch ( StorageException $ex ) { $this->exceptionHandler->handleException( $ex, 'failed-to-dump', 'Failed to dump '. $id ); } - - if ( $this->progressInterval > 0 && ( $i % $this->progressInterval ) === 0 ) { - $this->progressReporter->reportMessage( 'Processed ' . $i . ' entities.' ); - } } - - $this->progressReporter->reportMessage( 'Processed ' . $i . ' entities.' ); - - $json = "\n]\n"; //TODO: make optional - $this->writeToDump( $json ); } private function idMatchesFilters( EntityId $id ) { diff --git a/repo/includes/store/EntityIdPager.php b/repo/includes/store/EntityIdPager.php new file mode 100644 index 0000000..b2c8d0f --- /dev/null +++ b/repo/includes/store/EntityIdPager.php @@ -0,0 +1,35 @@ +<?php + +namespace Wikibase; + +/** + * A service for paging through EntityIds. + * + * @since 0.5 + * + * @licence GNU GPL v2+ + * @author Daniel Kinzler + */ +interface EntityIdPager { + + /** + * Lists the IDs of entities of the given type. + * + * This supports paging via the $offset parameter: to get the next batch of IDs, + * call listEntities() again with the $offset provided by the previous call + * to listEntities(). + * + * @since 0.5 + * + * @param string|null $entityType The type of entity to return, or null for any type. + * @param int $limit The maximum number of IDs to return. + * @param mixed &$offset A position marker representing the position to start listing from; + * Will be updated to represent the position for the next batch of IDs. + * Callers should make no assumptions about the type or content of $offset. + * Use null (the default) to start with the first ID. + * + * @return EntityId[] A list of EntityIds matching the given parameters. Will + * be empty if there are no more entities to list from the given offset. + */ + public function listEntities( $entityType, $limit, &$offset = null ); +} diff --git a/repo/includes/store/EntityPerPage.php b/repo/includes/store/EntityPerPage.php index 3d976d2..17b6844 100644 --- a/repo/includes/store/EntityPerPage.php +++ b/repo/includes/store/EntityPerPage.php @@ -13,7 +13,7 @@ * @licence GNU GPL v2+ * @author Thomas Pellissier Tanon */ -interface EntityPerPage { +interface EntityPerPage extends EntityIdPager { /** * Adds a new link between an entity and a page @@ -100,13 +100,4 @@ * @return EntityId[] */ public function getItemsWithoutSitelinks( $siteId = null, $limit = 50, $offset = 0 ); - - /** - * Returns an iterator providing an EntityId object for each entity. - * - * @param string $entityType The type of entity to return, or null for any type. - * - * @return Iterator - */ - public function getEntities( $entityType = null ); } diff --git a/repo/includes/store/sql/ConvertingResultWrapper.php b/repo/includes/store/sql/ConvertingResultWrapper.php deleted file mode 100644 index 7dd8fd2..0000000 --- a/repo/includes/store/sql/ConvertingResultWrapper.php +++ /dev/null @@ -1,85 +0,0 @@ -<?php - -namespace Wikibase; - -use Iterator; -use ResultWrapper; - -/** - * Base class for iterators that convert each row of a database result into an appropriate object. - * - * @since 0.5 - * - * @licence GNU GPL v2+ - * @author Daniel Kinzler - * - * @todo: this should implement Disposable know a LoadBalancer instance, so - * we can recycle the DB connection when done. - */ -abstract class ConvertingResultWrapper implements Iterator { - - /** - * @var \ResultWrapper - */ - protected $rows; - - /** - * @param ResultWrapper $rows - */ - public function __construct( ResultWrapper $rows ) { - $this->rows = $rows; - } - - /** - * @see Iterator::current() - * @see ResultWrapper::current() - */ - public function current() { - $current = $this->rows->current(); - $current = $this->convert( $current ); - return $current; - } - - /** - * @see Iterator::next() - * @see ResultWrapper::next() - */ - public function next() { - $this->rows->next(); - } - - /** - * @see Iterator::key() - * @see ResultWrapper::key() - * @return mixed scalar or null - */ - public function key() { - return $this->rows->key(); - } - - /** - * @see Iterator::valid() - * @see ResultWrapper::valid() - * @return bool - */ - public function valid() { - return $this->rows->valid(); - } - - /** - * @see Iterator::rewind() - * @see ResultWrapper::rewind() - */ - public function rewind() { - $this->rows->rewind(); - } - - /** - * Converts a database row into the desired representation. - * - * @param object $row An object representing the raw database row, as returned by ResultWrapper::current(). - * - * @return mixed - */ - protected abstract function convert( $row ); -} diff --git a/repo/includes/store/sql/DatabaseRowEntityIdIterator.php b/repo/includes/store/sql/DatabaseRowEntityIdIterator.php deleted file mode 100644 index 49ae2da..0000000 --- a/repo/includes/store/sql/DatabaseRowEntityIdIterator.php +++ /dev/null @@ -1,65 +0,0 @@ -<?php - -namespace Wikibase; - -use ResultWrapper; -use Wikibase\DataModel\Entity\ItemId; -use Wikibase\DataModel\Entity\PropertyId; - -/** - * Allows a database result set containing entity IDs to be iterated as EntityId objects. - * - * @since 0.5 - * - * @licence GNU GPL v2+ - * @author Daniel Kinzler - */ -class DatabaseRowEntityIdIterator extends ConvertingResultWrapper { - - /** - * @var string - */ - protected $idField; - - /** - * @var string - */ - protected $typeField; - - public function __construct( ResultWrapper $rows, $typeField, $idField ) { - parent::__construct( $rows ); - - $this->idField = $idField; - $this->typeField = $typeField; - } - - /** - * Converts a database row into the desired representation. - * - * @param object $row An object representing the raw database row, as returned by ResultWrapper::current(). - * - * @throws \RuntimeException - * @return EntityId - */ - protected function convert( $row ) { - $idField = $this->idField; //PHP fails - $typeField = $this->typeField; //PHP fails - - $id = (int)$row->$idField; - $type = $row->$typeField; - - //TODO: use an EntityIdFactory here - switch ( $type ) { - case 'item': - $entityId = new ItemId( "Q$id" ); - break; - case 'property': - $entityId = new PropertyId( "P$id" ); - break; - default: - throw new \RuntimeException( "Unknown entity type: $type" ); - } - - return $entityId; - } -} diff --git a/repo/includes/store/sql/EntityPerPageTable.php b/repo/includes/store/sql/EntityPerPageTable.php index 19e260b..6b3e37e 100644 --- a/repo/includes/store/sql/EntityPerPageTable.php +++ b/repo/includes/store/sql/EntityPerPageTable.php @@ -2,6 +2,7 @@ namespace Wikibase; +use InvalidArgumentException; use Iterator; use Wikibase\DataModel\Entity\BasicEntityIdParser; use Wikibase\DataModel\Entity\ItemId; @@ -11,6 +12,9 @@ * Corresponds to the wb_entities_per_page table. * * @since 0.2 + * + * @todo: make this extend DBAccessBase, so it can be used to access the repo's EPP table + * from a client! * * @licence GNU GPL v2+ * @author Thomas Pellissier Tanon @@ -220,32 +224,64 @@ } /** - * Returns an iterator providing an EntityId object for each entity. + * @see EntityIdPager::listEntities * - * @see EntityPerPage::getEntities * @param null|string $entityType + * @param int $limit + * @param mixed &$offset * - * @return Iterator + * @throws \InvalidArgumentException + * @return EntityId[] */ - public function getEntities( $entityType = null ) { - //XXX: Would be nice to get the DBR from a load balancer and allow access to foreign wikis. - // But since we return a ResultWrapper, we don't know when we can release the connection for re-use. - - if ( $entityType !== null ) { - $where = array( 'epp_entity_type' => $entityType ); - } else { + public function listEntities( $entityType, $limit, &$offset = null ) { + if ( $entityType == null ) { $where = array(); + } elseif ( !is_string( $entityType ) ) { + throw new InvalidArgumentException( '$entityType must be a string (or null)' ); + } else { + $where = array( 'epp_entity_type' => $entityType ); + } + + if ( !is_int( $limit ) || $limit < 1 ) { + throw new InvalidArgumentException( '$limit must be a positive integer' ); } $dbr = wfGetDB( DB_SLAVE ); + + if ( $offset ) { + if ( !$offset instanceof EntityId ) { + throw new InvalidArgumentException( 'Bad $offset, use an offset supplied be an earlier call of listEntities() (or use null).' ); + } + + /* @var EntityId $offset */ + if ( $entityType === null ) { + // Ugly. About time we switch to qualified, string based IDs! + $where[] = '( ( epp_entity_type = ' . $dbr->addQuotes( $offset->getEntityType() ) . + 'AND epp_entity_id > ' . $offset->getNumericId() . ' ) ' . + ' OR epp_entity_type > ' . $dbr->addQuotes( $offset->getEntityType() ) . ' )'; + } else { + $where[] = 'epp_entity_id > ' . $offset->getNumericId(); + } + } + $rows = $dbr->select( 'wb_entity_per_page', - array( 'epp_entity_id', 'epp_entity_type' ), + array( 'entity_type' => 'epp_entity_type', 'entity_id' => 'epp_entity_id' ), $where, - __METHOD__ + __METHOD__, + array( + 'ORDER BY' => array( 'epp_entity_type', 'epp_entity_id' ), + 'LIMIT' => $limit + ) ); - return new DatabaseRowEntityIdIterator( $rows, 'epp_entity_type', 'epp_entity_id' ); + $ids = self::getEntityIdsFromRows( $rows ); + + if ( !empty( $ids ) ) { + $offset = $ids[ count( $ids ) - 1 ]; + } + + return $ids; } } diff --git a/repo/maintenance/dumpJson.php b/repo/maintenance/dumpJson.php index 68c9ed7..d2dee7d 100644 --- a/repo/maintenance/dumpJson.php +++ b/repo/maintenance/dumpJson.php @@ -4,12 +4,12 @@ use Disposable; use ExceptionHandler; -use Iterator; use Maintenance; use Traversable; -use ValueFormatters\FormatterOptions; +use Wikibase\DataModel\Entity\BasicEntityIdParser; use Wikibase\Dumpers\JsonDumpGenerator; use Wikibase\IO\EntityIdReader; +use Wikibase\IO\LineReader; use Wikibase\Lib\Serializers\DispatchingEntitySerializer; use Wikibase\Lib\Serializers\SerializationOptions; use Wikibase\Lib\Serializers\Serializer; @@ -58,7 +58,8 @@ $this->addOption( 'list-file', "A file containing one entity ID per line.", false, true ); $this->addOption( 'entity-type', "Only dump this kind of entity, e.g. `item` or `property`.", false, true ); $this->addOption( 'sharding-factor', "The number of shards (must be >= 1)", false, true ); - $this->addOption( 'shard', "The shard to output (must be less than the sharding-factor) ", false, true ); + $this->addOption( 'shard', "The shard to output (must be less than the sharding-factor)", false, true ); + $this->addOption( 'batch-size', "The number of entities per processing batch", false, true ); $this->addOption( 'output', "Output file (default is stdout). Will be overwritten.", false, true ); $this->addOption( 'log', "Log file (default is stderr). Will be appended.", false, true ); $this->addOption( 'quiet', "Disable progress reporting", false, false ); @@ -145,6 +146,7 @@ $entityType = $this->getOption( 'entity-type' ); $shardingFactor = (int)$this->getOption( 'sharding-factor', 1 ); $shard = (int)$this->getOption( 'shard', 0 ); + $batchSize = (int)$this->getOption( 'batch-size', 100 ); //TODO: Allow injection of an OutputStream for logging $this->openLogFile( $this->getOption( 'log', 'php://stderr' ) ); @@ -186,8 +188,9 @@ // but filtering in the dumper is needed when working from a list file. $dumper->setShardingFilter( $shardingFactor, $shard ); $dumper->setEntityTypeFilter( $entityType ); + $dumper->setBatchSize( $batchSize ); - $idStream = $this->makeIdStream( $entityType, $exceptionReporter ); + $idStream = $this->makeIdStream( $exceptionReporter ); $dumper->generateDump( $idStream ); if ( $idStream instanceof Disposable ) { @@ -199,18 +202,17 @@ } /** - * @param string|null $entityType * @param \ExceptionHandler $exceptionReporter * - * @return Iterator a stream of EntityId objects + * @return EntityIdPager a stream of EntityId objects */ - public function makeIdStream( $entityType = null, ExceptionHandler $exceptionReporter = null ) { + public function makeIdStream( ExceptionHandler $exceptionReporter = null ) { $listFile = $this->getOption( 'list-file' ); if ( $listFile !== null ) { $stream = $this->makeIdFileStream( $listFile, $exceptionReporter ); } else { - $stream = $this->entityPerPage->getEntities( $entityType ); + $stream = $this->entityPerPage; } return $stream; @@ -230,7 +232,7 @@ throw new \MWException( "Failed to open ID file: $input" ); } - $stream = new EntityIdReader( $input ); + $stream = new EntityIdReader( new LineReader( $input ), new BasicEntityIdParser() ); $stream->setExceptionHandler( $exceptionReporter ); return $stream; diff --git a/repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php b/repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php index c09d84a..dd8899c 100644 --- a/repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php +++ b/repo/tests/phpunit/Dumpers/JsonDumpGeneratorTest.php @@ -2,19 +2,19 @@ namespace Wikibase\Test\Dumpers; -use ArrayObject; +use Wikibase\DataModel\Entity\Entity; use Wikibase\DataModel\Entity\EntityId; use Wikibase\DataModel\Entity\ItemId; +use Wikibase\DataModel\Entity\Property; use Wikibase\DataModel\Entity\PropertyId; -use Wikibase\DataModel\SimpleSiteLink; +use Wikibase\DataModel\SiteLink; use Wikibase\Dumpers\JsonDumpGenerator; -use Wikibase\Entity; use Wikibase\EntityFactory; +use Wikibase\EntityIdPager; use Wikibase\Item; use Wikibase\Lib\Serializers\DispatchingEntitySerializer; use Wikibase\Lib\Serializers\SerializationOptions; use Wikibase\Lib\Serializers\SerializerFactory; -use Wikibase\Property; /** * @covers Wikibase\Dumpers\JsonDumpGenerator @@ -79,7 +79,7 @@ } if ( $entity instanceof Item ) { - $entity->addSiteLink( new SimpleSiteLink( 'test', 'Foo' ) ); + $entity->addSiteLink( new SiteLink( 'test', 'Foo' ) ); } return $entity; @@ -115,6 +115,58 @@ } /** + * Callback for providing dummy entity lists for the EntityIdPager mock. + * + * @param array $ids + * @param $entityType + * @param $limit + * @param null $offset + * + * @return array + */ + protected function listEntities ( array $ids, $entityType, $limit, &$offset = null ) { + $result = array(); + $size = count( $ids ); + + if ( $offset === null ) { + $offset = 0; + } + + for ( ; $offset < $size && count( $result ) < $limit; $offset++ ) { + /* @var EntityId $id */ + $id = $ids[ $offset ]; + + if ( $entityType !== null && $entityType !== $id->getEntityType() ) { + continue; + } + + $result[] = $id; + } + + return $result; + } + + /** + * @param array $ids + * + * @return EntityIdPager + */ + protected function makeIdPager( array $ids ) { + $pager = $this->getMock( 'Wikibase\EntityIdPager' ); + + $this_ = $this; + $pager->expects( $this->any() ) + ->method( 'listEntities' ) + ->will( $this->returnCallback( + function ( $entityType, $limit, &$offset = null ) use ( $ids, $this_ ) { + return $this_->listEntities( $ids, $entityType, $limit, $offset ); + } + ) ); + + return $pager; + } + + /** * @dataProvider idProvider */ public function testGenerateDump( array $ids ) { @@ -136,12 +188,12 @@ */ public function testTypeFilterDump( array $ids, $type, $expectedIds ) { $dumper = $this->newDumpGenerator( $ids ); - $idList = new ArrayObject( $ids ); + $pager = $this->makeIdPager( $ids ); $dumper->setEntityTypeFilter( $type ); ob_start(); - $dumper->generateDump( $idList ); + $dumper->generateDump( $pager ); $json = ob_get_clean(); // check that the resulting json contains all the ids we asked for. @@ -191,7 +243,7 @@ */ public function testSharding( array $ids, $shardingFactor ) { $dumper = $this->newDumpGenerator( $ids ); - $idList = new ArrayObject( $ids ); + $pager = $this->makeIdPager( $ids ); $actualIds = array(); @@ -202,7 +254,7 @@ // Generate the dump and grab the output ob_start(); - $dumper->generateDump( $idList ); + $dumper->generateDump( $pager ); $json = ob_get_clean(); // check that the resulting json contains all the ids we asked for. @@ -318,7 +370,7 @@ } $dumper = $this->newDumpGenerator( $ids, $missingIds ); - $idList = new ArrayObject( $ids ); + $pager = $this->makeIdPager( $ids ); $exceptionHandler = $this->getMock( 'ExceptionHandler' ); $exceptionHandler->expects( $this->exactly( count( $missingIds ) ) ) @@ -327,7 +379,7 @@ $dumper->setExceptionHandler( $exceptionHandler ); ob_start(); - $dumper->generateDump( $idList ); + $dumper->generateDump( $pager ); $json = ob_get_clean(); // make sure we get valid json even if there were exceptions. @@ -344,17 +396,17 @@ } $dumper = $this->newDumpGenerator( $ids ); - $idList = new ArrayObject( $ids ); + $pager = $this->makeIdPager( $ids ); $progressReporter = $this->getMock( 'MessageReporter' ); $progressReporter->expects( $this->exactly( ( count( $ids ) / 10 ) +1 ) ) ->method( 'reportMessage' ); - $dumper->setProgressInterval( 10 ); + $dumper->setBatchSize( 10 ); $dumper->setProgressReporter( $progressReporter ); ob_start(); - $dumper->generateDump( $idList ); + $dumper->generateDump( $pager ); ob_end_clean(); } } diff --git a/repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php b/repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php index 1e90552..786d6d7 100644 --- a/repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php +++ b/repo/tests/phpunit/includes/store/sql/EntityPerPageTableTest.php @@ -76,7 +76,7 @@ $this->markTestIncomplete( "test me!" ); } - public function testGetEntitiesWithoutTerm( /* $termType, $language = null, $entityType = null, $limit = 50, $offset = 0 */ ) { + public function testListEntitiesWithoutTerm( /* $termType, $language = null, $entityType = null, $limit = 50, $offset = 0 */ ) { $this->markTestIncomplete( "test me!" ); } @@ -84,39 +84,94 @@ $this->markTestIncomplete( "test me!" ); } - /** - * @dataProvider getEntitiesProvider - */ - public function testGetEntities( $entities, $type, $expected ) { - $table = $this->newEntityPerPageTable( $entities ); + protected function getIdStrings( array $entities ) { + $ids = array_map( function ( $entity ) { + if ( $entity instanceof Entity ) { + $entity = $entity->getId(); + } + return $entity->getSerialization(); + }, $entities ); - $iterator = $table->getEntities( $type ); - $actual = iterator_to_array( $iterator ); - - $expectedIds = array(); - foreach( $expected as $entity ) { - $expectedIds[] = $entity->getId(); - } - $this->assertArrayEquals( $expectedIds, $actual ); + return $ids; } - public static function getEntitiesProvider() { + protected function assertEqualIds( array $expected,array $actual, $msg = null ) { + $expectedIds = $this->getIdStrings( $expected ); + $actualIds = $this->getIdStrings( $actual ); + + $this->assertArrayEquals( $expectedIds, $actualIds, $msg ); + } + + /** + * @dataProvider listEntitiesProvider + */ + public function testListEntities( array $entities, $type, $limit, array $expected ) { + $table = $this->newEntityPerPageTable( $entities ); + + $actual = $table->listEntities( $type, $limit ); + + $this->assertEqualIds( $expected, $actual ); + } + + public static function listEntitiesProvider() { $property = Property::newEmpty(); $item = Item::newEmpty(); return array( 'empty' => array( - array(), null, array() + array(), null, 100, array() ), 'some entities' => array( - array( $property, $item ), null, array( $property, $item ) + array( $item, $property ), null, 100, array( $property, $item ) ), 'just properties' => array( - array( $property, $item ), Property::ENTITY_TYPE, array( $property ) + array( $item, $property ), Property::ENTITY_TYPE, 100, array( $property ) ), 'no matches' => array( - array( $property ), Item::ENTITY_TYPE, array() + array( $property ), Item::ENTITY_TYPE, 100, array() ), ); } + /** + * @dataProvider listEntitiesProvider_paging + */ + public function testListEntities_paging( array $entities, $type, $limit, array $expectedChunks ) { + $table = $this->newEntityPerPageTable( $entities ); + + foreach ( $expectedChunks as $expected ) { + $actual = $table->listEntities( $type, $limit, $offset ); + + $this->assertEqualIds( $expected, $actual ); + } + } + + public static function listEntitiesProvider_paging() { + $property = Property::newEmpty(); + $item = Item::newEmpty(); + $item2 = Item::newEmpty(); + + return array( + 'limit' => array( + // note: "item" sorted before "property". + array( $item, $item2, $property ), + null, + 2, + array ( + array( $item, $item2 ), + array( $property ), + array(), + ) + ), + 'limit and filter' => array( + array( $item, $item2, $property ), + Item::ENTITY_TYPE, + 1, + array( + array( $item ), + array( $item2 ), + array(), + ) + ) + ); + } } -- To view, visit https://gerrit.wikimedia.org/r/118277 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: I842b1ef3dcc07ff437112b86aab5a5a0e64198d5 Gerrit-PatchSet: 1 Gerrit-Project: mediawiki/extensions/Wikibase Gerrit-Branch: master Gerrit-Owner: Daniel Kinzler <daniel.kinz...@wikimedia.de> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits