Daniel Kinzler has uploaded a new change for review.
https://gerrit.wikimedia.org/r/204486
Change subject: Introduce DedupeBag.
......................................................................
Introduce DedupeBag.
This introduces a dedicated interface for the dedupe logic
previously implemented using a BagOStuff made accessible via
a callback.
Change-Id: Ie6d4c1527ef00200bc278eac604915bebf285778
---
M repo/includes/rdf/ComplexValueRdfBuilder.php
A repo/includes/rdf/DedupeBag.php
M repo/includes/rdf/FullStatementRdfBuilder.php
A repo/includes/rdf/HashDedupeBag.php
A repo/includes/rdf/NullDedupeBag.php
M repo/includes/rdf/RdfBuilder.php
M repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
M repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
M repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
9 files changed, 189 insertions(+), 88 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/mediawiki/extensions/Wikibase
refs/changes/86/204486/1
diff --git a/repo/includes/rdf/ComplexValueRdfBuilder.php
b/repo/includes/rdf/ComplexValueRdfBuilder.php
index 2da3d87..a872e3e 100644
--- a/repo/includes/rdf/ComplexValueRdfBuilder.php
+++ b/repo/includes/rdf/ComplexValueRdfBuilder.php
@@ -19,15 +19,14 @@
class ComplexValueRdfBuilder extends SimpleValueRdfBuilder {
/**
- * @var callable
+ * @var DedupeBag
*/
- private $valueSeenCallback = null;
+ private $dedupeBag;
/**
* @var RdfWriter
*/
private $valueWriter;
-
/**
* @param RdfVocabulary $vocabulary
@@ -36,21 +35,23 @@
*/
public function __construct( RdfVocabulary $vocabulary, RdfWriter
$valueWriter, PropertyDataTypeLookup $propertyLookup ) {
parent::__construct( $vocabulary, $propertyLookup );
+
+ $this->dedupeBag = new NullDedupeBag();
$this->valueWriter = $valueWriter;
}
/**
- * @return callable
+ * @return DedupeBag
*/
- public function getValueSeenCallback() {
- return $this->valueSeenCallback;
+ public function getDedupeBag() {
+ return $this->dedupeBag;
}
/**
- * @param callable $valueSeenCallback
+ * @param DedupeBag $dedupeBag
*/
- public function setValueSeenCallback( $valueSeenCallback ) {
- $this->valueSeenCallback = $valueSeenCallback;
+ public function setDedupeBag( DedupeBag $dedupeBag ) {
+ $this->dedupeBag = $dedupeBag;
}
/**
@@ -119,10 +120,8 @@
private function addExpandedValue( DataValue $value, $prefix, array
$props ) {
$valueLName = $value->getHash();
- if ( $this->valueSeenCallback ) {
- if ( call_user_func( $this->valueSeenCallback,
$valueLName ) !== false ) {
+ if ( $this->dedupeBag->alreadySeen( $valueLName, 'V' ) !==
false ) {
return $valueLName;
- }
}
$this->valueWriter->about( RdfVocabulary::NS_VALUE, $valueLName
)->a( RdfVocabulary::NS_ONTOLOGY, 'Value' );
diff --git a/repo/includes/rdf/DedupeBag.php b/repo/includes/rdf/DedupeBag.php
new file mode 100644
index 0000000..5d3fe57
--- /dev/null
+++ b/repo/includes/rdf/DedupeBag.php
@@ -0,0 +1,32 @@
+<?php
+
+namespace Wikibase;
+
+/**
+ * Interface for a facility that avoids duplicates based on value hashes.
+ *
+ * @since 0.5
+ * @internal
+ *
+ * @licence GNU GPL v2+
+ * @author Daniel Kinzler
+ */
+interface DedupeBag {
+
+ /**
+ * Did we already see this value? If yes, we may need to skip it
+ *
+ * @note False negatives are acceptable, while false positives are not.
+ * This means that implementations are free to return false if it is not
+ * sure whether the hash was seen before, but should never return true
+ * if it is not certain that the hash was need before.
+ *
+ * @param string $hash Hash to check
+ * @param string $namespace Optional namespace to allow a
compartmentalized bag,
+ * tracking hashes from multiple value sets.
+ *
+ * @return bool
+ */
+ public function alreadySeen( $hash, $namespace = '' );
+
+}
diff --git a/repo/includes/rdf/FullStatementRdfBuilder.php
b/repo/includes/rdf/FullStatementRdfBuilder.php
index 9e91760..580ebe1 100644
--- a/repo/includes/rdf/FullStatementRdfBuilder.php
+++ b/repo/includes/rdf/FullStatementRdfBuilder.php
@@ -30,9 +30,9 @@
private $propertyMentionCallback = null;
/**
- * @var callable
+ * @var DedupeBag
*/
- private $referenceSeenCallback = null;
+ private $dedupeBag;
/**
* @var bool
@@ -78,6 +78,8 @@
$this->referenceWriter = $writer;
$this->valueBuilder = $valueBuilder;
+
+ $this->dedupeBag = new NullDedupeBag();
}
/**
@@ -95,10 +97,17 @@
}
/**
- * @return callable
+ * @return DedupeBag
*/
- public function getReferenceSeenCallback() {
- return $this->referenceSeenCallback;
+ public function getDedupeBag() {
+ return $this->dedupeBag;
+ }
+
+ /**
+ * @param DedupeBag $dedupeBag
+ */
+ public function setDedupeBag( DedupeBag $dedupeBag ) {
+ $this->dedupeBag = $dedupeBag;
}
/**
@@ -196,7 +205,7 @@
$this->statementWriter->about(
RdfVocabulary::NS_STATEMENT, $statementLName )
->say( RdfVocabulary::NS_PROV,
'wasDerivedFrom' )->is( RdfVocabulary::NS_REFERENCE, $refLName );
- if ( $this->referenceSeen( $hash ) !== false ) {
+ if ( $this->dedupeBag->alreadySeen( $hash, 'R'
) !== false ) {
continue;
}
@@ -242,7 +251,6 @@
} else {
wfLogWarning( "Unknown rank $rank encountered for
$entityId:{$statement->getGuid()}" );
}
-
}
/**
@@ -284,21 +292,6 @@
if ( $this->propertyMentionCallback ) {
call_user_func( $this->propertyMentionCallback,
$propertyId );
}
- }
-
- /**
- * @param string $hash
- *
- * @return bool
- */
- private function referenceSeen( $hash ) {
- if ( $this->referenceSeenCallback ) {
- if ( call_user_func( $this->referenceSeenCallback,
$hash ) ) {
- return $hash;
- }
- }
-
- return false;
}
/**
diff --git a/repo/includes/rdf/HashDedupeBag.php
b/repo/includes/rdf/HashDedupeBag.php
new file mode 100644
index 0000000..8a68451
--- /dev/null
+++ b/repo/includes/rdf/HashDedupeBag.php
@@ -0,0 +1,58 @@
+<?php
+
+namespace Wikibase;
+
+/**
+ * Null implementation of DedupeBag.
+ *
+ * @since 0.5
+ * @internal
+ *
+ * @licence GNU GPL v2+
+ * @author Daniel Kinzler
+ */
+class HashDedupeBag implements DedupeBag {
+
+ /**
+ * @var array
+ */
+ private $bag;
+
+ /**
+ * @var int
+ */
+ private $cutoff;
+
+ /**
+ * @param int $cutoff The number of hash characters to use. A larger
number means
+ * less collisions, but a larger bag. The number can be read as
an exponent
+ * to the size of the hash's alphabet, so with a hex hash and
$cutoff = 5,
+ * you'd get a max bag size of 16^5, and a collision probability
of 16^-5.
+ */
+ public function __construct( $cutoff = 5 ) {
+ $this->cutoff = $cutoff;
+
+ $this->bag = array();
+ }
+
+ /**
+ * @see DedupeBag::alreadySeen
+ *
+ * Always returns false.
+ *
+ * @param string $hash
+ * @param string $namespace
+ *
+ * @return bool
+ */
+ public function alreadySeen( $hash, $namespace = '' ) {
+ $key = $namespace . substr( $hash, 0, $this->cutoff );
+ if ( !isset( $this->bag[$key] ) || $this->bag[$key] !== $hash )
{
+ $this->bag[$key] = $hash;
+ return false;
+ }
+
+ return true;
+ }
+
+}
diff --git a/repo/includes/rdf/NullDedupeBag.php
b/repo/includes/rdf/NullDedupeBag.php
new file mode 100644
index 0000000..15dbb05
--- /dev/null
+++ b/repo/includes/rdf/NullDedupeBag.php
@@ -0,0 +1,30 @@
+<?php
+
+namespace Wikibase;
+
+/**
+ * Null implementation of DedupeBag.
+ *
+ * @since 0.5
+ * @internal
+ *
+ * @licence GNU GPL v2+
+ * @author Daniel Kinzler
+ */
+class NullDedupeBag implements DedupeBag {
+
+ /**
+ * @see DedupeBag::alreadySeen
+ *
+ * Always returns false.
+ *
+ * @param string $hash
+ * @param string $namespace
+ *
+ * @return bool
+ */
+ public function alreadySeen( $hash, $namespace = '' ) {
+ return false;
+ }
+
+}
diff --git a/repo/includes/rdf/RdfBuilder.php b/repo/includes/rdf/RdfBuilder.php
index a453578..886ea13 100644
--- a/repo/includes/rdf/RdfBuilder.php
+++ b/repo/includes/rdf/RdfBuilder.php
@@ -8,8 +8,6 @@
use Wikibase\DataModel\Entity\Entity;
use Wikibase\DataModel\Entity\EntityId;
use Wikibase\DataModel\Entity\Property;
-use Wikibase\DataModel\SiteLink;
-use Wikibase\DataModel\Snak\PropertyValueSnak;
use Wikibase\DataModel\Entity\PropertyDataTypeLookup;
use Wikibase\Lib\Store\EntityLookup;
use Wikimedia\Purtle\RdfWriter;
@@ -49,8 +47,7 @@
private $writer;
/**
- * Hash to store seen references/values for deduplication
- * @var BagOStuff
+ * @var DedupeBag
*/
private $dedupBag;
@@ -73,12 +70,12 @@
* @param PropertyDataTypeLookup $propertyLookup
* @param integer $flavor
* @param RdfWriter $writer
- * @param BagOStuff|null $dedupBag Container used for deduplication of
refs/values
+ * @param DedupeBag $dedupBag
*/
public function __construct( SiteList $sites, RdfVocabulary $vocabulary,
PropertyDataTypeLookup $propertyLookup, $flavor,
RdfWriter $writer,
- BagOStuff $dedupBag = null
+ DedupeBag $dedupBag
) {
$this->vocabulary = $vocabulary;
$this->dedupBag = $dedupBag;
@@ -130,15 +127,12 @@
if ( $this->shouldProduce( RdfProducer::PRODUCE_FULL_VALUES ) )
{
$self = $this; // PHP 5.3 compat
- $valueSeen = function( $hash ) use ( $self ) {
- return $self->alreadySeen( $hash, 'V' );
- };
// NOTE: us sub-writers for nested structures
$valueWriter = $writer->sub();
$statementValueBuilder = new ComplexValueRdfBuilder(
$vocabulary, $valueWriter, $propertyLookup );
- $statementValueBuilder->setValueSeenCallback(
$valueSeen );
+ $statementValueBuilder->setDedupeBag( $this->dedupBag );
if ( $this->shouldProduce(
RdfProducer::PRODUCE_RESOLVED_ENTITIES ) ) {
$statementValueBuilder->setEntityMentionCallback( array( $this,
'entityMentioned' ) );
@@ -177,12 +171,9 @@
$statementValueBuilder = $this->newSnakValueBuilder(
$vocabulary, $propertyLookup, $writer );
$self = $this; // PHP 5.3 compat
- $referenceSeen = function ( $hash ) use ( $self ) {
- return $self->alreadySeen( $hash, 'R' );
- };
$statementBuilder = new FullStatementRdfBuilder( $vocabulary,
$writer, $statementValueBuilder );
- $statementBuilder->setReferenceSeenCallback( $referenceSeen );
+ $statementBuilder->setDedupeBag( $this->dedupBag );
if ( $this->shouldProduce( RdfProducer::PRODUCE_PROPERTIES ) ) {
$statementBuilder->setPropertyMentionCallback( array(
$this, 'entityMentioned' ) );
@@ -268,28 +259,6 @@
private function entityResolved( EntityId $entityId ) {
$prefixedId = $entityId->getSerialization();
$this->entitiesResolved[$prefixedId] = true;
- }
-
- /**
- * Did we already see this value? If yes, we may need to skip it
- *
- * @todo Make callback private once we drop PHP 5.3 compat.
- *
- * @param string $hash hash value to check
- * @param string $namespace
- *
- * @return bool
- */
- public function alreadySeen( $hash, $namespace ) {
- if ( !$this->dedupBag ) {
- return false;
- }
- $key = $namespace . substr($hash, 0, 5);
- if ( $this->dedupBag->get( $key ) !== $hash ) {
- $this->dedupBag->set( $key, $hash );
- return false;
- }
- return true;
}
/**
diff --git a/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
b/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
index 0b5f26b..affa007 100644
--- a/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
+++ b/repo/tests/phpunit/includes/rdf/ComplexValueRdfBuilderTest.php
@@ -14,6 +14,9 @@
use Wikibase\DataModel\Entity\EntityIdValue;
use Wikibase\DataModel\Entity\ItemId;
use Wikibase\DataModel\Entity\PropertyId;
+use Wikibase\DedupeBag;
+use Wikibase\HashDedupeBag;
+use Wikibase\NullDedupeBag;
use Wikimedia\Purtle\RdfWriter;
use Wikibase\Rdf\Test\RdfBuilderTestData;
use Wikibase\RdfVocabulary;
@@ -56,22 +59,22 @@
*
* @return ComplexValueRdfBuilder
*/
- private function newBuilder( array &$mentioned = array(), array
$valuesSeen = array() ) {
+ private function newBuilder( array &$mentioned = array(), DedupeBag
$bag = null ) {
$entityMentioned = function( EntityId $id ) use ( &$mentioned )
{
$key = $id->getSerialization();
$mentioned[$key] = $id;
};
+ if ( !$bag ) {
+ $bag = new NullDedupeBag();
+ }
+
$vocabulary = $this->getTestData()->getVocabulary();
$valueWriter = $this->getTestData()->getNTriplesWriter();
- $valueSeenCallback = function( $hash ) use ( $valuesSeen ) {
- return in_array( $hash, $valuesSeen );
- };
-
$builder = new ComplexValueRdfBuilder( $vocabulary,
$valueWriter, $this->getTestData()->getMockRepository() );
$builder->setEntityMentionCallback( $entityMentioned );
- $builder->setValueSeenCallback( $valueSeenCallback );
+ $builder->setDedupeBag( $bag );
// HACK: glue on the value writer as a public field, so we can
evaluate it later.
$builder->test_value_writer = $valueWriter;
@@ -338,7 +341,10 @@
$writer->about( RdfVocabulary::NS_ENTITY, 'Q11' );
$mentioned = array();
- $seen = array( $value->getHash() );
+
+ $seen = new HashDedupeBag();
+ $seen->alreadySeen( $value->getHash(), 'V' );
+
$builder = $this->newBuilder( $mentioned, $seen );
$builder->addSnakValue( $writer, $propertyId, $value,
RdfVocabulary::NS_DIRECT_CLAIM );
diff --git a/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
b/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
index ce5f0b3..96520fc 100644
--- a/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
+++ b/repo/tests/phpunit/includes/rdf/FullStatementsRdfBuilderTest.php
@@ -4,6 +4,9 @@
use Wikibase\ComplexValueRdfBuilder;
use Wikibase\DataModel\Entity\EntityId;
+use Wikibase\DedupeBag;
+use Wikibase\HashDedupeBag;
+use Wikibase\NullDedupeBag;
use Wikimedia\Purtle\NTriplesRdfWriter;
use Wikibase\Rdf\Test\RdfBuilderTestData;
use Wikibase\RdfProducer;
@@ -44,11 +47,11 @@
/**
* @param int $flavor Bitmap for the output flavor, use
RdfProducer::PRODUCE_XXX constants.
* @param EntityId[] &$mentioned Receives any entity IDs being
mentioned.
- * @param string[] $referencesSeen A list of reference hashes that
should be considered "already seen".
+ * @param DedupeBag $dedupe A bag of reference hashes that should be
considered "already seen".
*
* @return FullStatementRdfBuilder
*/
- private function newBuilder( $flavor, array &$mentioned = array(),
array $referencesSeen = array() ) {
+ private function newBuilder( $flavor, array &$mentioned = array(),
DedupeBag $dedupe = null ) {
$vocabulary = $this->getTestData()->getVocabulary();
$writer = $this->getTestData()->getNTriplesWriter();
@@ -58,6 +61,10 @@
$mentioned[$key] = $id;
};
+ if ( !$dedupe ) {
+ $dedupe = new NullDedupeBag();
+ }
+
if ( $flavor & RdfProducer::PRODUCE_FULL_VALUES ) {
$valueWriter = $writer->sub();
@@ -66,12 +73,8 @@
$statementValueBuilder = new SimpleValueRdfBuilder(
$vocabulary, $this->getTestData()->getMockRepository() );
}
- $referenceSeenCallback = function( $hash ) use (
$referencesSeen ) {
- return in_array( $hash, $referencesSeen );
- };
-
$statementBuilder = new FullStatementRdfBuilder( $vocabulary,
$writer, $statementValueBuilder );
- $statementBuilder->setReferenceSeenCallback(
$referenceSeenCallback );
+ $statementBuilder->setDedupeBag( $dedupe );
if ( $flavor & RdfProducer::PRODUCE_PROPERTIES ) {
$statementBuilder->setPropertyMentionCallback(
$entityMentioned );
@@ -154,8 +157,14 @@
public function testAddEntity_seen( $entityName, $dataSetName, array
$referencesSeen ) {
$entity = $this->getTestData()->getEntity( $entityName );
+ $dedupe = new HashDedupeBag();
+
+ foreach ( $referencesSeen as $hash ) {
+ $dedupe->alreadySeen( $hash, 'R' );
+ }
+
$mentioned = array();
- $builder = $this->newBuilder( RdfProducer::PRODUCE_ALL,
$mentioned, $referencesSeen );
+ $builder = $this->newBuilder( RdfProducer::PRODUCE_ALL,
$mentioned, $dedupe );
$builder->addEntity( $entity );
$this->assertOrCreateNTriples( $dataSetName, $builder );
diff --git a/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
b/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
index b0e6105..34d2fb6 100644
--- a/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
+++ b/repo/tests/phpunit/includes/rdf/RdfBuilderTest.php
@@ -3,6 +3,8 @@
namespace Wikibase\Test;
use Wikibase\DataModel\Entity\Entity;
+use Wikibase\DedupeBag;
+use Wikibase\HashDedupeBag;
use Wikimedia\Purtle\NTriplesRdfWriter;
use Wikibase\Rdf\Test\RdfBuilderTestData;
use Wikibase\RdfBuilder;
@@ -40,10 +42,11 @@
/**
* @return RdfBuilder
*/
- private function newRdfBuilder( $produce, \BagOStuff $dedup = null ) {
+ private function newRdfBuilder( $produce, DedupeBag $dedup = null ) {
if( !$dedup ) {
- $dedup = new \HashBagOStuff();
+ $dedup = new HashDedupeBag();
}
+
$emitter = new NTriplesRdfWriter();
$builder = new RdfBuilder(
$this->getTestData()->getSiteList(),
@@ -53,6 +56,7 @@
$emitter,
$dedup
);
+
$builder->startDocument();
return $builder;
}
@@ -165,7 +169,8 @@
}
public function testDeduplication() {
- $bag = new \HashBagOStuff();
+ $bag = new HashDedupeBag();
+
$builder = $this->newRdfBuilder( RdfProducer::PRODUCE_ALL, $bag
);
$builder->addEntity( $this->getEntityData( 'Q7' ) );
$data1 = $this->getDataFromBuilder( $builder );
--
To view, visit https://gerrit.wikimedia.org/r/204486
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: Ie6d4c1527ef00200bc278eac604915bebf285778
Gerrit-PatchSet: 1
Gerrit-Project: mediawiki/extensions/Wikibase
Gerrit-Branch: master
Gerrit-Owner: Daniel Kinzler <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits