Manybubbles has uploaded a new change for review.
https://gerrit.wikimedia.org/r/199171
Change subject: Add RDF Munger
......................................................................
Add RDF Munger
Munges RDF - probably not what we need in the end but a good example.
Change-Id: I7787a9692e65c727214bf15971f1d4add51bad04
---
M README.md
M common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
A common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
A tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
D tools/src/test/java/org/wikidata/query/rdf/tool/DummyUnitTest.java
R
tools/src/test/java/org/wikidata/query/rdf/tool/rdf/BlazegraphCanaryIntegrationTest.java
A tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
7 files changed, 178 insertions(+), 25 deletions(-)
git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf
refs/changes/71/199171/1
diff --git a/README.md b/README.md
index f1e042b..6eae4ac 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,39 @@
==================
Tools for Querying Wikibase instances with RDF. Two modules:
-* tools - Tools for syncing a Wikibase instance with an SPARQL 1.1 complian
triple store
-** Apache Licensed
+* tools - Tools for syncing a Wikibase instance with an SPARQL 1.1 compliant
triple store
+ * Apache Licensed
* blazegraph - Blazegraph extension to make querying Wikibase instances more
efficient
-** GPLv2 Licensed
+ * GPLv2 Licensed
+* common - Code shared between tools and blazegraph
+ * Apache Licensed
Development Notes
-----------------
-* Eclipse - Works well with m2e.
+### Eclipse
+Works well with m2e.
+
+### Randomized Testing
+Some tests use RandomizedRunner. If they fail you'll get a stack trace
containing a "seed" that looks like this:
+```
+ at
__randomizedtesting.SeedInfo.seed([A4D62887A701F9F1:1BF047C091E0A9C2]:0)
+```
+You can reuse that seed by adding @Seed to the test class like this:
+```java
+ @RunWith(RandomizedRunner.class)
+ @Seed("A4D62887A701F9F1:1BF047C091E0A9C2")
+ public class MungerUnitTest extends RandomizedTest {
+```
+Just remember to remove the @Seed annotation before committing the code.
+
+We use RandomizedRunner because it's a good way to cover a ton of testing
ground with relatively little code. It's how Lucene consistently finds bugs in
the JVM before they're hit in production.
+
+### Unit and Integration Testing
+All tests either end in "UnitTest" or "IntegrationTest". "UnitTest"s are so
named because they don't need any external services. "IntegrationTest"s either
need to spin up some service like Blazegraph or they need an Internet
connection to wikidata.org or test.wikidata.org.
+
+### Blazegraph
+We use Blazegraph for testing SPARQL. You can start it from the command line
by running
+```bash
+ cd tools && runBlazegraph.sh
+```
+It is started automatically during integration testing.
diff --git
a/common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
b/common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
index 49fcd0a..8ef1cb8 100644
--- a/common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
+++ b/common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
@@ -7,7 +7,7 @@
/**
* An EntityData instance for wikidata.org.
*/
- public static Entity WIKIDATA = new Entity("wikidata.org");
+ public static EntityData WIKIDATA = new EntityData("wikidata.org");
private final String namespace;
diff --git
a/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
b/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
new file mode 100644
index 0000000..c681853
--- /dev/null
+++ b/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
@@ -0,0 +1,9 @@
+package org.wikidata.query.rdf.common.uri;
+
+/**
+ * Marks the kinds of things (items or properties).
+ */
+public class Ontology {
+ public static final String NAMESPACE = "http://www.wikidata.org/ontology#";
+ public static final String ITEM = NAMESPACE + "Item";
+}
diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
new file mode 100644
index 0000000..1df7175
--- /dev/null
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
@@ -0,0 +1,83 @@
+package org.wikidata.query.rdf.tool.rdf;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.openrdf.model.Resource;
+import org.openrdf.model.Statement;
+import org.openrdf.model.Value;
+import org.openrdf.model.impl.StatementImpl;
+import org.openrdf.model.impl.URIImpl;
+import org.openrdf.model.vocabulary.RDF;
+import org.wikidata.query.rdf.common.uri.Entity;
+import org.wikidata.query.rdf.common.uri.EntityData;
+import org.wikidata.query.rdf.common.uri.Ontology;
+import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
+
+/**
+ * Munges RDF from Wikibase into a more queryable format. Note that this is
+ * tightly coupled with Wikibase's export format.
+ */
+public class Munger {
+ private final EntityData entityDataUris;
+ private final Entity entityUris;
+
+ public Munger(EntityData entityDataUris, Entity entityUris) {
+ this.entityDataUris = entityDataUris;
+ this.entityUris = entityUris;
+ }
+
+ /**
+ * Adds and removes entries from the statements collection to munge
Wikibase
+ * RDF exports into a more queryable form.
+ *
+ * @param statements statements to munge
+ * @return a reference to statements
+ */
+ public Collection<Statement> munge(Collection<Statement> statements) {
+ Iterator<Statement> itr = statements.iterator();
+ Value revisionId = null;
+ Value lastModified = null;
+ Resource entity = null;
+
+ while (itr.hasNext()) {
+ Statement s = itr.next();
+ String subject = s.getSubject().stringValue();
+ String predicate = s.getPredicate().stringValue();
+ if (subject.startsWith(entityDataUris.namespace())) {
+ if (revisionId == null &&
predicate.equals(SchemaDotOrg.VERSION)) {
+ revisionId = s.getObject();
+ } else if (lastModified == null &&
predicate.equals(SchemaDotOrg.DATE_MODIFIED)) {
+ lastModified = s.getObject();
+ } else if (entity == null &&
predicate.equalsIgnoreCase(SchemaDotOrg.ABOUT)) {
+ try {
+ entity = (Resource) s.getObject();
+ } catch (ClassCastException e) {
+ throw new RuntimeException("Unexepect object with
schema:about predicate. "
+ + "Expected data:Q<foo> schema:about
entity:Q<foo>", e);
+ }
+ }
+ itr.remove();
+ } else if (subject.startsWith(entityUris.namespace())) {
+ entity = s.getSubject();
+ if (predicate.equals(RDF.TYPE) &&
s.getObject().stringValue().equals(Ontology.ITEM)) {
+ // We don't need wd:Q1 a wdo:item
+ itr.remove();
+ }
+ }
+ }
+ if (revisionId == null) {
+ throw new RuntimeException("Didn't get a revision id!");
+ }
+ if (lastModified == null) {
+ throw new RuntimeException("Didn't get a last modified date!");
+ }
+ if (entity == null) {
+ throw new RuntimeException("Didn't get any entity information!");
+ }
+ statements.add(new StatementImpl(entity, new
URIImpl(SchemaDotOrg.VERSION), revisionId));
+ statements.add(new StatementImpl(entity, new
URIImpl(SchemaDotOrg.DATE_MODIFIED), lastModified));
+
+ return statements;
+ }
+}
diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/DummyUnitTest.java
b/tools/src/test/java/org/wikidata/query/rdf/tool/DummyUnitTest.java
deleted file mode 100644
index 4cd0718..0000000
--- a/tools/src/test/java/org/wikidata/query/rdf/tool/DummyUnitTest.java
+++ /dev/null
@@ -1,19 +0,0 @@
-package org.wikidata.query.rdf.tool;
-
-import static org.hamcrest.Matchers.lessThan;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import com.carrotsearch.randomizedtesting.RandomizedRunner;
-import com.carrotsearch.randomizedtesting.RandomizedTest;
-
-@RunWith(RandomizedRunner.class)
-public class DummyUnitTest extends RandomizedTest {
- @Test
- public void dummy() {
- // TODO remove me when there are real tests here
- assertThat(randomIntBetween(0, 10), lessThan(11));
- }
-}
-
diff --git
a/tools/src/test/java/org/wikidata/query/rdf/tool/BlazegraphCanaryIntegrationTest.java
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/BlazegraphCanaryIntegrationTest.java
similarity index 94%
rename from
tools/src/test/java/org/wikidata/query/rdf/tool/BlazegraphCanaryIntegrationTest.java
rename to
tools/src/test/java/org/wikidata/query/rdf/tool/rdf/BlazegraphCanaryIntegrationTest.java
index 850336b..87fa517 100644
---
a/tools/src/test/java/org/wikidata/query/rdf/tool/BlazegraphCanaryIntegrationTest.java
+++
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/BlazegraphCanaryIntegrationTest.java
@@ -1,4 +1,4 @@
-package org.wikidata.query.rdf.tool;
+package org.wikidata.query.rdf.tool.rdf;
import static org.hamcrest.Matchers.containsString;
import static org.junit.Assert.assertThat;
diff --git
a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
new file mode 100644
index 0000000..8a78d2b
--- /dev/null
+++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
@@ -0,0 +1,52 @@
+package org.wikidata.query.rdf.tool.rdf;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.wikidata.query.rdf.tool.StatementHelper.statement;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.hamcrest.Matcher;
+import org.hamcrest.Matchers;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.openrdf.model.Statement;
+import org.openrdf.model.impl.LiteralImpl;
+import org.wikidata.query.rdf.common.uri.Entity;
+import org.wikidata.query.rdf.common.uri.EntityData;
+import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
+
+import com.carrotsearch.randomizedtesting.RandomizedRunner;
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.google.common.collect.ImmutableList;
+
+@RunWith(RandomizedRunner.class)
+public class MungerUnitTest extends RandomizedTest {
+ private final Munger munger = new Munger(EntityData.WIKIDATA,
Entity.WIKIDATA);
+
+ @Test
+ public void mungesEntityData() {
+ List<Statement> statements = new ArrayList<>();
+ String entityData = EntityData.WIKIDATA.namespace() + "Q23";
+ // EntityData is all munged onto Entity
+ statements.add(statement(entityData, SchemaDotOrg.ABOUT, "Q23"));
+ statements.add(statement(entityData, SchemaDotOrg.VERSION, new
LiteralImpl("a revision number I promise")));
+ statements.add(statement(entityData, SchemaDotOrg.DATE_MODIFIED, new
LiteralImpl("a date I promise")));
+
+ // Stuff from entity isn't messed with
+ boolean hasExtra = randomBoolean();
+ if (hasExtra) {
+ statements.add(statement("Q23", "P509", "Q6"));
+ }
+
+ munger.munge(statements);
+ // This Matcher is so hard to build......
+ ImmutableList.Builder<Matcher<? super Statement>> matchers =
ImmutableList.builder();
+ matchers.add(equalTo(statement("Q23", SchemaDotOrg.VERSION, new
LiteralImpl("a revision number I promise"))));
+ matchers.add(equalTo(statement("Q23", SchemaDotOrg.DATE_MODIFIED, new
LiteralImpl("a date I promise"))));
+ if (hasExtra) {
+ matchers.add(equalTo(statement("Q23", "P509", "Q6")));
+ }
+ assertThat(statements, Matchers.<Statement>
containsInAnyOrder(matchers.build()));
+ }
+}
--
To view, visit https://gerrit.wikimedia.org/r/199171
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings
Gerrit-MessageType: newchange
Gerrit-Change-Id: I7787a9692e65c727214bf15971f1d4add51bad04
Gerrit-PatchSet: 1
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits