jenkins-bot has submitted this change and it was merged.

Change subject: Add RDF Munger
......................................................................


Add RDF Munger

Munges RDF - probably not what we need in the end but a good example.

Change-Id: I7787a9692e65c727214bf15971f1d4add51bad04
---
M README.md
M common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
A common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
A tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
D tools/src/test/java/org/wikidata/query/rdf/tool/DummyUnitTest.java
R 
tools/src/test/java/org/wikidata/query/rdf/tool/rdf/BlazegraphCanaryIntegrationTest.java
A tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
7 files changed, 178 insertions(+), 25 deletions(-)

Approvals:
  Manybubbles: Looks good to me, approved
  jenkins-bot: Verified



diff --git a/README.md b/README.md
index f1e042b..6eae4ac 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,39 @@
 ==================
 
 Tools for Querying Wikibase instances with RDF.  Two modules:
-* tools - Tools for syncing a Wikibase instance with an SPARQL 1.1 complian 
triple store
-** Apache Licensed
+* tools - Tools for syncing a Wikibase instance with an SPARQL 1.1 compliant 
triple store
+ * Apache Licensed
 * blazegraph - Blazegraph extension to make querying Wikibase instances more 
efficient
-** GPLv2 Licensed
+ * GPLv2 Licensed
+* common - Code shared between tools and blazegraph
+ * Apache Licensed
 
 Development Notes
 -----------------
-* Eclipse - Works well with m2e.
+### Eclipse
+Works well with m2e.
+
+### Randomized Testing
+Some tests use RandomizedRunner.  If they fail you'll get a stack trace 
containing a "seed" that looks like this:
+```
+       at 
__randomizedtesting.SeedInfo.seed([A4D62887A701F9F1:1BF047C091E0A9C2]:0)
+```
+You can reuse that seed by adding @Seed to the test class like this:
+```java
+       @RunWith(RandomizedRunner.class)
+       @Seed("A4D62887A701F9F1:1BF047C091E0A9C2")
+       public class MungerUnitTest extends RandomizedTest {
+```
+Just remember to remove the @Seed annotation before committing the code.
+
+We use RandomizedRunner because it's a good way to cover a ton of testing 
ground with relatively little code.  It's how Lucene consistently finds bugs in 
the JVM before they're hit in production.
+
+### Unit and Integration Testing
+All tests either end in "UnitTest" or "IntegrationTest".  "UnitTest"s are so 
named because they don't need any external services.  "IntegrationTest"s either 
need to spin up some service like Blazegraph or they need an Internet 
connection to wikidata.org or test.wikidata.org.
+
+### Blazegraph
+We use Blazegraph for testing SPARQL.  You can start it from the command line 
by running
+```bash
+       cd tools && runBlazegraph.sh
+```
+It is started automatically during integration testing.
diff --git 
a/common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java 
b/common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
index 49fcd0a..8ef1cb8 100644
--- a/common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
+++ b/common/src/main/java/org/wikidata/query/rdf/common/uri/EntityData.java
@@ -7,7 +7,7 @@
     /**
      * An EntityData instance for wikidata.org.
      */
-    public static Entity WIKIDATA = new Entity("wikidata.org");
+    public static EntityData WIKIDATA = new EntityData("wikidata.org");
 
     private final String namespace;
 
diff --git 
a/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java 
b/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
new file mode 100644
index 0000000..c681853
--- /dev/null
+++ b/common/src/main/java/org/wikidata/query/rdf/common/uri/Ontology.java
@@ -0,0 +1,9 @@
+package org.wikidata.query.rdf.common.uri;
+
+/**
+ * Marks the kinds of things (items or properties).
+ */
+public class Ontology {
+    /** Prefix shared by the ontology URIs defined in this class. */
+    public static final String NAMESPACE = "http://www.wikidata.org/ontology#";
+    /** URI naming the "Item" kind: {@link #NAMESPACE} followed by {@code Item}. */
+    public static final String ITEM = NAMESPACE + "Item";
+}
diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java 
b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
new file mode 100644
index 0000000..1df7175
--- /dev/null
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
@@ -0,0 +1,83 @@
+package org.wikidata.query.rdf.tool.rdf;
+
+import java.util.Collection;
+import java.util.Iterator;
+
+import org.openrdf.model.Resource;
+import org.openrdf.model.Statement;
+import org.openrdf.model.Value;
+import org.openrdf.model.impl.StatementImpl;
+import org.openrdf.model.impl.URIImpl;
+import org.openrdf.model.vocabulary.RDF;
+import org.wikidata.query.rdf.common.uri.Entity;
+import org.wikidata.query.rdf.common.uri.EntityData;
+import org.wikidata.query.rdf.common.uri.Ontology;
+import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
+
+/**
+ * Munges RDF from Wikibase into a more queryable format. Note that this is
+ * tightly coupled with Wikibase's export format.
+ */
+public class Munger {
+    private final EntityData entityDataUris;
+    private final Entity entityUris;
+
+    /**
+     * Builds a munger for a single Wikibase instance.
+     *
+     * @param entityDataUris supplies the namespace prefixing subjects of
+     *            statements that describe the exported document (revision,
+     *            modification date, which entity it is about)
+     * @param entityUris supplies the namespace prefixing subjects of
+     *            statements about the entity itself
+     */
+    public Munger(EntityData entityDataUris, Entity entityUris) {
+        this.entityDataUris = entityDataUris;
+        this.entityUris = entityUris;
+    }
+
+    /**
+     * Adds and removes entries from the statements collection to munge Wikibase
+     * RDF exports into a more queryable form.
+     * <p>
+     * Every statement whose subject is in the entity data namespace is removed;
+     * the revision id and last modified date found there are added back as
+     * statements about the entity. The {@code rdf:type} statement declaring the
+     * entity to be an {@link Ontology#ITEM} is removed as well. The collection
+     * is modified in place, so it must support {@link Iterator#remove()} and
+     * {@link Collection#add(Object)}.
+     *
+     * @param statements statements to munge
+     * @return a reference to statements
+     * @throws RuntimeException if the object of the schema:about statement is
+     *             not a resource, or if no revision id, last modified date or
+     *             entity could be found
+     */
+    public Collection<Statement> munge(Collection<Statement> statements) {
+        Iterator<Statement> itr = statements.iterator();
+        Value revisionId = null;
+        Value lastModified = null;
+        Resource entity = null;
+        // Predicates are compared as strings below. RDF.TYPE is a URI object,
+        // so it has to be converted: String.equals(URI) is always false.
+        String rdfType = RDF.TYPE.stringValue();
+
+        while (itr.hasNext()) {
+            Statement s = itr.next();
+            String subject = s.getSubject().stringValue();
+            String predicate = s.getPredicate().stringValue();
+            if (subject.startsWith(entityDataUris.namespace())) {
+                // Only the first occurrence of each predicate is recorded.
+                if (revisionId == null && predicate.equals(SchemaDotOrg.VERSION)) {
+                    revisionId = s.getObject();
+                } else if (lastModified == null && predicate.equals(SchemaDotOrg.DATE_MODIFIED)) {
+                    lastModified = s.getObject();
+                } else if (entity == null && predicate.equals(SchemaDotOrg.ABOUT)) {
+                    Value about = s.getObject();
+                    if (!(about instanceof Resource)) {
+                        throw new RuntimeException("Unexpected object with schema:about predicate.  "
+                                + "Expected data:Q<foo> schema:about entity:Q<foo>");
+                    }
+                    entity = (Resource) about;
+                }
+                // Entity data statements never survive, recorded or not.
+                itr.remove();
+            } else if (subject.startsWith(entityUris.namespace())) {
+                // The most recently seen subject in the entity namespace wins.
+                entity = s.getSubject();
+                if (predicate.equals(rdfType) && s.getObject().stringValue().equals(Ontology.ITEM)) {
+                    // We don't need wd:Q1 a wdo:item
+                    itr.remove();
+                }
+            }
+        }
+        if (revisionId == null) {
+            throw new RuntimeException("Didn't get a revision id!");
+        }
+        if (lastModified == null) {
+            throw new RuntimeException("Didn't get a last modified date!");
+        }
+        if (entity == null) {
+            throw new RuntimeException("Didn't get any entity information!");
+        }
+        // Reattach the document level metadata to the entity itself.
+        statements.add(new StatementImpl(entity, new URIImpl(SchemaDotOrg.VERSION), revisionId));
+        statements.add(new StatementImpl(entity, new URIImpl(SchemaDotOrg.DATE_MODIFIED), lastModified));
+
+        return statements;
+    }
+}
diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/DummyUnitTest.java 
b/tools/src/test/java/org/wikidata/query/rdf/tool/DummyUnitTest.java
deleted file mode 100644
index 4cd0718..0000000
--- a/tools/src/test/java/org/wikidata/query/rdf/tool/DummyUnitTest.java
+++ /dev/null
@@ -1,19 +0,0 @@
-package org.wikidata.query.rdf.tool;
-
-import static org.hamcrest.Matchers.lessThan;
-
-import org.junit.Test;
-import org.junit.runner.RunWith;
-
-import com.carrotsearch.randomizedtesting.RandomizedRunner;
-import com.carrotsearch.randomizedtesting.RandomizedTest;
-
-@RunWith(RandomizedRunner.class)
-public class DummyUnitTest extends RandomizedTest {
-    @Test
-    public void dummy() {
-        // TODO remove me when there are real tests here
-        assertThat(randomIntBetween(0, 10), lessThan(11));
-    }
-}
-
diff --git 
a/tools/src/test/java/org/wikidata/query/rdf/tool/BlazegraphCanaryIntegrationTest.java
 
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/BlazegraphCanaryIntegrationTest.java
similarity index 94%
rename from 
tools/src/test/java/org/wikidata/query/rdf/tool/BlazegraphCanaryIntegrationTest.java
rename to 
tools/src/test/java/org/wikidata/query/rdf/tool/rdf/BlazegraphCanaryIntegrationTest.java
index 850336b..87fa517 100644
--- 
a/tools/src/test/java/org/wikidata/query/rdf/tool/BlazegraphCanaryIntegrationTest.java
+++ 
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/BlazegraphCanaryIntegrationTest.java
@@ -1,4 +1,4 @@
-package org.wikidata.query.rdf.tool;
+package org.wikidata.query.rdf.tool.rdf;
 
 import static org.hamcrest.Matchers.containsString;
 import static org.junit.Assert.assertThat;
diff --git 
a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java 
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
new file mode 100644
index 0000000..8a78d2b
--- /dev/null
+++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
@@ -0,0 +1,52 @@
+package org.wikidata.query.rdf.tool.rdf;
+
+import static org.hamcrest.Matchers.equalTo;
+import static org.wikidata.query.rdf.tool.StatementHelper.statement;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.hamcrest.Matcher;
+import org.hamcrest.Matchers;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.openrdf.model.Statement;
+import org.openrdf.model.impl.LiteralImpl;
+import org.wikidata.query.rdf.common.uri.Entity;
+import org.wikidata.query.rdf.common.uri.EntityData;
+import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
+
+import com.carrotsearch.randomizedtesting.RandomizedRunner;
+import com.carrotsearch.randomizedtesting.RandomizedTest;
+import com.google.common.collect.ImmutableList;
+
+@RunWith(RandomizedRunner.class)
+public class MungerUnitTest extends RandomizedTest {
+    private final Munger munger = new Munger(EntityData.WIKIDATA, Entity.WIKIDATA);
+
+    /**
+     * Feeds the munger the three entity data statements (about, version, date
+     * modified) plus, at random, one statement about the entity, then checks
+     * that exactly the version and date statements rewritten onto the entity
+     * (and the optional entity statement) remain.
+     */
+    @Test
+    public void mungesEntityData() {
+        String dataSubject = EntityData.WIKIDATA.namespace() + "Q23";
+        LiteralImpl revision = new LiteralImpl("a revision number I promise");
+        LiteralImpl modified = new LiteralImpl("a date I promise");
+
+        // Everything hanging off the EntityData subject should move to the entity
+        List<Statement> input = new ArrayList<>();
+        input.add(statement(dataSubject, SchemaDotOrg.ABOUT, "Q23"));
+        input.add(statement(dataSubject, SchemaDotOrg.VERSION, revision));
+        input.add(statement(dataSubject, SchemaDotOrg.DATE_MODIFIED, modified));
+
+        // Sometimes include a statement about the entity; it should be left alone
+        boolean withEntityStatement = randomBoolean();
+        if (withEntityStatement) {
+            input.add(statement("Q23", "P509", "Q6"));
+        }
+
+        munger.munge(input);
+
+        // One matcher per statement expected to remain, in no particular order
+        ImmutableList.Builder<Matcher<? super Statement>> expected = ImmutableList.builder();
+        expected.add(equalTo(statement("Q23", SchemaDotOrg.VERSION, revision)));
+        expected.add(equalTo(statement("Q23", SchemaDotOrg.DATE_MODIFIED, modified)));
+        if (withEntityStatement) {
+            expected.add(equalTo(statement("Q23", "P509", "Q6")));
+        }
+        assertThat(input, Matchers.<Statement> containsInAnyOrder(expected.build()));
+    }
+}

-- 
To view, visit https://gerrit.wikimedia.org/r/199171
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: merged
Gerrit-Change-Id: I7787a9692e65c727214bf15971f1d4add51bad04
Gerrit-PatchSet: 2
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <[email protected]>
Gerrit-Reviewer: Manybubbles <[email protected]>
Gerrit-Reviewer: jenkins-bot <>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to