Manybubbles has uploaded a new change for review.

  https://gerrit.wikimedia.org/r/201101

Change subject: Add an option to skip site links
......................................................................

Add an option to skip site links

Site links take up a ton of space so it might be worth skipping them.
IDs 1-200 (including the missing IDs) are ~60,000 triples without site links
and ~150,000 with them.

Also fix an error encountered when loading strings with \ in them.  This
would be an isolated commit, but its test overlaps with the above and it
isn't worth breaking it out.

Change-Id: Icc8a3b4c327e4719c0eb221060218448ebcdd877
---
M tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java
M tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java
M tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
M 
tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java
6 files changed, 146 insertions(+), 35 deletions(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf 
refs/changes/01/201101/1

diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java 
b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
index e249f4e..87400a9 100644
--- a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java
@@ -73,7 +73,11 @@
         @Option(shortName = "u", description = "URL to post updates and 
queries.")
         String sparqlUrl();
 
-        // TODO need option to limit the load to certain languages or sites
+        // TODO need option to limit the load to certain languages or to only
+        // import a single label fallback style
+        @Option(description = "Skip site links")
+        boolean skipSiteLinks();
+
         @Option(helpRequest = true)
         boolean help();
     }
@@ -126,7 +130,12 @@
         ExecutorService executor = new ThreadPoolExecutor(10, 10, 0, 
TimeUnit.SECONDS,
                 new LinkedBlockingQueue<Runnable>(), threadFactory.build());
 
-        new Update<>(changeSource, wikibaseRepository, rdfRepository, 
entityUris, entityDataUris, executor).run();
+        Munger munger = new Munger(entityDataUris, entityUris);
+        if (options.skipSiteLinks()) {
+            munger = munger.removeSiteLinks();
+        }
+
+        new Update<>(changeSource, wikibaseRepository, rdfRepository, munger, 
executor).run();
     }
 
     /**
@@ -198,17 +207,15 @@
     private Change.Source<B> changeSource;
     private final WikibaseRepository wikibase;
     private final RdfRepository rdfRepository;
-    private final Entity entityUris;
-    private final EntityData entityDataUris;
+    private final Munger munger;
     private final ExecutorService executor;
 
     public Update(Change.Source<B> changeSource, WikibaseRepository wikibase, 
RdfRepository rdfRepository,
-            Entity entityUris, EntityData entityDataUris, ExecutorService 
executor) {
+            Munger munger, ExecutorService executor) {
         this.changeSource = changeSource;
         this.wikibase = wikibase;
         this.rdfRepository = rdfRepository;
-        this.entityUris = entityUris;
-        this.entityDataUris = entityDataUris;
+        this.munger = munger;
         this.executor = executor;
     }
 
@@ -282,7 +289,6 @@
             log.debug("RDF repostiroy already has this revision, skipping.");
             return;
         }
-        Munger munger = new Munger(entityDataUris, entityUris);
         rdfRepository.sync(change.entityId(),
                 munger.munge(change.entityId(), 
wikibase.fetchRdfForEntity(change.entityId())));
         updateMeter.mark();
diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java 
b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
index 858c7c2..32f3f8a 100644
--- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
@@ -1,8 +1,10 @@
 package org.wikidata.query.rdf.tool.rdf;
 
+import java.util.ArrayList;
 import java.util.Collection;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.List;
 import java.util.Locale;
 import java.util.Set;
 
@@ -18,6 +20,9 @@
 import org.wikidata.query.rdf.common.uri.SKOS;
 import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
 
+import com.google.common.collect.ArrayListMultimap;
+import com.google.common.collect.ListMultimap;
+
 /**
  * Munges RDF from Wikibase into a more queryable format. Note that this is
  * tightly coupled with Wikibase's export format.
@@ -25,10 +30,23 @@
 public class Munger {
     private final EntityData entityDataUris;
     private final Entity entityUris;
+    private final boolean removeSiteLinks;
 
     public Munger(EntityData entityDataUris, Entity entityUris) {
+        this(entityDataUris, entityUris, false);
+    }
+
+    private Munger(EntityData entityDataUris, Entity entityUris, boolean 
removeSiteLinks) {
         this.entityDataUris = entityDataUris;
         this.entityUris = entityUris;
+        this.removeSiteLinks = removeSiteLinks;
+    }
+
+    /**
+     * Build a Munger that removes site links.
+     */
+    public Munger removeSiteLinks() {
+        return new Munger(entityDataUris, entityUris, true);
     }
 
     /**
@@ -50,8 +68,21 @@
         Value lastModified = null;
         Resource entity = null;
 
+        /*
+         * A list of statements that were removed from the original collection
+         * in error.
+         */
+        List<Statement> restoredStatements = new ArrayList<>();
+        /*
+         * Subject of all sitelinks.
+         */
         Set<String> siteLinks = new HashSet<>();
-        Set<String> unknownSubjects = new HashSet<>();
+        /*
+         * Subjects that likely showed up in statements in error. If a later
+         * statement merits the re-inclusion of the subject then its statements
+         * will be removed from this multimap and added to restoredStatement.
+         */
+        ListMultimap<String, Statement> unknownSubjects = 
ArrayListMultimap.create();
         while (itr.hasNext()) {
             Statement s = itr.next();
             String subject = s.getSubject().stringValue();
@@ -94,25 +125,34 @@
                 continue;
             }
             /*
-             * We make an effort to detect subjects that we don't recognize so
-             * we can report them as an error. We first have to filter out
-             * sitelinks.
+             * Detecting site links is important so we can (optionally) filter
+             * them out and so that we can report everything that isn't a
+             * sitelink or proper subject as an error.
              */
             if (siteLinks.contains(subject)) {
+                if (removeSiteLinks) {
+                    itr.remove();
+                }
                 continue;
             }
             if (predicate.equals(RDF.TYPE) && 
s.getObject().stringValue().equals(SchemaDotOrg.ARTICLE)) {
                 siteLinks.add(subject);
                 // Site links may have crept into unknown subjects if they
                 // appeared in a funky order.
-                unknownSubjects.remove(subject);
+                if (removeSiteLinks) {
+                    itr.remove();
+                    unknownSubjects.removeAll(subject);
+                } else {
+                    
restoredStatements.addAll(unknownSubjects.removeAll(subject));
+                }
                 continue;
             }
-            unknownSubjects.add(subject);
+            unknownSubjects.put(subject, s);
+            itr.remove();
         }
 
         if (!unknownSubjects.isEmpty()) {
-            throw new BadSubjectException(unknownSubjects, entityDataUris, 
entityUris);
+            throw new BadSubjectException(unknownSubjects.keySet(), 
entityDataUris, entityUris);
         }
         if (revisionId == null) {
             throw new RuntimeException("Didn't get a revision id for " + 
statements);
@@ -125,7 +165,7 @@
         }
         statements.add(new StatementImpl(entity, new 
URIImpl(SchemaDotOrg.VERSION), revisionId));
         statements.add(new StatementImpl(entity, new 
URIImpl(SchemaDotOrg.DATE_MODIFIED), lastModified));
-
+        statements.addAll(restoredStatements);
         return statements;
     }
 
diff --git 
a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java 
b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java
index 21ed702..12a329e 100644
--- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java
@@ -7,7 +7,7 @@
 import org.openrdf.model.Literal;
 import org.openrdf.model.Statement;
 import org.openrdf.model.URI;
-import org.openrdf.model.util.Literals;
+import org.openrdf.model.vocabulary.XMLSchema;
 
 import com.google.common.base.Joiner;
 
@@ -102,13 +102,13 @@
                 StringBuilder sb = new StringBuilder(l.getLabel().length() * 
2);
 
                 sb.append('"');
-                sb.append(l.getLabel().replace("\"", "\\\""));
+                sb.append(l.getLabel().replace("\\", "\\\\").replace("\"", 
"\\\""));
                 sb.append('"');
 
-                if (Literals.isLanguageLiteral(l)) {
+                if (l.getLanguage() != null) {
                     sb.append('@');
                     sb.append(l.getLanguage());
-                } else {
+                } else if (!l.getDatatype().equals(XMLSchema.STRING)) {
                     sb.append("^^<");
                     sb.append(l.getDatatype());
                     sb.append(">");
diff --git 
a/tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java 
b/tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java
index c49dde0..76134ef 100644
--- a/tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java
+++ b/tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java
@@ -3,9 +3,14 @@
 import org.openrdf.model.Statement;
 import org.openrdf.model.URI;
 import org.openrdf.model.Value;
+import org.openrdf.model.impl.LiteralImpl;
 import org.openrdf.model.impl.StatementImpl;
 import org.openrdf.model.impl.URIImpl;
 import org.wikidata.query.rdf.common.uri.Entity;
+import org.wikidata.query.rdf.common.uri.RDF;
+import org.wikidata.query.rdf.common.uri.SchemaDotOrg;
+
+import com.google.common.collect.ImmutableList;
 
 /**
  * Constructs statements for testing.
@@ -26,6 +31,43 @@
     }
 
     /**
+     * Build the statements describing a sitelink.
+     *
+     * @param entityId entity being linked
+     * @param link address of the link
+     * @param language language the link is in
+     * @return statements describing the sitelink
+     */
+    public static ImmutableList<Statement> siteLink(String entityId, String 
link, String language) {
+        return siteLink(entityId, link, language, false);
+    }
+
+    /**
+     * Build the statements describing a sitelink.
+     *
+     * @param entityId entity being linked
+     * @param link address of the link
+     * @param language language the link is in
+     * @param outOfOrder should the link be out of order compared to how
+     *            wikidata dumps it?
+     * @return statements describing the sitelink
+     */
+
+    public static ImmutableList<Statement> siteLink(String entityId, String 
link, String language, boolean outOfOrder) {
+        if (outOfOrder) {
+            return ImmutableList.of(//
+                    statement(link, SchemaDotOrg.IN_LANGUAGE, new 
LiteralImpl(language)),//
+                    statement(link, SchemaDotOrg.ABOUT, entityId),//
+                    statement(link, RDF.TYPE, SchemaDotOrg.ARTICLE));
+
+        }
+        return ImmutableList.of(//
+                statement(link, RDF.TYPE, SchemaDotOrg.ARTICLE),//
+                statement(link, SchemaDotOrg.ABOUT, entityId),//
+                statement(link, SchemaDotOrg.IN_LANGUAGE, new 
LiteralImpl(language)));
+    }
+
+    /**
      * Convert a string into a URI for testing.
      */
     public static URI uri(String r) {
diff --git 
a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java 
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
index a6525ff..0643221 100644
--- a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
+++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
@@ -4,6 +4,7 @@
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasItem;
 import static org.hamcrest.Matchers.not;
+import static org.wikidata.query.rdf.tool.StatementHelper.siteLink;
 import static org.wikidata.query.rdf.tool.StatementHelper.statement;
 
 import java.util.ArrayList;
@@ -51,7 +52,7 @@
         List<Statement> statements = basicEntity("Q23");
         statements.add(statement("Q23", "P509", "Q6"));
         munger.munge("Q23", statements);
-        assertThat(statements, hasItem(equalTo(statement("Q23", "P509", 
"Q6"))));
+        assertThat(statements, hasItem(statement("Q23", "P509", "Q6")));
     }
 
     @Test(expected = BadSubjectException.class)
@@ -76,7 +77,7 @@
             statements.add(articleDecl);
         }
         munger.munge("Q23", statements);
-        assertThat(statements, 
both(hasItem(equalTo(articleDecl))).and(hasItem(equalTo(metaDecl))));
+        assertThat(statements, 
both(hasItem(articleDecl)).and(hasItem(metaDecl)));
     }
 
     @Test
@@ -88,9 +89,9 @@
         List<Statement> statements = basicEntity("Q23");
         statements.addAll(ImmutableList.of(rdfsDecl, skosDecl, schemaDecl));
         munger.munge("Q23", statements);
-        assertThat(statements, hasItem(equalTo(rdfsDecl)));
-        assertThat(statements, not(hasItem(equalTo(skosDecl))));
-        assertThat(statements, not(hasItem(equalTo(schemaDecl))));
+        assertThat(statements, hasItem(rdfsDecl));
+        assertThat(statements, not(hasItem(skosDecl)));
+        assertThat(statements, not(hasItem(schemaDecl)));
     }
 
     @Test
@@ -102,8 +103,20 @@
         statements.add(georgeDecl);
         statements.add(marthaDecl);
         munger.munge("Q23", statements);
-        assertThat(statements, hasItem(equalTo(georgeDecl)));
-        assertThat(statements, not(hasItem(equalTo(marthaDecl))));
+        assertThat(statements, hasItem(georgeDecl));
+        assertThat(statements, not(hasItem(marthaDecl)));
+    }
+
+    @Test
+    public void skipSiteLinks() {
+        List<Statement> siteLink = siteLink("Q23", 
"http://en.wikipedia.org/wiki/George_Washington";, "en",
+                randomBoolean());
+        List<Statement> george = basicEntity("Q23");
+        george.addAll(siteLink);
+        munger.removeSiteLinks().munge("Q23", george);
+        for (Statement siteLinkPart : siteLink) {
+            assertThat(george, not(hasItem(siteLinkPart)));
+        }
     }
 
     private List<Statement> basicEntity(String entityId) {
diff --git 
a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java
 
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java
index 268717e..5a39928 100644
--- 
a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java
+++ 
b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java
@@ -5,6 +5,7 @@
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
 import static org.wikidata.query.rdf.tool.Matchers.binds;
+import static org.wikidata.query.rdf.tool.StatementHelper.siteLink;
 import static org.wikidata.query.rdf.tool.StatementHelper.statement;
 
 import java.math.BigInteger;
@@ -47,13 +48,11 @@
 
     @Test
     public void newSiteLink() throws QueryEvaluationException {
-        repository.sync("Q23", ImmutableList.of(//
-                statement("http://en.wikipedia.org/wiki/George_Washington";, 
SchemaDotOrg.ABOUT, "Q23")));
-        TupleQueryResult r = repository.query("SELECT * WHERE {?s ?p ?o}");
+        repository.sync("Q23", siteLink("Q23", 
"http://en.wikipedia.org/wiki/George_Washington";, "en"));
+        TupleQueryResult r = repository.query("SELECT * WHERE {?s 
<http://schema.org/about> ?o}");
         assertTrue(r.hasNext());
         assertThat(r.next(), allOf(//
                 binds("s", "http://en.wikipedia.org/wiki/George_Washington";),//
-                binds("p", SchemaDotOrg.ABOUT),//
                 binds("o", "Q23")));
         assertFalse(r.hasNext());
     }
@@ -61,13 +60,11 @@
     @Test
     public void moveSiteLink() throws QueryEvaluationException {
         newSiteLink();
-        repository.sync("Q23", ImmutableList.of(//
-                statement("http://en.wikipedia.org/wiki/George_Washingmoved";, 
SchemaDotOrg.ABOUT, "Q23")));
-        TupleQueryResult r = repository.query("SELECT * WHERE {?s ?p ?o}");
+        repository.sync("Q23", siteLink("Q23", 
"http://en.wikipedia.org/wiki/George_Washingmoved";, "en"));
+        TupleQueryResult r = repository.query("SELECT * WHERE {?s 
<http://schema.org/about> ?o}");
         assertTrue(r.hasNext());
         assertThat(r.next(), allOf(//
                 binds("s", 
"http://en.wikipedia.org/wiki/George_Washingmoved";),//
-                binds("p", SchemaDotOrg.ABOUT),//
                 binds("o", "Q23")));
         assertFalse(r.hasNext());
     }
@@ -113,6 +110,19 @@
     }
 
     @Test
+    public void statementWithBackslash() throws QueryEvaluationException {
+        repository.sync("Q42", ImmutableList.of(//
+                statement("Q42", "P396", new 
LiteralImpl("IT\\ICCU\\RAVV\\034417"))));
+        TupleQueryResult r = repository.query("SELECT * WHERE {?s ?p ?o}");
+        assertTrue(r.hasNext());
+        assertThat(r.next(), allOf(//
+                binds("s", "Q42"),//
+                binds("p", "P396"),//
+                binds("o", new LiteralImpl("IT\\ICCU\\RAVV\\034417"))));
+        assertFalse(r.hasNext());
+    }
+
+    @Test
     public void newLabelLanguage() throws QueryEvaluationException {
         newLabel();
         repository.sync("Q23", ImmutableList.of(//

-- 
To view, visit https://gerrit.wikimedia.org/r/201101
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Icc8a3b4c327e4719c0eb221060218448ebcdd877
Gerrit-PatchSet: 1
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Manybubbles <never...@wikimedia.org>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to