Manybubbles has uploaded a new change for review. https://gerrit.wikimedia.org/r/201101
Change subject: Add an option to skip site links ...................................................................... Add an option to skip site links Site links take up a ton of space so it might be worth skipping them. IDs 1-200 (including the missing IDs) is ~60,000 triples without site links and ~150,000 with them. Also fix an error encountered loading strings with \ in them. This would be an isolated commit but its test overlaps with the above and it isn't worth it to break it out. Change-Id: Icc8a3b4c327e4719c0eb221060218448ebcdd877 --- M tools/src/main/java/org/wikidata/query/rdf/tool/Update.java M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java M tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java M tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java M tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java 6 files changed, 146 insertions(+), 35 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/01/201101/1 diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java index e249f4e..87400a9 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/Update.java @@ -73,7 +73,11 @@ @Option(shortName = "u", description = "URL to post updates and queries.") String sparqlUrl(); - // TODO need option to limit the load to certain languages or sites + // TODO need option to limit the load to certain languages or to only + // import a single label fallback style + @Option(description = "Skip site links") + boolean skipSiteLinks(); + @Option(helpRequest = true) boolean help(); } @@ -126,7 +130,12 @@ ExecutorService executor = new ThreadPoolExecutor(10, 10, 0, TimeUnit.SECONDS, new LinkedBlockingQueue<Runnable>(), threadFactory.build()); - 
new Update<>(changeSource, wikibaseRepository, rdfRepository, entityUris, entityDataUris, executor).run(); + Munger munger = new Munger(entityDataUris, entityUris); + if (options.skipSiteLinks()) { + munger = munger.removeSiteLinks(); + } + + new Update<>(changeSource, wikibaseRepository, rdfRepository, munger, executor).run(); } /** @@ -198,17 +207,15 @@ private Change.Source<B> changeSource; private final WikibaseRepository wikibase; private final RdfRepository rdfRepository; - private final Entity entityUris; - private final EntityData entityDataUris; + private final Munger munger; private final ExecutorService executor; public Update(Change.Source<B> changeSource, WikibaseRepository wikibase, RdfRepository rdfRepository, - Entity entityUris, EntityData entityDataUris, ExecutorService executor) { + Munger munger, ExecutorService executor) { this.changeSource = changeSource; this.wikibase = wikibase; this.rdfRepository = rdfRepository; - this.entityUris = entityUris; - this.entityDataUris = entityDataUris; + this.munger = munger; this.executor = executor; } @@ -282,7 +289,6 @@ log.debug("RDF repostiroy already has this revision, skipping."); return; } - Munger munger = new Munger(entityDataUris, entityUris); rdfRepository.sync(change.entityId(), munger.munge(change.entityId(), wikibase.fetchRdfForEntity(change.entityId()))); updateMeter.mark(); diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java index 858c7c2..32f3f8a 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java @@ -1,8 +1,10 @@ package org.wikidata.query.rdf.tool.rdf; +import java.util.ArrayList; import java.util.Collection; import java.util.HashSet; import java.util.Iterator; +import java.util.List; import java.util.Locale; import java.util.Set; @@ -18,6 +20,9 @@ import org.wikidata.query.rdf.common.uri.SKOS; import 
org.wikidata.query.rdf.common.uri.SchemaDotOrg; +import com.google.common.collect.ArrayListMultimap; +import com.google.common.collect.ListMultimap; + /** * Munges RDF from Wikibase into a more queryable format. Note that this is * tightly coupled with Wikibase's export format. @@ -25,10 +30,23 @@ public class Munger { private final EntityData entityDataUris; private final Entity entityUris; + private final boolean removeSiteLinks; public Munger(EntityData entityDataUris, Entity entityUris) { + this(entityDataUris, entityUris, false); + } + + private Munger(EntityData entityDataUris, Entity entityUris, boolean removeSiteLinks) { this.entityDataUris = entityDataUris; this.entityUris = entityUris; + this.removeSiteLinks = removeSiteLinks; + } + + /** + * Build a Munger that removes site links. + */ + public Munger removeSiteLinks() { + return new Munger(entityDataUris, entityUris, true); } /** @@ -50,8 +68,21 @@ Value lastModified = null; Resource entity = null; + /* + * A list of statements that were removed from the original collection + * in error. + */ + List<Statement> restoredStatements = new ArrayList<>(); + /* + * Subject of all sitelinks. + */ Set<String> siteLinks = new HashSet<>(); - Set<String> unknownSubjects = new HashSet<>(); + /* + * Subjects that likely showed up in statements in error. If a later + * statement merits the re-inclusion of the subject then its statements + * will be removed from this multimap and added to restoredStatement. + */ + ListMultimap<String, Statement> unknownSubjects = ArrayListMultimap.create(); while (itr.hasNext()) { Statement s = itr.next(); String subject = s.getSubject().stringValue(); @@ -94,25 +125,34 @@ continue; } /* - * We make an effort to detect subjects that we don't recognize so - * we can report them as an error. We first have to filter out - * sitelinks. 
+ * Detecting site links is important so we can (optionally) filter + * them out and so that we can report everything that isn't a + * sitelink or proper subject as an error. */ if (siteLinks.contains(subject)) { + if (removeSiteLinks) { + itr.remove(); + } continue; } if (predicate.equals(RDF.TYPE) && s.getObject().stringValue().equals(SchemaDotOrg.ARTICLE)) { siteLinks.add(subject); // Site links may have crept into unknown subjects if they // appeared in a funky order. - unknownSubjects.remove(subject); + if (removeSiteLinks) { + itr.remove(); + unknownSubjects.removeAll(subject); + } else { + restoredStatements.addAll(unknownSubjects.removeAll(subject)); + } continue; } - unknownSubjects.add(subject); + unknownSubjects.put(subject, s); + itr.remove(); } if (!unknownSubjects.isEmpty()) { - throw new BadSubjectException(unknownSubjects, entityDataUris, entityUris); + throw new BadSubjectException(unknownSubjects.keySet(), entityDataUris, entityUris); } if (revisionId == null) { throw new RuntimeException("Didn't get a revision id for " + statements); @@ -125,7 +165,7 @@ } statements.add(new StatementImpl(entity, new URIImpl(SchemaDotOrg.VERSION), revisionId)); statements.add(new StatementImpl(entity, new URIImpl(SchemaDotOrg.DATE_MODIFIED), lastModified)); - + statements.addAll(restoredStatements); return statements; } diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java index 21ed702..12a329e 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/UpdateBuilder.java @@ -7,7 +7,7 @@ import org.openrdf.model.Literal; import org.openrdf.model.Statement; import org.openrdf.model.URI; -import org.openrdf.model.util.Literals; +import org.openrdf.model.vocabulary.XMLSchema; import com.google.common.base.Joiner; @@ -102,13 +102,13 @@ StringBuilder sb = new 
StringBuilder(l.getLabel().length() * 2); sb.append('"'); - sb.append(l.getLabel().replace("\"", "\\\"")); + sb.append(l.getLabel().replace("\\", "\\\\").replace("\"", "\\\"")); sb.append('"'); - if (Literals.isLanguageLiteral(l)) { + if (l.getLanguage() != null) { sb.append('@'); sb.append(l.getLanguage()); - } else { + } else if (!l.getDatatype().equals(XMLSchema.STRING)) { sb.append("^^<"); sb.append(l.getDatatype()); sb.append(">"); diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java b/tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java index c49dde0..76134ef 100644 --- a/tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/StatementHelper.java @@ -3,9 +3,14 @@ import org.openrdf.model.Statement; import org.openrdf.model.URI; import org.openrdf.model.Value; +import org.openrdf.model.impl.LiteralImpl; import org.openrdf.model.impl.StatementImpl; import org.openrdf.model.impl.URIImpl; import org.wikidata.query.rdf.common.uri.Entity; +import org.wikidata.query.rdf.common.uri.RDF; +import org.wikidata.query.rdf.common.uri.SchemaDotOrg; + +import com.google.common.collect.ImmutableList; /** * Constructs statements for testing. @@ -26,6 +31,43 @@ } /** + * Build the statements describing a sitelink. + * + * @param entityId entity being linked + * @param link address of the link + * @param language language the link is in + * @return statements describing the sitelink + */ + public static ImmutableList<Statement> siteLink(String entityId, String link, String language) { + return siteLink(entityId, link, language, false); + } + + /** + * Build the statements describing a sitelink. + * + * @param entityId entity being linked + * @param link address of the link + * @param language language the link is in + * @param outOfOrder should the link be out of order compared to how + * wikidata dumps it? 
+ * @return statements describing the sitelink + */ + + public static ImmutableList<Statement> siteLink(String entityId, String link, String language, boolean outOfOrder) { + if (outOfOrder) { + return ImmutableList.of(// + statement(link, SchemaDotOrg.IN_LANGUAGE, new LiteralImpl(language)),// + statement(link, SchemaDotOrg.ABOUT, entityId),// + statement(link, RDF.TYPE, SchemaDotOrg.ARTICLE)); + + } + return ImmutableList.of(// + statement(link, RDF.TYPE, SchemaDotOrg.ARTICLE),// + statement(link, SchemaDotOrg.ABOUT, entityId),// + statement(link, SchemaDotOrg.IN_LANGUAGE, new LiteralImpl(language))); + } + + /** * Convert a string into a URI for testing. */ public static URI uri(String r) { diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java index a6525ff..0643221 100644 --- a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java @@ -4,6 +4,7 @@ import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.hasItem; import static org.hamcrest.Matchers.not; +import static org.wikidata.query.rdf.tool.StatementHelper.siteLink; import static org.wikidata.query.rdf.tool.StatementHelper.statement; import java.util.ArrayList; @@ -51,7 +52,7 @@ List<Statement> statements = basicEntity("Q23"); statements.add(statement("Q23", "P509", "Q6")); munger.munge("Q23", statements); - assertThat(statements, hasItem(equalTo(statement("Q23", "P509", "Q6")))); + assertThat(statements, hasItem(statement("Q23", "P509", "Q6"))); } @Test(expected = BadSubjectException.class) @@ -76,7 +77,7 @@ statements.add(articleDecl); } munger.munge("Q23", statements); - assertThat(statements, both(hasItem(equalTo(articleDecl))).and(hasItem(equalTo(metaDecl)))); + assertThat(statements, both(hasItem(articleDecl)).and(hasItem(metaDecl))); } @Test @@ -88,9 +89,9 @@ List<Statement> 
statements = basicEntity("Q23"); statements.addAll(ImmutableList.of(rdfsDecl, skosDecl, schemaDecl)); munger.munge("Q23", statements); - assertThat(statements, hasItem(equalTo(rdfsDecl))); - assertThat(statements, not(hasItem(equalTo(skosDecl)))); - assertThat(statements, not(hasItem(equalTo(schemaDecl)))); + assertThat(statements, hasItem(rdfsDecl)); + assertThat(statements, not(hasItem(skosDecl))); + assertThat(statements, not(hasItem(schemaDecl))); } @Test @@ -102,8 +103,20 @@ statements.add(georgeDecl); statements.add(marthaDecl); munger.munge("Q23", statements); - assertThat(statements, hasItem(equalTo(georgeDecl))); - assertThat(statements, not(hasItem(equalTo(marthaDecl)))); + assertThat(statements, hasItem(georgeDecl)); + assertThat(statements, not(hasItem(marthaDecl))); + } + + @Test + public void skipSiteLinks() { + List<Statement> siteLink = siteLink("Q23", "http://en.wikipedia.org/wiki/George_Washington", "en", + randomBoolean()); + List<Statement> george = basicEntity("Q23"); + george.addAll(siteLink); + munger.removeSiteLinks().munge("Q23", george); + for (Statement siteLinkPart : siteLink) { + assertThat(george, not(hasItem(siteLinkPart))); + } } private List<Statement> basicEntity(String entityId) { diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java index 268717e..5a39928 100644 --- a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/RdfRepositoryIntegrationTest.java @@ -5,6 +5,7 @@ import static org.junit.Assert.assertThat; import static org.junit.Assert.assertTrue; import static org.wikidata.query.rdf.tool.Matchers.binds; +import static org.wikidata.query.rdf.tool.StatementHelper.siteLink; import static org.wikidata.query.rdf.tool.StatementHelper.statement; import java.math.BigInteger; @@ -47,13 +48,11 @@ @Test public 
void newSiteLink() throws QueryEvaluationException { - repository.sync("Q23", ImmutableList.of(// - statement("http://en.wikipedia.org/wiki/George_Washington", SchemaDotOrg.ABOUT, "Q23"))); - TupleQueryResult r = repository.query("SELECT * WHERE {?s ?p ?o}"); + repository.sync("Q23", siteLink("Q23", "http://en.wikipedia.org/wiki/George_Washington", "en")); + TupleQueryResult r = repository.query("SELECT * WHERE {?s <http://schema.org/about> ?o}"); assertTrue(r.hasNext()); assertThat(r.next(), allOf(// binds("s", "http://en.wikipedia.org/wiki/George_Washington"),// - binds("p", SchemaDotOrg.ABOUT),// binds("o", "Q23"))); assertFalse(r.hasNext()); } @@ -61,13 +60,11 @@ @Test public void moveSiteLink() throws QueryEvaluationException { newSiteLink(); - repository.sync("Q23", ImmutableList.of(// - statement("http://en.wikipedia.org/wiki/George_Washingmoved", SchemaDotOrg.ABOUT, "Q23"))); - TupleQueryResult r = repository.query("SELECT * WHERE {?s ?p ?o}"); + repository.sync("Q23", siteLink("Q23", "http://en.wikipedia.org/wiki/George_Washingmoved", "en")); + TupleQueryResult r = repository.query("SELECT * WHERE {?s <http://schema.org/about> ?o}"); assertTrue(r.hasNext()); assertThat(r.next(), allOf(// binds("s", "http://en.wikipedia.org/wiki/George_Washingmoved"),// - binds("p", SchemaDotOrg.ABOUT),// binds("o", "Q23"))); assertFalse(r.hasNext()); } @@ -113,6 +110,19 @@ } @Test + public void statementWithBackslash() throws QueryEvaluationException { + repository.sync("Q42", ImmutableList.of(// + statement("Q42", "P396", new LiteralImpl("IT\\ICCU\\RAVV\\034417")))); + TupleQueryResult r = repository.query("SELECT * WHERE {?s ?p ?o}"); + assertTrue(r.hasNext()); + assertThat(r.next(), allOf(// + binds("s", "Q42"),// + binds("p", "P396"),// + binds("o", new LiteralImpl("IT\\ICCU\\RAVV\\034417")))); + assertFalse(r.hasNext()); + } + + @Test public void newLabelLanguage() throws QueryEvaluationException { newLabel(); repository.sync("Q23", ImmutableList.of(// -- To view, 
visit https://gerrit.wikimedia.org/r/201101 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Icc8a3b4c327e4719c0eb221060218448ebcdd877 Gerrit-PatchSet: 1 Gerrit-Project: wikidata/query/rdf Gerrit-Branch: master Gerrit-Owner: Manybubbles <never...@wikimedia.org> _______________________________________________ MediaWiki-commits mailing list MediaWiki-commits@lists.wikimedia.org https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits