Smalyshev has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/355565 )
Change subject: Cut overlong object values down to 32k. ...................................................................... Cut overlong object values down to 32k. Blazegraph would not accept objects longer than 32k, see IVUnicode.java:134. Bug: T165834 Change-Id: Ib6fc46ba2311da590042e9ad7ba4732666437c59 --- M tools/runUpdate.sh M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java M tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java 3 files changed, 53 insertions(+), 1 deletion(-) git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/65/355565/1 diff --git a/tools/runUpdate.sh b/tools/runUpdate.sh index dc07654..c95892c 100755 --- a/tools/runUpdate.sh +++ b/tools/runUpdate.sh @@ -1,4 +1,4 @@ #!/bin/bash -java -cp target/wikidata-query-tools-*-SNAPSHOT-jar-with-dependencies.jar org.wikidata.query.rdf.tool.Update --sparqlUrl http://localhost:9999/bigdata/namespace/kb/sparql $* +java -cp target/wikidata-query-tools-*-SNAPSHOT-jar-with-dependencies.jar org.wikidata.query.rdf.tool.Update --sparqlUrl http://localhost:9999/bigdata/namespace/wdq/sparql $* diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java index 740f882..ff912fe 100644 --- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java @@ -33,6 +33,7 @@ import org.slf4j.LoggerFactory; import org.wikidata.query.rdf.common.WikibasePoint; import org.wikidata.query.rdf.common.WikibasePoint.CoordinateOrder; +import org.wikidata.query.rdf.common.uri.GeoSparql; import org.wikidata.query.rdf.common.uri.OWL; import org.wikidata.query.rdf.common.uri.Ontology; import org.wikidata.query.rdf.common.uri.Provenance; @@ -414,6 +415,12 @@ if (!statement()) { itr.remove(); } + // Check object length, cut if needed. + Statement shortStatement = checkObjectLength(); + if(shortStatement != null) { + itr.remove(); + restoredStatements.add(shortStatement); + } } statement = null; @@ -422,6 +429,28 @@ } /** + * Check whether object's length is more than 32k. + * If so, create new statement that cuts object down to 32k. + * @return New statement or null if not needed. + */ + private Statement checkObjectLength() { + if(statement.getObject() instanceof Literal) { + Literal value = (Literal)statement.getObject(); + if (value.stringValue().length() > Short.MAX_VALUE) { + final Literal newValue; + if(value.getDatatype().equals(org.openrdf.model.vocabulary.RDF.LANGSTRING)) { + newValue = new LiteralImpl(value.stringValue().substring(0, Short.MAX_VALUE), value.getLanguage()); + } else { + newValue = new LiteralImpl(value.stringValue().substring(0, Short.MAX_VALUE), value.getDatatype()); + } + return new StatementImpl(statement.getSubject(), + statement.getPredicate(), newValue); + } + } + return null; + } + + /** * Process a statement. * * @return true to keep the statement, false to remove it diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java index 6275e61..a9ff440 100644 --- a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java @@ -13,6 +13,7 @@ import java.util.Collections; import java.util.List; +import org.apache.commons.lang3.StringUtils; import org.junit.Test; import org.junit.runner.RunWith; import org.openrdf.model.Statement; @@ -343,6 +344,28 @@ assertThat(result, hasItem(expected)); } + /** + * Ensure that long strings are cut to MAX_VALUE. + */ + @Test + public void veryLongValue() { + String longString = StringUtils.repeat("A", Short.MAX_VALUE); + List<Statement> result = entity("Q2223") + .remove(statement("Q2223", uris.property(PropertyType.DIRECT) + "P9", new LiteralImpl(longString + longString))) + // With type + .remove(statement("Q2223", uris.property(PropertyType.DIRECT) + "P10", new LiteralImpl(longString + longString, OWL.DATATYPEPROPERTY))) + // With language + .remove(statement("Q2223", uris.property(PropertyType.DIRECT) + "P11", new LiteralImpl(longString + longString, "en"))) + .testWithoutShuffle(); + Statement expected = statement("Q2223", uris.property(PropertyType.DIRECT) + "P9", new LiteralImpl(longString)); + assertThat(result, hasItem(expected)); + expected = statement("Q2223", uris.property(PropertyType.DIRECT) + "P10", new LiteralImpl(longString, OWL.DATATYPEPROPERTY)); + assertThat(result, hasItem(expected)); + expected = statement("Q2223", uris.property(PropertyType.DIRECT) + "P11", new LiteralImpl(longString, "en")); + assertThat(result, hasItem(expected)); + } + + @Test public void propertyDefs() { entity("P1234") -- To view, visit https://gerrit.wikimedia.org/r/355565 To unsubscribe, visit https://gerrit.wikimedia.org/r/settings Gerrit-MessageType: newchange Gerrit-Change-Id: Ib6fc46ba2311da590042e9ad7ba4732666437c59 Gerrit-PatchSet: 1 Gerrit-Project: wikidata/query/rdf Gerrit-Branch: master Gerrit-Owner: Smalyshev <[email protected]> _______________________________________________ MediaWiki-commits mailing list [email protected] https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits
