Smalyshev has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/355565 )

Change subject: Cut overlong object values down to 32k.
......................................................................

Cut overlong object values down to 32k.

Blazegraph does not accept objects longer than 32k; see IVUnicode.java:134.
Bug: T165834

Change-Id: Ib6fc46ba2311da590042e9ad7ba4732666437c59
---
M tools/runUpdate.sh
M tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
M tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
3 files changed, 53 insertions(+), 1 deletion(-)


  git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/65/355565/1

diff --git a/tools/runUpdate.sh b/tools/runUpdate.sh
index dc07654..c95892c 100755
--- a/tools/runUpdate.sh
+++ b/tools/runUpdate.sh
@@ -1,4 +1,4 @@
 #!/bin/bash
 
-java -cp target/wikidata-query-tools-*-SNAPSHOT-jar-with-dependencies.jar 
org.wikidata.query.rdf.tool.Update --sparqlUrl 
http://localhost:9999/bigdata/namespace/kb/sparql $*
+java -cp target/wikidata-query-tools-*-SNAPSHOT-jar-with-dependencies.jar 
org.wikidata.query.rdf.tool.Update --sparqlUrl 
http://localhost:9999/bigdata/namespace/wdq/sparql $*
 
diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
index 740f882..ff912fe 100644
--- a/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/rdf/Munger.java
@@ -33,6 +33,7 @@
 import org.slf4j.LoggerFactory;
 import org.wikidata.query.rdf.common.WikibasePoint;
 import org.wikidata.query.rdf.common.WikibasePoint.CoordinateOrder;
+import org.wikidata.query.rdf.common.uri.GeoSparql;
 import org.wikidata.query.rdf.common.uri.OWL;
 import org.wikidata.query.rdf.common.uri.Ontology;
 import org.wikidata.query.rdf.common.uri.Provenance;
@@ -414,6 +415,12 @@
                 if (!statement()) {
                     itr.remove();
                 }
+                // Check object length, cut if needed.
+                Statement shortStatement = checkObjectLength();
+                if(shortStatement != null) {
+                    itr.remove();
+                    restoredStatements.add(shortStatement);
+                }
             }
 
             statement = null;
@@ -422,6 +429,28 @@
         }
 
         /**
+         * Check whether object's length is more than 32k.
+         * If so, create new statement that cuts object down to 32k.
+         * @return New statement or null if not needed.
+         */
+        private Statement checkObjectLength() {
+            if(statement.getObject() instanceof Literal) {
+                Literal value = (Literal)statement.getObject();
+                if (value.stringValue().length() > Short.MAX_VALUE) {
+                    final Literal newValue;
+                    
if(value.getDatatype().equals(org.openrdf.model.vocabulary.RDF.LANGSTRING)) {
+                        newValue = new 
LiteralImpl(value.stringValue().substring(0, Short.MAX_VALUE), 
value.getLanguage());
+                    } else {
+                        newValue = new 
LiteralImpl(value.stringValue().substring(0, Short.MAX_VALUE), 
value.getDatatype());
+                    }
+                    return new StatementImpl(statement.getSubject(),
+                            statement.getPredicate(), newValue);
+                }
+            }
+            return null;
+        }
+
+        /**
          * Process a statement.
          *
          * @return true to keep the statement, false to remove it
diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
index 6275e61..a9ff440 100644
--- a/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
+++ b/tools/src/test/java/org/wikidata/query/rdf/tool/rdf/MungerUnitTest.java
@@ -13,6 +13,7 @@
 import java.util.Collections;
 import java.util.List;
 
+import org.apache.commons.lang3.StringUtils;
 import org.junit.Test;
 import org.junit.runner.RunWith;
 import org.openrdf.model.Statement;
@@ -343,6 +344,28 @@
         assertThat(result, hasItem(expected));
     }
 
+    /**
+     * Ensure that long strings are cut to MAX_VALUE.
+     */
+    @Test
+    public void veryLongValue() {
+        String longString = StringUtils.repeat("A", Short.MAX_VALUE);
+        List<Statement> result = entity("Q2223")
+                .remove(statement("Q2223", uris.property(PropertyType.DIRECT) 
+ "P9", new LiteralImpl(longString + longString)))
+                // With type
+                .remove(statement("Q2223", uris.property(PropertyType.DIRECT) 
+ "P10", new LiteralImpl(longString + longString, OWL.DATATYPEPROPERTY)))
+                // With language
+                .remove(statement("Q2223", uris.property(PropertyType.DIRECT) 
+ "P11", new LiteralImpl(longString + longString, "en")))
+                .testWithoutShuffle();
+        Statement expected = statement("Q2223", 
uris.property(PropertyType.DIRECT) + "P9", new LiteralImpl(longString));
+        assertThat(result, hasItem(expected));
+        expected = statement("Q2223", uris.property(PropertyType.DIRECT) + 
"P10", new LiteralImpl(longString, OWL.DATATYPEPROPERTY));
+        assertThat(result, hasItem(expected));
+        expected = statement("Q2223", uris.property(PropertyType.DIRECT) + 
"P11", new LiteralImpl(longString, "en"));
+        assertThat(result, hasItem(expected));
+    }
+
+
     @Test
     public void propertyDefs() {
         entity("P1234")

-- 
To view, visit https://gerrit.wikimedia.org/r/355565
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Ib6fc46ba2311da590042e9ad7ba4732666437c59
Gerrit-PatchSet: 1
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Smalyshev <[email protected]>

_______________________________________________
MediaWiki-commits mailing list
[email protected]
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits

Reply via email to