Hjfocs has uploaded a new change for review. ( https://gerrit.wikimedia.org/r/365253 )
Change subject: T169045: Implement the upload service ...................................................................... T169045: Implement the upload service '/upload' REST endpoint of the primary sources tool ingestion API. This allows a third-party data provider to upload its dataset via a POST request. The dataset undergoes Wikidata RDF data model validation and its valid subset is uploaded to a Blazegraph instance through the Blazegraph data loader service. If the dataset does not pass RDF syntax checking, the service returns a bad request response. Change-Id: I33dde366ee41a118aeec5c863f9157a9ab31b291 --- M tools/pom.xml A tools/src/main/java/org/wikidata/query/rdf/tool/primarysources/WikibaseDataModelValidator.java A tools/src/main/java/org/wikidata/query/rdf/tool/primarysources/ingestion/UploadServlet.java A tools/src/test/java/org/wikidata/query/rdf/tool/primarysources/WikibaseDataModelValidatorUnitTest.java A tools/src/test/java/org/wikidata/query/rdf/tool/primarysources/ingestion/IngestionAPIIntegrationTest.java A tools/src/test/resources/bad_chuck_berry.ttl A tools/src/test/resources/good_chuck_berry.ttl A tools/src/test/resources/just_bad_rdf.ttl M war/pom.xml M war/src/main/webapp/WEB-INF/web.xml 10 files changed, 1,269 insertions(+), 0 deletions(-) git pull ssh://gerrit.wikimedia.org:29418/wikidata/query/rdf refs/changes/53/365253/1 diff --git a/tools/pom.xml b/tools/pom.xml index faeca64..da95ae0 100644 --- a/tools/pom.xml +++ b/tools/pom.xml @@ -55,8 +55,18 @@ <artifactId>nanohttpd</artifactId> </dependency> <dependency> + <groupId>commons-fileupload</groupId> + <artifactId>commons-fileupload</artifactId> + <version>1.3.1</version> + </dependency> + <dependency> <groupId>io.dropwizard.metrics</groupId> <artifactId>metrics-core</artifactId> + </dependency> + <dependency> + <groupId>javax.servlet</groupId> + <artifactId>javax.servlet-api</artifactId> + <version>3.1.0</version> </dependency> <dependency> <groupId>org.apache.commons</groupId> @@ -120,6 +130,12 @@ <scope>runtime</scope> </dependency> <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpmime</artifactId> + <version>4.4</version> + <scope>test</scope> + </dependency> + <dependency> <groupId>org.hamcrest</groupId> <artifactId>hamcrest-core</artifactId> <scope>test</scope> diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/primarysources/WikibaseDataModelValidator.java b/tools/src/main/java/org/wikidata/query/rdf/tool/primarysources/WikibaseDataModelValidator.java new file mode 100644 index 0000000..153283b --- /dev/null +++ b/tools/src/main/java/org/wikidata/query/rdf/tool/primarysources/WikibaseDataModelValidator.java @@ -0,0 +1,443 @@ +package org.wikidata.query.rdf.tool.primarysources; + +import com.google.common.collect.ImmutableMap; +import org.openrdf.model.Model; +import org.openrdf.model.Statement; +import org.openrdf.model.URI; +import org.openrdf.model.Value; +import org.openrdf.model.impl.TreeModel; +import org.openrdf.rio.RDFFormat; +import org.openrdf.rio.RDFParseException; +import org.openrdf.rio.Rio; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.wikidata.query.rdf.common.uri.Provenance; +import org.wikidata.query.rdf.common.uri.WikibaseUris; + +import java.io.IOException; +import java.io.InputStream; +import java.net.HttpURLConnection; +import java.net.URL; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import 
java.util.regex.Pattern;
+
+/**
+ * Validates a given dataset against the <a href="https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model">Wikidata RDF data model</a>.
+ * The dataset undergoes an RDF syntax check first, then the actual data model validation.
+ *
+ * @author Marco Fossati - User:Hjfocs
+ * @since 0.2.4
+ * Created on Jun 19, 2017.
+ */
+public class WikibaseDataModelValidator {
+
+    /**
+     * Map of regular expressions that validate the following Wikidata terms:
+     * <ul>
+     * <li>item, e.g., <code>Q9521</code>;</li>
+     * <li>property, e.g., <code>P18</code>;</li>
+     * <li>reified statement, e.g., <code>Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34</code>;</li>
+     * <li>reified reference, e.g., <code>288ab581e7d2d02995a26dfa8b091d96e78457fc</code>.</li>
+     * </ul>
+     */
+    public static final Map<String, Pattern> TERM_VALIDATORS = ImmutableMap.of(
+            "item", Pattern.compile("^Q\\d+$"),
+            "property", Pattern.compile("^P\\d+$"),
+            "statement", Pattern.compile("^Q\\d+-[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}$"),
+            "reference", Pattern.compile("^[0-9a-f]{40}$"));
+
+    /**
+     * The set of Wikidata namespaces.
+     */
+    static final WikibaseUris VALID_NAMESPACES = WikibaseUris.WIKIDATA;
+
+    /**
+     * An edit distance of at most 3 characters marks a resource as a probable typo, hence invalid.
+     */
+    private static final int EDIT_DISTANCE_THRESHOLD = 3;
+    private static final Logger log = LoggerFactory.getLogger(WikibaseDataModelValidator.class);
+
+    /**
+     * Check the RDF syntax correctness of a given dataset.
+     * Note that parsing is done in memory over the whole dataset.
+     *
+     * @param dataset - the input stream of the dataset to check
+     * @param baseURI - the base URI
+     * @param format - the RDF format used to serialize the input dataset
+     * @return the successfully parsed RDF {@link Model}
+     * @throws IOException - if there are troubles reading the input stream
+     * @throws RDFParseException - if the dataset is not valid RDF
+     */
+    public Model checkSyntax(InputStream dataset, String baseURI, RDFFormat format) throws IOException, RDFParseException {
+        return Rio.parse(dataset, baseURI, format);
+    }
+
+    /**
+     * Validate an RDF triple with a Wikidata Item as subject.
+     * Example:
+     * Chuck Berry has image reified node
+     * wd:Q5921 p:P18 wds:Q5921-{uuid}
+     * Taken from https://www.wikidata.org/wiki/Special:EntityData/Q5921.ttl
+     *
+     * @param itemTriple - the Item triple to be validated
+     * @return a {@link List} of invalid triple components or empty if everything is valid
+     */
+    public List<String> validateItemTriple(Statement itemTriple) {
+        List<String> invalid = new ArrayList<>();
+        String subject = itemTriple.getSubject().stringValue();
+        if (!isValidTripleComponent(subject, VALID_NAMESPACES.entity(), "item")) {
+            invalid.add(subject);
+        }
+        String predicate = itemTriple.getPredicate().stringValue();
+        if (!isValidTripleComponent(predicate, VALID_NAMESPACES.property(WikibaseUris.PropertyType.CLAIM), "property")) {
+            invalid.add(predicate);
+        }
+        String object = itemTriple.getObject().stringValue();
+        if (!isValidTripleComponent(object, VALID_NAMESPACES.statement(), "statement")) {
+            invalid.add(object);
+        }
+        return invalid;
+    }
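+
+    // Example usage (illustrative; "in" is assumed to be a Turtle InputStream):
+    //   WikibaseDataModelValidator validator = new WikibaseDataModelValidator();
+    //   Model parsed = validator.checkSyntax(in, "https://www.wikidata.org", RDFFormat.TURTLE);
+    //   List<String> issues = validator.validateItemTriple(parsed.iterator().next());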
+
+    /**
+     * Validate an RDF triple with a reified statement as subject.
+     * Example:
+     * reified node has image Commons URL
+     * wds:Q5921-{uuid} ps:P18 <http://commons.wikimedia.org/wiki/Special:FilePath/Chuck-berry-2007-07-18.jpg>
+     * Taken from https://www.wikidata.org/wiki/Special:EntityData/Q5921.ttl
+     *
+     * @param propertyTriple - the property triple to be validated
+     * @return a {@link List} of invalid triple components or empty if everything is valid
+     */
+    public List<String> validatePropertyTriple(Statement propertyTriple) {
+        List<String> invalid = new ArrayList<>();
+        String subject = propertyTriple.getSubject().stringValue();
+        if (!isValidTripleComponent(subject, VALID_NAMESPACES.statement(), "statement")) {
+            invalid.add(subject);
+        }
+        String predicate = propertyTriple.getPredicate().stringValue();
+        if (!isValidTripleComponent(predicate, VALID_NAMESPACES.property(WikibaseUris.PropertyType.STATEMENT), "property")) {
+            invalid.add(predicate);
+        }
+        // The object can be a literal, an Item, or a URL
+        Value object = propertyTriple.getObject();
+        if (object instanceof URI) {
+            String objectString = object.stringValue();
+            if (!isValidTripleComponent(objectString, VALID_NAMESPACES.entity(), "item")) {
+                // Not an Item: if it starts with "http://www.wikidata.org/", an invalid Wikidata resource was probably used
+                if (objectString.startsWith(VALID_NAMESPACES.root() + "/")) {
+                    log.error("Probably a Wikidata term, but not an Item: {}", objectString);
+                    invalid.add(objectString);
+                } else {
+                    // Check if it's a typo via the edit distance between the current namespace and the valid one
+                    int distance = computeNamespaceDistance(objectString, VALID_NAMESPACES.entity(), "item");
+                    if (distance <= EDIT_DISTANCE_THRESHOLD) {
+                        log.error("Probably a typo: {}", objectString);
+                        invalid.add(objectString);
+                    } else {
+                        // The edit distance is above the threshold: probably a URL, check if it resolves
+                        int responseCode = 200;
+                        try {
+                            responseCode = resolve(objectString);
+                        } catch (IOException ioe) {
+                            // Doesn't resolve because of lower-level issues
+                            log.error("Not resolvable: {} Reason: {}", objectString, ioe.getClass().getSimpleName());
+                            invalid.add(objectString);
+                        }
+                        // Allow the success (2xx) and redirection (3xx) code ranges
+                        if (responseCode >= 400) {
+                            log.error("Not resolvable: {} HTTP error code: {}", objectString, responseCode);
+                            invalid.add(objectString);
+                        }
+                    }
+                }
+            }
+        }
+        return invalid;
+    }
+
+    /**
+     * Validate an RDF triple that involves a reference.
+     * Example:
+     * reified node has reference reified reference
+     * wds:Qn-uuid prov:wasDerivedFrom wdref:{hash}
+     * Taken from https://www.wikidata.org/wiki/Special:EntityData/Q5921.ttl
+     *
+     * @param referenceTriple - the reference triple to be validated
+     * @return a {@link List} of invalid triple components or empty if everything is valid
+     */
+    public List<String> validateReferenceTriple(Statement referenceTriple) {
+        List<String> invalid = new ArrayList<>();
+        String subject = referenceTriple.getSubject().stringValue();
+        if (!isValidTripleComponent(subject, VALID_NAMESPACES.statement(), "statement")) {
+            invalid.add(subject);
+        }
+        String predicate = referenceTriple.getPredicate().stringValue();
+        if (!predicate.equals(Provenance.WAS_DERIVED_FROM)) {
+            invalid.add(predicate);
+        }
+        String object = referenceTriple.getObject().stringValue();
+        if (!isValidTripleComponent(object, VALID_NAMESPACES.reference(), "reference")) {
+            invalid.add(object);
+        }
+        return invalid;
+    }
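+
+    // Worked example (illustrative) for the URL-object handling in validatePropertyTriple above,
+    // using values from the test datasets:
+    //   <http://commons.wikimedia.org/wiki/Special:FilePath/Chuck-berry-2007-07-18.jpg> lies outside
+    //   every Wikidata namespace and resolves with a 2xx/3xx status code, so it is kept;
+    //   <http://this.leads.to.nowhere> does not resolve, so it is flagged as invalid.
+    // Note that this check requires network access at validation time.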
+
+    /**
+     * Validate an RDF triple that involves a qualifier.
+     * Example:
+     * reified node has media legend literal
+     * wds:Q666-{uuid} pq:P2096 "Chuck Berry (2007)"@ca
+     * Taken from https://www.wikidata.org/wiki/Special:EntityData/Q5921.ttl
+     *
+     * @param qualifierTriple - the qualifier triple to be validated
+     * @return a {@link List} of invalid triple components or empty if everything is valid
+     */
+    public List<String> validateQualifierTriple(Statement qualifierTriple) {
+        List<String> invalid = new ArrayList<>();
+        String subject = qualifierTriple.getSubject().stringValue();
+        if (!isValidTripleComponent(subject, VALID_NAMESPACES.statement(), "statement")) {
+            invalid.add(subject);
+        }
+        String predicate = qualifierTriple.getPredicate().stringValue();
+        if (!isValidTripleComponent(predicate, VALID_NAMESPACES.property(WikibaseUris.PropertyType.QUALIFIER), "property")) {
+            invalid.add(predicate);
+        }
+        // The object can be a literal or an Item
+        Value object = qualifierTriple.getObject();
+        if (object instanceof URI) {
+            String objectString = object.stringValue();
+            if (!isValidTripleComponent(objectString, VALID_NAMESPACES.entity(), "item")) {
+                invalid.add(objectString);
+            }
+        }
+        return invalid;
+    }
+
+    /**
+     * Validate an RDF triple that contains the reference value.
+     * Example:
+     * reified reference was imported from Russian Wikipedia
+     * wdref:{hash} pr:P143 wd:Q206855
+     *
+     * @param referenceValueTriple - the reference value triple to be validated
+     * @return a {@link List} of invalid triple components or empty if everything is valid
+     */
+    public List<String> validateReferenceValueTriple(Statement referenceValueTriple) {
+        List<String> invalid = new ArrayList<>();
+        String subject = referenceValueTriple.getSubject().stringValue();
+        if (!isValidTripleComponent(subject, VALID_NAMESPACES.reference(), "reference")) {
+            invalid.add(subject);
+        }
+        String predicate = referenceValueTriple.getPredicate().stringValue();
+        if (!isValidTripleComponent(predicate, VALID_NAMESPACES.property(WikibaseUris.PropertyType.REFERENCE), "property")) {
+            invalid.add(predicate);
+        }
+        // The object can be an Item or a URL
+        String object = referenceValueTriple.getObject().stringValue();
+        if (!isValidTripleComponent(object, VALID_NAMESPACES.entity(), "item")) {
+            // Not an Item: if it starts with "http://www.wikidata.org/", an invalid Wikidata resource was probably used
+            if (object.startsWith(VALID_NAMESPACES.root() + "/")) {
+                log.error("Probably a Wikidata term, but not an Item: {}", object);
+                invalid.add(object);
+            } else {
+                // Check if it's a typo via the edit distance between the current namespace and the valid one
+                int distance = computeNamespaceDistance(object, VALID_NAMESPACES.entity(), "item");
+                if (distance <= EDIT_DISTANCE_THRESHOLD) {
+                    log.error("Probably a typo: {}", object);
+                    invalid.add(object);
+                } else {
+                    // The edit distance is above the threshold: probably a URL, check if it resolves
+                    int responseCode = 200;
+                    try {
+                        responseCode = resolve(object);
+                    } catch (IOException ioe) {
+                        // Doesn't resolve because of lower-level issues
+                        log.error("Not resolvable: {} Reason: {}", object, ioe.getClass().getSimpleName());
+                        invalid.add(object);
+                    }
+                    // Allow the success (2xx) and redirection (3xx) code ranges
+                    if (responseCode >= 400) {
+                        log.error("Not resolvable: {} HTTP error code: {}", object, responseCode);
+                        invalid.add(object);
+                    }
+                }
+            }
+        }
+        return invalid;
+    }
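+
+    // End-to-end sketch (illustrative) for handleDataset below, assuming "parsed" comes from checkSyntax:
+    //   AbstractMap.SimpleImmutableEntry<Model, List<String>> result = validator.handleDataset(parsed);
+    //   Model toUpload = result.getKey();          // valid triples only
+    //   List<String> rejected = result.getValue(); // invalid components, reported back to the client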
+
+    /**
+     * Validate the given dataset, remove invalid triples, and log the list of invalid components.
+     *
+     * @param dataset - the RDF dataset to be validated, which has already undergone the syntax check
+     * @return a subset of the input dataset, pruned of invalid triples
+     */
+    public AbstractMap.SimpleImmutableEntry<Model, List<String>> handleDataset(Model dataset) {
+        Model valid = new TreeModel();
+        List<String> invalid = new ArrayList<>();
+        Iterator<Statement> statementIterator = dataset.iterator();
+        while (statementIterator.hasNext()) {
+            Statement statement = statementIterator.next();
+            handleSubject(valid, invalid, statement);
+        }
+        if (invalid.isEmpty()) {
+            log.info("Your dataset is valid and will be fully uploaded");
+        } else {
+            log.warn("Your dataset has issues, only valid triples will be uploaded. List of invalid triples: {}", invalid);
+        }
+        return new AbstractMap.SimpleImmutableEntry<>(valid, invalid);
+    }
+
+    /**
+     * Dispatch the validation based on the triple subject.
+     */
+    private void handleSubject(Model valid, List<String> invalid, Statement statement) {
+        List<String> currentInvalid;
+        String subject = statement.getSubject().stringValue();
+        if (subject.startsWith(WikibaseDataModelValidator.VALID_NAMESPACES.entity() + "Q")) {
+            currentInvalid = validateItemTriple(statement);
+            if (currentInvalid.isEmpty()) {
+                valid.add(statement);
+            } else {
+                invalid.addAll(currentInvalid);
+            }
+        } else if (subject.startsWith(WikibaseDataModelValidator.VALID_NAMESPACES.statement())) {
+            handlePredicate(valid, invalid, statement);
+        } else if (subject.startsWith(WikibaseDataModelValidator.VALID_NAMESPACES.reference())) {
+            currentInvalid = validateReferenceValueTriple(statement);
+            if (currentInvalid.isEmpty()) {
+                valid.add(statement);
+            } else {
+                invalid.addAll(currentInvalid);
+            }
+        } else {
+            log.error("Invalid triple: {}", statement);
+            invalid.add(statement.toString());
+        }
+    }
+
+    /**
+     * Dispatch the validation based on the triple predicate.
+     */
+    private void handlePredicate(Model valid, List<String> invalid, Statement statement) {
+        List<String> currentInvalid;
+        String predicate = statement.getPredicate().stringValue();
+        if (predicate.startsWith(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.STATEMENT))) {
+            currentInvalid = validatePropertyTriple(statement);
+            if (currentInvalid.isEmpty()) {
+                valid.add(statement);
+            } else {
+                invalid.addAll(currentInvalid);
+            }
+        } else if (predicate.equals(Provenance.WAS_DERIVED_FROM)) {
+            currentInvalid = validateReferenceTriple(statement);
+            if (currentInvalid.isEmpty()) {
+                valid.add(statement);
+            } else {
+                invalid.addAll(currentInvalid);
+            }
+        } else if (predicate.startsWith(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.QUALIFIER))) {
+            currentInvalid = validateQualifierTriple(statement);
+            if (currentInvalid.isEmpty()) {
+                valid.add(statement);
+            } else {
+                invalid.addAll(currentInvalid);
+            }
+        } else {
+            log.error("Invalid triple: {}", statement);
+            invalid.add(statement.toString());
+        }
+    }
+
+    /**
+     * Perform an HTTP GET to check that the given resource is resolvable on the Internet.
+     *
+     * @throws IOException - when there are troubles resolving the resource, typically due to a malformed URL
+     */
+    private int resolve(String resource) throws IOException {
+        URL url = new URL(resource);
+        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
+        connection.setRequestMethod("GET");
+        connection.connect();
+        return connection.getResponseCode();
+    }
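+
+    // Illustrative check: for the component "http://www.wikidata.org/entity/Q5921", the expected
+    // namespace entity() and the "item" validator leave the remainder "Q5921", which matches
+    // ^Q\d+$, so isValidTripleComponent below returns true.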
+
+    /**
+     * Validate the given triple component.
+     */
+    private boolean isValidTripleComponent(String tripleComponent, String expectedNamespace, String expectedTerm) {
+        if (tripleComponent.startsWith(expectedNamespace)) {
+            String term = tripleComponent.substring(expectedNamespace.length());
+            return isValidTerm(term, expectedTerm);
+        } else {
+            return false;
+        }
+    }
+
+    /**
+     * Validate the given resource term.
+     */
+    private boolean isValidTerm(String term, String expectedTerm) {
+        Pattern regex = TERM_VALIDATORS.get(expectedTerm);
+        Matcher matcher = regex.matcher(term);
+        return matcher.matches();
+    }
+
+    /**
+     * The edit distance of a given resource is computed between its namespace and a valid one.
+     * Note that the term is not involved here.
+     */
+    private int computeNamespaceDistance(String resource, String expectedNamespace, String expectedTerm) {
+        Pattern pattern = TERM_VALIDATORS.get(expectedTerm);
+        // Don't match start-of-line + end-of-line
+        String regex = pattern.pattern().replace("^", "").replace("$", "");
+        String[] split = resource.split(regex);
+        String namespace = split[0];
+        return levenshteinDistance(namespace, expectedNamespace);
+    }
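+
+    // Worked example (illustrative), using the typo namespace from the unit tests:
+    //   levenshteinDistance("http://www.wikidata.orge/entiti/", "http://www.wikidata.org/entity/") == 2
+    //   (one deletion, one substitution), which is within EDIT_DISTANCE_THRESHOLD, so such a
+    //   resource is flagged as a probable typo instead of being treated as an external URL.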
+
+    /**
+     * Levenshtein distance implementation.
+     * Copied from https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Java
+     * Rephrased to fit the code style.
+     */
+    private int levenshteinDistance(CharSequence lhs, CharSequence rhs) {
+        int len0 = lhs.length() + 1;
+        int len1 = rhs.length() + 1;
+        // The array of distances
+        int[] cost = new int[len0];
+        int[] newcost = new int[len0];
+        // Initial cost of skipping prefix in String s0
+        for (int i = 0; i < len0; i++) cost[i] = i;
+        // Dynamically computing the array of distances
+        // Transformation cost for each letter in s1
+        for (int j = 1; j < len1; j++) {
+            // Initial cost of skipping prefix in String s1
+            newcost[0] = j;
+            // Transformation cost for each letter in s0
+            for (int i = 1; i < len0; i++) {
+                // Matching current letters in both strings
+                int match = (lhs.charAt(i - 1) == rhs.charAt(j - 1)) ? 0 : 1;
+                // Computing cost for each transformation
+                int costReplace = cost[i - 1] + match;
+                int costInsert = cost[i] + 1;
+                int costDelete = newcost[i - 1] + 1;
+                // Keep minimum cost
+                newcost[i] = Math.min(Math.min(costInsert, costDelete), costReplace);
+            }
+            // Swap cost/newcost arrays
+            int[] swap = cost;
+            cost = newcost;
+            newcost = swap;
+        }
+        // The distance is the cost for transforming all letters in both strings
+        return cost[len0 - 1];
+    }
+
+}
diff --git a/tools/src/main/java/org/wikidata/query/rdf/tool/primarysources/ingestion/UploadServlet.java b/tools/src/main/java/org/wikidata/query/rdf/tool/primarysources/ingestion/UploadServlet.java
new file mode 100644
index 0000000..d28df9e
--- /dev/null
+++ b/tools/src/main/java/org/wikidata/query/rdf/tool/primarysources/ingestion/UploadServlet.java
@@ -0,0 +1,264 @@
+package org.wikidata.query.rdf.tool.primarysources.ingestion;
+
+import com.google.common.io.Resources;
+import org.apache.commons.fileupload.FileItemIterator;
+import org.apache.commons.fileupload.FileItemStream;
+import org.apache.commons.fileupload.FileUploadException;
+import org.apache.commons.fileupload.servlet.ServletFileUpload;
+import org.apache.commons.fileupload.util.Streams;
+import org.openrdf.model.Model;
+import org.openrdf.rio.RDFFormat;
+import org.openrdf.rio.RDFHandlerException;
+import org.openrdf.rio.RDFParseException;
+import org.openrdf.rio.Rio;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.wikidata.query.rdf.tool.primarysources.WikibaseDataModelValidator;
+
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+import java.io.BufferedReader;
+import java.io.DataOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.PrintWriter;
+import java.net.HttpURLConnection;
+import java.net.URISyntaxException;
+import java.net.URL;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.util.AbstractMap;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Properties;
+
+/**
+ * Implements the '/upload' service of the primary sources tool ingestion API:
+ * a third-party data provider POSTs a dataset, which is validated against the
+ * Wikidata RDF data model and bulk-loaded into Blazegraph.
+ *
+ * @author Marco Fossati - User:Hjfocs
+ * @since 0.2.4
+ * Created on Jul 04, 2017.
+ */
+@SuppressWarnings("checkstyle:classfanoutcomplexity")
+public class UploadServlet extends HttpServlet {
+
+    /**
+     * The data provider should not care about the base URI. A constant is used instead.
+     */
+    private static final String BASE_URI = "https://www.wikidata.org";
+    /**
+     * Turtle is the default, being the least verbose RDF format.
+     */
+    private static final RDFFormat DEFAULT_RDF_FORMAT = RDFFormat.TURTLE;
+    /**
+     * The uploaded dataset must be saved to the server local file system before sending it to the Blazegraph bulk load service.
+     * See https://wiki.blazegraph.com/wiki/index.php/REST_API#Bulk_Load_Configuration
+     */
+    private static final String TEMP_DATASET_FILE_NAME = "to_be_uploaded";
+    /**
+     * Endpoint name of the Blazegraph bulk load service.
+     */
+    private static final String BLAZEGRAPH_DATA_LOADER_ENDPOINT = "/dataloader";
+    private static final Logger log = LoggerFactory.getLogger(UploadServlet.class);
+
+    /**
+     * {@link Properties} required for the Blazegraph bulk load service, to set up the database instance.
+     * See https://wiki.blazegraph.com/wiki/index.php/REST_API#Bulk_Load_Configuration
+     * <p>
+     * The data provider should not care about this.
+     */
+    private String blazegraphPropertiesLocation;
+    /**
+     * {@link Properties} required for the Blazegraph bulk load service, to set up the data loader.
+     * See https://wiki.blazegraph.com/wiki/index.php/REST_API#Bulk_Load_Configuration
+     * <p>
+     * The data provider should not care about this.
+     */
+    private Properties dataLoaderProperties;
+    /**
+     * Temporary file with the uploaded dataset, stored in the server local file system.
+     * The uploaded dataset must be saved there before sending it to the Blazegraph bulk load service.
+     * See https://wiki.blazegraph.com/wiki/index.php/REST_API#Bulk_Load_Configuration
+     */
+    private File tempDataset;
+
+    @Override
+    public void init() throws ServletException {
+        try {
+            blazegraphPropertiesLocation = Resources.getResource("RWStore.properties").toURI().getPath();
+        } catch (URISyntaxException use) {
+            log.error("Could not parse the Blazegraph properties file URI: {} Parse error at index {}", use.getInput(), use.getIndex());
+        }
+        dataLoaderProperties = new Properties();
+        dataLoaderProperties.setProperty("quiet", "false");
+        dataLoaderProperties.setProperty("verbose", "0");
+        dataLoaderProperties.setProperty("closure", "false");
+        dataLoaderProperties.setProperty("durableQueues", "true");
+        dataLoaderProperties.setProperty("com.bigdata.rdf.store.DataLoader.flush", "false");
+        dataLoaderProperties.setProperty("com.bigdata.rdf.store.DataLoader.bufferCapacity", "100000");
+        dataLoaderProperties.setProperty("com.bigdata.rdf.store.DataLoader.queueCapacity", "10");
+        dataLoaderProperties.setProperty("namespace", "wdq");
+        dataLoaderProperties.setProperty("propertyFile", blazegraphPropertiesLocation);
+    }
+
+    /**
+     * Upload an RDF dataset to Blazegraph, after Wikidata data model validation.
+     *
+     * @param request the client HTTP request
+     * @param response the servlet HTTP response
+     * @throws IOException if an input or output error is detected when the servlet handles the request
+     * @throws ServletException if the request for the POST could not be handled
+     */
+    public void doPost(HttpServletRequest request, HttpServletResponse response) throws IOException, ServletException {
+        RDFFormat format = null;
+        WikibaseDataModelValidator validator = new WikibaseDataModelValidator();
+        Model validSyntax = null;
+        // Check that we have a file upload request
+        boolean isMultipart = ServletFileUpload.isMultipartContent(request);
+        if (isMultipart) {
+            // Create a new file upload handler
+            ServletFileUpload upload = new ServletFileUpload();
+            // Parse the request
+            FileItemIterator iter;
+            try {
+                iter = upload.getItemIterator(request);
+                while (iter.hasNext()) {
+                    FileItemStream item = iter.next();
+                    String name = item.getFieldName();
+                    InputStream stream = item.openStream();
+                    if (item.isFormField()) {
+                        log.info("Form field {} with value {} detected.", name, Streams.asString(stream));
+                    } else {
+                        String fileName = item.getName();
+                        log.info("File field {} with file name {} detected.", name, fileName);
+                        /*
+                         * Guess the RDF format based on the file name extension.
+                         * This is the only solution, as the content type is multipart/form-data.
+                         * Fall back to Turtle if the guess fails, as we cannot blame the user for uploading proper content with an arbitrary (or no) extension.
+                         */
+                        format = Rio.getParserFormatForFileName(fileName, DEFAULT_RDF_FORMAT);
+                        // 1.
Validate syntax
+                        try {
+                            validSyntax = validator.checkSyntax(stream, BASE_URI, format);
+                        } catch (RDFParseException rpe) {
+                            log.error("The dataset is not valid RDF. Error at line {}, column {}. Will fail with a bad request",
+                                    rpe.getLineNumber(), rpe.getColumnNumber());
+                            response.sendError(HttpServletResponse.SC_BAD_REQUEST, "Your dataset is not valid RDF. Found an error at line "
+                                    + rpe.getLineNumber() + ", column " + rpe.getColumnNumber() + ". Please fix it and try again");
+                            return;
+                        }
+                    }
+                }
+            } catch (FileUploadException fue) {
+                log.error("Failed reading/parsing the request or storing files", fue);
+                response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, fue.getLocalizedMessage());
+                return;
+            }
+        } else {
+            String actualContentType = request.getContentType();
+            log.error("Not a multipart content type: {} Will fail with a bad request", actualContentType);
+            response.sendError(HttpServletResponse.SC_BAD_REQUEST, "You should upload your dataset as a file using multipart/form-data content type, not "
+                    + actualContentType + ". Please fix your HTTP request and try again.");
+            return;
+        }
+
+        // 2. Validate the data model
+        AbstractMap.SimpleImmutableEntry<Model, List<String>> validated = validator.handleDataset(validSyntax);
+        Model toBeUploaded = validated.getKey();
+        List<String> invalid = validated.getValue();
+        // The data loader needs a file stored on the server local file system
+        try {
+            // Set a suitable extension based on the RDF format
+            tempDataset = File.createTempFile(TEMP_DATASET_FILE_NAME, "." + format.getDefaultFileExtension());
+            Rio.write(toBeUploaded, Files.newBufferedWriter(tempDataset.toPath(), StandardCharsets.UTF_8), format);
+        } catch (RDFHandlerException rhe) {
+            log.error("Failed writing RDF", rhe);
+            response.sendError(HttpServletResponse.SC_INTERNAL_SERVER_ERROR, rhe.getLocalizedMessage());
+            return;
+        }
+        // 3. Send the dataset to the Blazegraph bulk load service
+        AbstractMap.SimpleImmutableEntry<Integer, List<String>> dataLoaderResponse = sendDatasetToDataLoader(request);
+        // 4. Send the final response, including the list of invalid triples and the response content from the bulk load service
+        sendResponse(response, invalid, dataLoaderResponse);
+    }
+
+    @Override
+    public void destroy() {
+        // The temporary dataset may never have been created, e.g., if no upload was handled
+        if (tempDataset != null) {
+            tempDataset.delete();
+        }
+    }
+
+    /**
+     * Build the final servlet response, combining the invalid triple components with the data loader output.
+     *
+     * @param response the servlet HTTP response
+     * @param invalid the list of invalid data
+     * @param dataLoaderResponse the Blazegraph data loader servlet HTTP response
+     * @throws IOException if an error occurs while getting the response output writer
+     */
+    private void sendResponse(HttpServletResponse response, List<String> invalid, AbstractMap.SimpleImmutableEntry<Integer, List<String>> dataLoaderResponse)
+            throws IOException {
+        // The final response code is the data loader one
+        int dataLoaderResponseCode = dataLoaderResponse.getKey();
+        List<String> dataLoaderResponseContent = dataLoaderResponse.getValue();
+        response.setStatus(dataLoaderResponseCode);
+        response.setContentType("text/plain");
+        response.setCharacterEncoding(StandardCharsets.UTF_8.name());
+        PrintWriter pw = response.getWriter();
+        for (String invalidComponent : invalid) {
+            pw.println(invalidComponent);
+        }
+        if (!dataLoaderResponseContent.isEmpty()) {
+            for (String dataLoaderResponseLine : dataLoaderResponseContent) {
+                pw.println(dataLoaderResponseLine);
+            }
+        }
+        pw.close();
+    }
+
+    /**
+     * Send the uploaded dataset to the Blazegraph bulk load service, firing a POST with the required request.
+ * See https://wiki.blazegraph.com/wiki/index.php/REST_API#Bulk_Load_Configuration + * Alternative solutions may be: + * B. use a HttpServletRequestWrapper, i.e., wrapper = new HttpServletRequestWrapper(request); + * C. use a Filter + * @param request the client HTTP request + * @throws IOException if an input or output error is detected when the client sends the request to the data loader servlet + */ + private AbstractMap.SimpleImmutableEntry<Integer, List<String>> sendDatasetToDataLoader(HttpServletRequest request) throws IOException { + List<String> responseContent = new ArrayList<>(); + URL url = new URL(request.getRequestURL().toString().replace(request.getServletPath(), BLAZEGRAPH_DATA_LOADER_ENDPOINT)); + HttpURLConnection connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("POST"); + connection.setRequestProperty("Content-Type", "text/plain"); + connection.setDoOutput(true); + OutputStream os = connection.getOutputStream(); + DataOutputStream dos = new DataOutputStream(os); + dataLoaderProperties.setProperty("fileOrDirs", tempDataset.getPath()); + dataLoaderProperties.store(dos, "Expected properties for the Blazegraph data loader service"); + dos.close(); + // Check that everything went fine + int responseCode = connection.getResponseCode(); + InputStream responseStream; + // Get the data loader response only if it went wrong + if (responseCode == HttpServletResponse.SC_OK) { + log.info("The dataset ingestion into Blazegraph went fine"); + } else { + log.error("Failed ingesting the dataset into Blazegraph, HTTP error code: {}", responseCode); + responseStream = connection.getErrorStream(); + BufferedReader responseReader = new BufferedReader(new InputStreamReader(responseStream, StandardCharsets.UTF_8)); + String line; + while ((line = responseReader.readLine()) != null) { + responseContent.add(line); + } + responseReader.close(); + } + connection.disconnect(); + return new AbstractMap.SimpleImmutableEntry<>(responseCode, responseContent); + } +} diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/primarysources/WikibaseDataModelValidatorUnitTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/primarysources/WikibaseDataModelValidatorUnitTest.java new file mode 100644 index 0000000..7b20715 --- /dev/null +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/primarysources/WikibaseDataModelValidatorUnitTest.java @@ -0,0 +1,330 @@ +package org.wikidata.query.rdf.tool.primarysources; + +import com.carrotsearch.randomizedtesting.RandomizedRunner; +import com.carrotsearch.randomizedtesting.RandomizedTest; +import com.google.common.io.Resources; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.openrdf.model.Model; +import org.openrdf.model.Resource; +import org.openrdf.model.Statement; +import org.openrdf.model.URI; +import org.openrdf.model.Value; +import org.openrdf.model.ValueFactory; +import org.openrdf.model.impl.TreeModel; +import org.openrdf.model.impl.ValueFactoryImpl; +import org.openrdf.model.util.Models; +import org.openrdf.rio.RDFFormat; +import org.openrdf.rio.RDFParseException; +import org.openrdf.rio.Rio; +import org.wikidata.query.rdf.common.uri.Provenance; +import org.wikidata.query.rdf.common.uri.WikibaseUris; + +import java.io.IOException; +import java.io.InputStream; +import java.io.UnsupportedEncodingException; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.ArrayList; +import 
java.util.Arrays; +import java.util.Locale; +import java.util.UUID; + +/** + * @author Marco Fossati - User:Hjfocs + * @since 0.2.4 + * Created on Jun 19, 2017. + */ +@RunWith(RandomizedRunner.class) +public class WikibaseDataModelValidatorUnitTest extends RandomizedTest { + + private static final String GOOD_DATASET_FILE_NAME = "good_chuck_berry.ttl"; // Valid data model + private static final String BAD_DATASET_FILE_NAME = "bad_chuck_berry.ttl"; // Invalid data model + private static final String BAD_RDF_FILE_NAME = "just_bad_rdf.ttl"; // Invalid RDF + private static final String BASE_URI = "http://test.dataset"; + private static final RDFFormat RDF_FORMAT = RDFFormat.TURTLE; + + private static WikibaseDataModelValidator validator; + private static InputStream goodDataset; + private static InputStream badDataset; + private static Model goodParsedDataset; + private static Model badParsedDataset; + + @BeforeClass + public static void setUpOnce() throws IOException, RDFParseException { + validator = new WikibaseDataModelValidator(); + goodDataset = openTestDatasetStream(GOOD_DATASET_FILE_NAME); + goodParsedDataset = Rio.parse(goodDataset, BASE_URI, RDF_FORMAT); + badDataset = openTestDatasetStream(BAD_DATASET_FILE_NAME); + badParsedDataset = Rio.parse(badDataset, BASE_URI, RDF_FORMAT); + } + + @AfterClass + public static void tearDownOnce() throws IOException { + goodDataset.close(); + badDataset.close(); + } + + private static InputStream openTestDatasetStream(String fileName) throws IOException { + return Resources.asByteSource( + Resources.getResource(fileName)) + .openBufferedStream(); + } + + @Test + public void testGoodSyntax() throws Exception { + InputStream goodRDF = openTestDatasetStream(GOOD_DATASET_FILE_NAME); + Model shouldBeGood = validator.checkSyntax(goodRDF, BASE_URI, RDF_FORMAT); + goodRDF.close(); + assertNotNull(shouldBeGood); + assertEquals(goodParsedDataset, shouldBeGood); + } + + @Test(expected = RDFParseException.class) + public void testBadSyntax() throws Exception { + InputStream badRDF = openTestDatasetStream(BAD_RDF_FILE_NAME); + validator.checkSyntax(badRDF, BASE_URI, RDF_FORMAT); + badRDF.close(); + } + + @Test + public void testValidateItemTriple() throws Exception { + ValueFactory vf = ValueFactoryImpl.getInstance(); + + // Invalid triple components + Resource invalidSubject = vf.createURI("http://not.a.wikidata.item"); + URI invalidProperty = vf.createURI("http://quite.invalid.wikidata.property"); + Value invalidObject = vf.createURI("http://im.no.reified.statement.node"); + + // Valid triple components + Resource validSubject = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.entity() + "Q666"); + URI validProperty = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.CLAIM) + "P88"); + Value validObject = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.statement() + "Q666-" + UUID.randomUUID().toString()); + + // Combine valid and invalid components into a set of test triples + Statement totallyInvalid = vf.createStatement(invalidSubject, invalidProperty, invalidObject); + Statement totallyValid = vf.createStatement(validSubject, validProperty, validObject); + Statement withInvalidSubject = vf.createStatement(invalidSubject, validProperty, validObject); + Statement withInvalidProperty = vf.createStatement(validSubject, invalidProperty, validObject); + Statement withInvalidObject = vf.createStatement(validSubject, validProperty, invalidObject); + Statement withInvalidSubjectAndProperty = 
vf.createStatement(invalidSubject, invalidProperty, validObject); + Statement withInvalidPropertyAndObject = vf.createStatement(validSubject, invalidProperty, invalidObject); + Statement withInvalidSubjectAndObject = vf.createStatement(invalidSubject, validProperty, invalidObject); + + assertEquals(Arrays.asList(invalidSubject.stringValue(), invalidProperty.stringValue(), invalidObject.stringValue()), + validator.validateItemTriple(totallyInvalid)); + assertEquals(new ArrayList<>(), validator.validateItemTriple(totallyValid)); + assertEquals(Arrays.asList(invalidSubject.stringValue()), validator.validateItemTriple(withInvalidSubject)); + assertEquals(Arrays.asList(invalidProperty.stringValue()), validator.validateItemTriple(withInvalidProperty)); + assertEquals(Arrays.asList(invalidObject.stringValue()), validator.validateItemTriple(withInvalidObject)); + assertEquals(Arrays.asList(invalidSubject.stringValue(), invalidProperty.stringValue()), validator.validateItemTriple(withInvalidSubjectAndProperty)); + assertEquals(Arrays.asList(invalidProperty.stringValue(), invalidObject.stringValue()), validator.validateItemTriple(withInvalidPropertyAndObject)); + assertEquals(Arrays.asList(invalidSubject.stringValue(), invalidObject.stringValue()), validator.validateItemTriple(withInvalidSubjectAndObject)); + } + + @Test + public void testValidatePropertyTriple() throws Exception { + ValueFactory vf = ValueFactoryImpl.getInstance(); + + // Tricky invalid triple components + Resource subjectWithInvalidUUID = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.statement() + "Q" + String.valueOf(randomIntBetween(1, + 100000)) + + "-this-is-not-a-uuid"); + URI propertyWithInvalidNamespace = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.STATEMENT_VALUE) + "P" + + String.valueOf(randomIntBetween(1, 100000))); + Value objectWithInvalidNamespace = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.value() + "Q" + String.valueOf(randomIntBetween(1, + 100000))); + Value objectWithTypo = vf.createURI("http://www.wikidata.orge/entiti/" + "Q" + String.valueOf(randomIntBetween(1, 100000))); + Value unresolvableObject = vf.createURI("http://this.leads.to.nowhere"); + + // Valid triple components + Resource validSubject = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.statement() + "Q666-" + UUID.randomUUID().toString()); + URI validProperty = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.STATEMENT) + "P" + String.valueOf( + randomIntBetween(1, 100000))); + Value validObject = vf.createLiteral(randomFloat()); + + // Combine valid and invalid components into a set of test triples + Statement totallyInvalid = vf.createStatement(subjectWithInvalidUUID, propertyWithInvalidNamespace, objectWithInvalidNamespace); + Statement totallyValid = vf.createStatement(validSubject, validProperty, validObject); + Statement withInvalidSubject = vf.createStatement(subjectWithInvalidUUID, validProperty, validObject); + Statement withInvalidProperty = vf.createStatement(validSubject, propertyWithInvalidNamespace, validObject); + Statement withInvalidNamespaceObject = vf.createStatement(validSubject, validProperty, objectWithInvalidNamespace); + Statement withTypoObject = vf.createStatement(validSubject, validProperty, objectWithTypo); + Statement withUnresolvableObject = vf.createStatement(validSubject, validProperty, unresolvableObject); + Statement withInvalidSubjectAndProperty = vf.createStatement(subjectWithInvalidUUID, 
propertyWithInvalidNamespace, validObject); + Statement withInvalidPropertyAndObject = vf.createStatement(validSubject, propertyWithInvalidNamespace, objectWithInvalidNamespace); + Statement withInvalidSubjectAndObject = vf.createStatement(subjectWithInvalidUUID, validProperty, objectWithInvalidNamespace); + + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), propertyWithInvalidNamespace.stringValue(), objectWithInvalidNamespace.stringValue()), + validator.validatePropertyTriple(totallyInvalid)); + assertEquals(new ArrayList<>(), validator.validatePropertyTriple(totallyValid)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue()), validator.validatePropertyTriple(withInvalidSubject)); + assertEquals(Arrays.asList(propertyWithInvalidNamespace.stringValue()), validator.validatePropertyTriple(withInvalidProperty)); + assertEquals(Arrays.asList(objectWithInvalidNamespace.stringValue()), validator.validatePropertyTriple(withInvalidNamespaceObject)); + assertEquals(Arrays.asList(objectWithTypo.stringValue()), validator.validatePropertyTriple(withTypoObject)); + assertEquals(Arrays.asList(unresolvableObject.stringValue()), validator.validatePropertyTriple(withUnresolvableObject)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), propertyWithInvalidNamespace.stringValue()), validator.validatePropertyTriple( + withInvalidSubjectAndProperty)); + assertEquals(Arrays.asList(propertyWithInvalidNamespace.stringValue(), objectWithInvalidNamespace.stringValue()), validator.validatePropertyTriple( + withInvalidPropertyAndObject)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), objectWithInvalidNamespace.stringValue()), validator.validatePropertyTriple( + withInvalidSubjectAndObject)); + } + + @Test + public void testValidateReferenceTriple() throws Exception { + ValueFactory vf = ValueFactoryImpl.getInstance(); + + // Tricky invalid triple components + Resource subjectWithInvalidUUID = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.statement() + "Q" + String.valueOf(randomIntBetween(1, + 100000)) + + "-this-is-not-a-uuid"); + URI invalidProperty = vf.createURI(Provenance.NAMESPACE + "wasderivedfrom"); + Value objectWithInvalidHash = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.reference() + "IdontTh1nkImag00dHash"); + + // Valid triple components + Resource validSubject = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.statement() + "Q666-" + UUID.randomUUID().toString()); + URI validProperty = vf.createURI(Provenance.WAS_DERIVED_FROM); + Value validObject = createValidReferenceNode(vf); + + // Combine valid and invalid components into a set of test triples + Statement totallyInvalid = vf.createStatement(subjectWithInvalidUUID, invalidProperty, objectWithInvalidHash); + Statement totallyValid = vf.createStatement(validSubject, validProperty, validObject); + Statement withInvalidSubject = vf.createStatement(subjectWithInvalidUUID, validProperty, validObject); + Statement withInvalidProperty = vf.createStatement(validSubject, invalidProperty, validObject); + Statement withInvalidNamespaceObject = vf.createStatement(validSubject, validProperty, objectWithInvalidHash); + Statement withInvalidSubjectAndProperty = vf.createStatement(subjectWithInvalidUUID, invalidProperty, validObject); + Statement withInvalidPropertyAndObject = vf.createStatement(validSubject, invalidProperty, objectWithInvalidHash); + Statement withInvalidSubjectAndObject = vf.createStatement(subjectWithInvalidUUID, validProperty, 
objectWithInvalidHash); + + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), invalidProperty.stringValue(), objectWithInvalidHash.stringValue()), + validator.validateReferenceTriple(totallyInvalid)); + assertEquals(new ArrayList<>(), validator.validateReferenceTriple(totallyValid)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue()), validator.validateReferenceTriple(withInvalidSubject)); + assertEquals(Arrays.asList(invalidProperty.stringValue()), validator.validateReferenceTriple(withInvalidProperty)); + assertEquals(Arrays.asList(objectWithInvalidHash.stringValue()), validator.validateReferenceTriple(withInvalidNamespaceObject)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), invalidProperty.stringValue()), validator.validateReferenceTriple( + withInvalidSubjectAndProperty)); + assertEquals(Arrays.asList(invalidProperty.stringValue(), objectWithInvalidHash.stringValue()), validator.validateReferenceTriple( + withInvalidPropertyAndObject)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), objectWithInvalidHash.stringValue()), validator.validateReferenceTriple( + withInvalidSubjectAndObject)); + + } + + @Test + public void testValidateQualifierTriple() throws Exception { + ValueFactory vf = ValueFactoryImpl.getInstance(); + + // Tricky invalid triple components + Resource subjectWithInvalidUUID = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.statement() + "Q" + String.valueOf(randomIntBetween(1, + 100000)) + + "-this-is-not-a-uuid"); + URI propertyWithInvalidNamespace = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.QUALIFIER_VALUE) + "P" + + String.valueOf(randomIntBetween(1, 100000))); + Value objectWithInvalidNamespace = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.reference() + "Q" + String.valueOf(randomIntBetween(1, + 100000))); + + // Valid triple components + Resource validSubject = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.statement() + "Q666-" + UUID.randomUUID().toString()); + URI validProperty = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.QUALIFIER) + "P" + String.valueOf( + randomIntBetween(1, 100000))); + Value validObject = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.entity() + "Q" + String.valueOf(randomIntBetween(1, 100000))); + + // Combine valid and invalid components into a set of test triples + Statement totallyInvalid = vf.createStatement(subjectWithInvalidUUID, propertyWithInvalidNamespace, objectWithInvalidNamespace); + Statement totallyValid = vf.createStatement(validSubject, validProperty, validObject); + Statement withInvalidSubject = vf.createStatement(subjectWithInvalidUUID, validProperty, validObject); + Statement withInvalidProperty = vf.createStatement(validSubject, propertyWithInvalidNamespace, validObject); + Statement withInvalidNamespaceObject = vf.createStatement(validSubject, validProperty, objectWithInvalidNamespace); + Statement withInvalidSubjectAndProperty = vf.createStatement(subjectWithInvalidUUID, propertyWithInvalidNamespace, validObject); + Statement withInvalidPropertyAndObject = vf.createStatement(validSubject, propertyWithInvalidNamespace, objectWithInvalidNamespace); + Statement withInvalidSubjectAndObject = vf.createStatement(subjectWithInvalidUUID, validProperty, objectWithInvalidNamespace); + + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), propertyWithInvalidNamespace.stringValue(), 
objectWithInvalidNamespace.stringValue()), + validator.validateQualifierTriple(totallyInvalid)); + assertEquals(new ArrayList<>(), validator.validateQualifierTriple(totallyValid)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue()), validator.validateQualifierTriple(withInvalidSubject)); + assertEquals(Arrays.asList(propertyWithInvalidNamespace.stringValue()), validator.validateQualifierTriple(withInvalidProperty)); + assertEquals(Arrays.asList(objectWithInvalidNamespace.stringValue()), validator.validateQualifierTriple(withInvalidNamespaceObject)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), propertyWithInvalidNamespace.stringValue()), validator.validateQualifierTriple( + withInvalidSubjectAndProperty)); + assertEquals(Arrays.asList(propertyWithInvalidNamespace.stringValue(), objectWithInvalidNamespace.stringValue()), validator.validateQualifierTriple( + withInvalidPropertyAndObject)); + assertEquals(Arrays.asList(subjectWithInvalidUUID.stringValue(), objectWithInvalidNamespace.stringValue()), validator.validateQualifierTriple( + withInvalidSubjectAndObject)); + } + + @Test + public void testValidateReferenceValueTriple() throws Exception { + ValueFactory vf = ValueFactoryImpl.getInstance(); + + // Tricky invalid triple components + Resource subjectWithInvalidHash = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.reference() + "IdontTh1nkImag00dHash"); + URI propertyWithInvalidNamespace = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.REFERENCE_VALUE) + "P" + + String.valueOf(randomIntBetween(1, 100000))); + Value objectWithInvalidNamespace = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.entityData() + "Q" + String.valueOf(randomIntBetween(1, + 100000))); + Value objectWithTypo = vf.createURI("http://wwww.wikidata.org/entit/" + "Q" + String.valueOf(randomIntBetween(1, 100000))); + Value unresolvableObject = vf.createURI("http://road.to.nowhere"); + + // Valid triple components + Resource validSubject = createValidReferenceNode(vf); + URI validProperty = vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.property(WikibaseUris.PropertyType.REFERENCE) + "P" + String.valueOf( + randomIntBetween(1, 100000))); + Value validObject = vf.createURI("https://en.wikipedia.org"); + + // Combine valid and invalid components into a set of test triples + Statement totallyInvalid = vf.createStatement(subjectWithInvalidHash, propertyWithInvalidNamespace, objectWithInvalidNamespace); + Statement totallyValid = vf.createStatement(validSubject, validProperty, validObject); + Statement withInvalidSubject = vf.createStatement(subjectWithInvalidHash, validProperty, validObject); + Statement withInvalidProperty = vf.createStatement(validSubject, propertyWithInvalidNamespace, validObject); + Statement withInvalidNamespaceObject = vf.createStatement(validSubject, validProperty, objectWithInvalidNamespace); + Statement withTypoObject = vf.createStatement(validSubject, validProperty, objectWithTypo); + Statement withUnresolvableObject = vf.createStatement(validSubject, validProperty, unresolvableObject); + Statement withInvalidSubjectAndProperty = vf.createStatement(subjectWithInvalidHash, propertyWithInvalidNamespace, validObject); + Statement withInvalidPropertyAndObject = vf.createStatement(validSubject, propertyWithInvalidNamespace, objectWithInvalidNamespace); + Statement withInvalidSubjectAndObject = vf.createStatement(subjectWithInvalidHash, validProperty, objectWithInvalidNamespace); + + 
assertEquals(Arrays.asList(subjectWithInvalidHash.stringValue(), propertyWithInvalidNamespace.stringValue(), objectWithInvalidNamespace.stringValue()), + validator.validateReferenceValueTriple(totallyInvalid)); + assertEquals(new ArrayList<>(), validator.validateReferenceValueTriple(totallyValid)); + assertEquals(Arrays.asList(subjectWithInvalidHash.stringValue()), validator.validateReferenceValueTriple(withInvalidSubject)); + assertEquals(Arrays.asList(propertyWithInvalidNamespace.stringValue()), validator.validateReferenceValueTriple(withInvalidProperty)); + assertEquals(Arrays.asList(objectWithInvalidNamespace.stringValue()), validator.validateReferenceValueTriple(withInvalidNamespaceObject)); + assertEquals(Arrays.asList(objectWithTypo.stringValue()), validator.validateReferenceValueTriple(withTypoObject)); + assertEquals(Arrays.asList(unresolvableObject.stringValue()), validator.validateReferenceValueTriple(withUnresolvableObject)); + assertEquals(Arrays.asList(subjectWithInvalidHash.stringValue(), propertyWithInvalidNamespace.stringValue()), validator.validateReferenceValueTriple( + withInvalidSubjectAndProperty)); + assertEquals(Arrays.asList(propertyWithInvalidNamespace.stringValue(), objectWithInvalidNamespace.stringValue()), validator + .validateReferenceValueTriple( + withInvalidPropertyAndObject)); + assertEquals(Arrays.asList(subjectWithInvalidHash.stringValue(), objectWithInvalidNamespace.stringValue()), validator.validateReferenceValueTriple( + withInvalidSubjectAndObject)); + } + + @Test + public void testHandleDataset() throws Exception { + Model good = validator.handleDataset(goodParsedDataset).getKey(); + Model bad = validator.handleDataset(badParsedDataset).getKey(); + assertEquals(goodParsedDataset, good); + assertTrue(Models.isSubset(bad, badParsedDataset)); + assertEquals(new TreeModel(), bad); + } + + /* + * Build a valid reference node with a SHA-1 hash + */ + private URI createValidReferenceNode(ValueFactory vf) throws NoSuchAlgorithmException, UnsupportedEncodingException { + MessageDigest md = MessageDigest.getInstance("SHA-1"); + String toDigest = "I wanna become a SHA-1 hash"; + byte[] digest = md.digest(toDigest.getBytes("utf8")); + StringBuilder hash = new StringBuilder(); + for (byte b : digest) { + hash.append(String.format(Locale.ENGLISH, "%02x", b & 0xff)); + } + return vf.createURI(WikibaseDataModelValidator.VALID_NAMESPACES.reference() + hash.toString()); + } +} diff --git a/tools/src/test/java/org/wikidata/query/rdf/tool/primarysources/ingestion/IngestionAPIIntegrationTest.java b/tools/src/test/java/org/wikidata/query/rdf/tool/primarysources/ingestion/IngestionAPIIntegrationTest.java new file mode 100644 index 0000000..03054a5 --- /dev/null +++ b/tools/src/test/java/org/wikidata/query/rdf/tool/primarysources/ingestion/IngestionAPIIntegrationTest.java @@ -0,0 +1,120 @@ +package org.wikidata.query.rdf.tool.primarysources.ingestion; + +import com.google.common.io.Resources; +import org.apache.http.HttpEntity; +import org.apache.http.client.methods.CloseableHttpResponse; +import org.apache.http.client.methods.HttpPost; +import org.apache.http.entity.mime.MultipartEntityBuilder; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.junit.After; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; +import org.openrdf.query.TupleQueryResult; +import org.wikidata.query.rdf.tool.AbstractRdfRepositoryIntegrationTestBase; + +import javax.servlet.http.HttpServletResponse; 
+import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStreamReader; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +/** + * @author Marco Fossati - User:Hjfocs + * @since 0.2.4 + * Created on Jul 10, 2017. + */ +public class IngestionAPIIntegrationTest extends AbstractRdfRepositoryIntegrationTestBase { + + private static final String BASE_ENDPOINT = "http://localhost:9999/bigdata"; + private static final String GOOD_DATASET_FILE_NAME = "good_chuck_berry.ttl"; // Valid data model + private static final String BAD_DATASET_FILE_NAME = "bad_chuck_berry.ttl"; // Invalid data model + private static final String BAD_RDF_FILE_NAME = "just_bad_rdf.ttl"; // Invalid RDF + + private static URI uploadEndpoint; + private static File goodDataset; + private static File badDataset; + private static File badRDF; + + private CloseableHttpClient client; + + @BeforeClass + public static void setUpOnce() throws URISyntaxException { + uploadEndpoint = URI.create(BASE_ENDPOINT + "/upload"); + goodDataset = new File(Resources.getResource(GOOD_DATASET_FILE_NAME).toURI()); + badDataset = new File(Resources.getResource(BAD_DATASET_FILE_NAME).toURI()); + badRDF = new File(Resources.getResource(BAD_RDF_FILE_NAME).toURI()); + } + + + @Before + public void setUp() throws Exception { + client = HttpClients.createDefault(); + } + + @After + public void tearDown() throws Exception { + client.close(); + } + + @Test + public void testGoodDatasetUpload() throws Exception { + CloseableHttpResponse goodResponse = postDatasetUpload(uploadEndpoint, goodDataset); + List<String> goodResponseContent = readResponse(goodResponse); + assertEquals(HttpServletResponse.SC_OK, goodResponse.getStatusLine().getStatusCode()); + assertEquals(new ArrayList<>(), goodResponseContent); + assertTrue(rdfRepository().ask("ask where { wds:Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34 pq:P2096 \"Chuck Berry (2007)\"@ca }")); + TupleQueryResult uploaded = rdfRepository().query("select * where {?s ?p ?o}"); + int uploadedCount = 0; + while (uploaded.hasNext()) { + uploaded.next(); + uploadedCount++; + } + assertEquals(4, uploadedCount); + } + + @Test + public void testBadDatasetUpload() throws Exception { + CloseableHttpResponse badResponse = postDatasetUpload(uploadEndpoint, badDataset); + List<String> badResponseContent = readResponse(badResponse); + assertEquals(HttpServletResponse.SC_OK, badResponse.getStatusLine().getStatusCode()); + assertEquals(10, badResponseContent.size()); + assertTrue(badResponseContent.contains("http://www.wikidata.org/prop/qualifier/I_m_not_a_valid_Item_triple")); + assertFalse(rdfRepository().ask("ask where {?s ?p ?o}")); + assertFalse(rdfRepository().query("select * where {?s ?p ?o}").hasNext()); + } + + @Test + public void testBadRDFUpload() throws Exception { + CloseableHttpResponse badRDFResponse = postDatasetUpload(uploadEndpoint, badRDF); + assertEquals(HttpServletResponse.SC_BAD_REQUEST, badRDFResponse.getStatusLine().getStatusCode()); + } + + private List<String> readResponse(CloseableHttpResponse response) throws IOException { + List<String> responseContent = new ArrayList<>(); + HttpEntity responseEntity = response.getEntity(); + BufferedReader br = new BufferedReader(new InputStreamReader(responseEntity.getContent(), StandardCharsets.UTF_8)); + String line; + while ((line = br.readLine()) != null) { + responseContent.add(line); + } + br.close(); + return 
diff --git a/tools/src/test/resources/bad_chuck_berry.ttl b/tools/src/test/resources/bad_chuck_berry.ttl
new file mode 100644
index 0000000..e4c1c7c
--- /dev/null
+++ b/tools/src/test/resources/bad_chuck_berry.ttl
@@ -0,0 +1,39 @@
+@prefix wikibase: <http://wikiba.se/ontology-beta#> .
+@prefix wd: <http://www.wikidata.org/entity/> .
+@prefix wds: <http://www.wikidata.org/entity/statement/> .
+@prefix wdref: <http://www.wikidata.org/reference/> .
+@prefix p: <http://www.wikidata.org/prop/> .
+@prefix ps: <http://www.wikidata.org/prop/statement/> .
+@prefix pq: <http://www.wikidata.org/prop/qualifier/> .
+@prefix pr: <http://www.wikidata.org/prop/reference/> .
+@prefix prov: <http://www.w3.org/ns/prov#> .
+
+# Item triple, invalid data model
+wd:Q5921 p:P18 pq:I_m_not_a_valid_Item_triple .
+
+# Item triple, valid data model, invalid QID
+wd:RnR p:P18 wds:RnR-583C7277-B344-4C96-8CF2-0557C2D0CD34 .
+
+# Property triple, invalid data model ("pr" instead of "ps" prefix)
+wds:Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34 pr:P18 <http://commons.wikimedia.org/wiki/Special:FilePath/Chuck-berry-2007-07-18.jpg> .
+
+# Property triple, valid data model, invalid UUID
+wds:this-is-not-a-uuid ps:P18 <http://commons.wikimedia.org/wiki/Special:FilePath/Chuck-berry-2007-07-18.jpg> .
+
+# Qualifier triple, invalid data model ("p" instead of "pq" prefix)
+wds:Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34 p:P666 "Chuck Berry (2007)"@ca .
+
+# Qualifier triple, valid data model, invalid property
+wds:Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34 pq:Q666 "Chuck Berry (2007)"@ca .
+
+# Reference triple, invalid data model
+wds:Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34 prov:wasDerivedFrom "There should be a reference node here!" .
+
+# Reference triple, valid data model, invalid hash
+wds:Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34 prov:wasDerivedFrom wdref:not_a_hash .
+
+# Reference value triple, invalid data model ("ps" instead of "pr" prefix)
+wdref:288ab581e7d2d02995a26dfa8b091d96e78457fc ps:P143 wd:Q206855 .
+
+# Reference value triple, valid data model, invalid URL
+wdref:288ab581e7d2d02995a26dfa8b091d96e78457fc pr:P143 <http://where.am.i.going> .
diff --git a/tools/src/test/resources/good_chuck_berry.ttl b/tools/src/test/resources/good_chuck_berry.ttl
new file mode 100644
index 0000000..230d5c6
--- /dev/null
+++ b/tools/src/test/resources/good_chuck_berry.ttl
@@ -0,0 +1,15 @@
+@prefix wikibase: <http://wikiba.se/ontology-beta#> .
+@prefix wd: <http://www.wikidata.org/entity/> .
+@prefix wds: <http://www.wikidata.org/entity/statement/> .
+@prefix wdref: <http://www.wikidata.org/reference/> .
+@prefix p: <http://www.wikidata.org/prop/> .
+@prefix ps: <http://www.wikidata.org/prop/statement/> .
+@prefix pq: <http://www.wikidata.org/prop/qualifier/> .
+@prefix pr: <http://www.wikidata.org/prop/reference/> .
+@prefix prov: <http://www.w3.org/ns/prov#> .
+
+wds:Q5921-583C7277-B344-4C96-8CF2-0557C2D0CD34 ps:P18 <http://commons.wikimedia.org/wiki/Special:FilePath/Chuck-berry-2007-07-18.jpg> ;
+    pq:P2096 "Chuck Berry (2007)"@ca ;
+    prov:wasDerivedFrom wdref:288ab581e7d2d02995a26dfa8b091d96e78457fc .
+
+wdref:288ab581e7d2d02995a26dfa8b091d96e78457fc pr:P143 wd:Q206855 .
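As a sanity check on this fixture, the abbreviated Turtle above expands to exactly four statements, which is the count testGoodDatasetUpload expects to find in Blazegraph after the upload. A quick way to verify locally, assuming the Sesame Rio parser and Guava already on the test classpath:

import com.google.common.io.Resources;
import org.openrdf.model.Model;
import org.openrdf.rio.RDFFormat;
import org.openrdf.rio.Rio;

import java.io.InputStream;

public class FixtureCountSketch {
    public static void main(String[] args) throws Exception {
        // Parse good_chuck_berry.ttl from the test resources and count its statements.
        try (InputStream in = Resources.getResource("good_chuck_berry.ttl").openStream()) {
            Model model = Rio.parse(in, "", RDFFormat.TURTLE);
            System.out.println(model.size()); // expected: 4
        }
    }
}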
diff --git a/tools/src/test/resources/just_bad_rdf.ttl b/tools/src/test/resources/just_bad_rdf.ttl
new file mode 100644
index 0000000..277e398
--- /dev/null
+++ b/tools/src/test/resources/just_bad_rdf.ttl
@@ -0,0 +1,21 @@
+@prefix wikibase: <http://wikiba.se/ontology-beta#> .
+@prefix wd: <http://www.wikidata.org/entity/> .
+@prefix wds: <http://www.wikidata.org/entity/statement/> .
+@prefix wdref: <http://www.wikidata.org/reference/> .
+@prefix p: <http://www.wikidata.org/prop/> .
+@prefix ps: <http://www.wikidata.org/prop/statement/> .
+@prefix pq: <http://www.wikidata.org/prop/qualifier/> .
+@prefix pr: <http://www.wikidata.org/prop/reference/> .
+@prefix prov: <http://www.w3.org/ns/prov#> .
+
+### BEGIN: partially invalid RDF
+#
+# Invalid URI
+wd:Q5921 p:P18 wds:"very bad > URI .
+# Undeclared prefix
+wd:Q5921 p:P18 rnr:let_s_rock .
+#
+### END: partially invalid RDF
+
+# Totally invalid RDF
+OK, I'm a rock'n'roll legend, but I'm not valid RDF .
diff --git a/war/pom.xml b/war/pom.xml
index a7a7b64..10e1ccc 100644
--- a/war/pom.xml
+++ b/war/pom.xml
@@ -75,6 +75,12 @@
       <artifactId>common</artifactId>
       <scope>runtime</scope>
     </dependency>
+    <dependency>
+      <groupId>org.wikidata.query.rdf</groupId>
+      <artifactId>tools</artifactId>
+      <version>${project.version}</version>
+      <scope>runtime</scope>
+    </dependency>
   </dependencies>

   <build>
diff --git a/war/src/main/webapp/WEB-INF/web.xml b/war/src/main/webapp/WEB-INF/web.xml
index 91efc26..071635f 100644
--- a/war/src/main/webapp/WEB-INF/web.xml
+++ b/war/src/main/webapp/WEB-INF/web.xml
@@ -157,6 +157,15 @@
       <param-value>true</param-value>
     </init-param>
   </servlet>
+  <!-- BEGIN: primary sources tool APIs -->
+  <servlet>
+    <servlet-name>Upload</servlet-name>
+    <display-name>Upload</display-name>
+    <description>Primary sources tool ingestion API upload service</description>
+    <servlet-class>org.wikidata.query.rdf.tool.primarysources.ingestion.UploadServlet</servlet-class>
+    <async-supported>true</async-supported>
+  </servlet>
+  <!-- END: primary sources tool APIs -->
   <!-- Note: The HALoadBalancerServlet is deployed from override-web.xml -->
   <!-- Serve anything under /html/* as a simple file. -->
   <servlet-mapping>
@@ -214,6 +223,12 @@
     <servlet-name>Counters</servlet-name>
     <url-pattern>/counters</url-pattern>
   </servlet-mapping>
+  <!-- BEGIN: primary sources tool APIs -->
+  <servlet-mapping>
+    <servlet-name>Upload</servlet-name>
+    <url-pattern>/upload</url-pattern>
+  </servlet-mapping>
+  <!-- END: primary sources tool APIs -->
   <!-- Map the initial request into the UI. -->
   <welcome-file-list>
     <welcome-file>html/index.html</welcome-file>

--
To view, visit https://gerrit.wikimedia.org/r/365253
To unsubscribe, visit https://gerrit.wikimedia.org/r/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: I33dde366ee41a118aeec5c863f9157a9ab31b291
Gerrit-PatchSet: 1
Gerrit-Project: wikidata/query/rdf
Gerrit-Branch: master
Gerrit-Owner: Hjfocs <foss...@spaziodati.eu>

_______________________________________________
MediaWiki-commits mailing list
MediaWiki-commits@lists.wikimedia.org
https://lists.wikimedia.org/mailman/listinfo/mediawiki-commits