ANY23-177 Add support for JSON-LD
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/47278c16 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/47278c16 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/47278c16 Branch: refs/heads/master Commit: 47278c1649050f107b615e00a51db00a356c7ca6 Parents: ebfbbec Author: Lewis John McGibbney <[email protected]> Authored: Tue Mar 25 19:26:47 2014 +0000 Committer: Lewis John McGibbney <[email protected]> Committed: Tue Mar 25 19:26:47 2014 +0000 ---------------------------------------------------------------------- core/pom.xml | 7 + .../any23/extractor/akn/AKNExtractor.java | 50 - .../extractor/akn/AKNExtractorFactory.java | 54 - .../apache/any23/extractor/akn/AKNParser.java | 33 - .../any23/extractor/akn/package-info.java | 27 - .../any23/extractor/rdf/BaseRDFExtractor.java | 2 - .../any23/extractor/rdf/JSONLDExtractor.java | 51 + .../extractor/rdf/JSONLDExtractorFactory.java | 59 ++ .../any23/extractor/rdf/RDFParserFactory.java | 19 + .../any23/extractor/rdf/example-jsonld.jsonld | 17 + .../extractor/rdf/JSONLDExtractorTest.java | 97 ++ pom.xml | 8 + .../any23/extractor/rdf/embedded_json-ld.html | 981 +++++++++++++++++++ .../any23/extractor/rdf/place-example.jsonld | 27 + 14 files changed, 1266 insertions(+), 166 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/pom.xml ---------------------------------------------------------------------- diff --git a/core/pom.xml b/core/pom.xml index 6a7db29..e938a7c 100644 --- a/core/pom.xml +++ b/core/pom.xml @@ -134,6 +134,13 @@ <artifactId>sesame-repository-api</artifactId> </dependency> <!-- END: Sesame --> + + <!-- BEGIN: Misc --> + <dependency> + <groupId>com.github.jsonld-java</groupId> + <artifactId>jsonld-java-sesame</artifactId> + </dependency> + <!-- END: Misc --> <!-- BEGIN: Apache Commons, this version is hosted in the 
any23-repository-external repository --> http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/java/org/apache/any23/extractor/akn/AKNExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/akn/AKNExtractor.java b/core/src/main/java/org/apache/any23/extractor/akn/AKNExtractor.java deleted file mode 100644 index e637276..0000000 --- a/core/src/main/java/org/apache/any23/extractor/akn/AKNExtractor.java +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.any23.extractor.akn; - -import java.io.IOException; - -import org.apache.any23.extractor.ExtractionContext; -import org.apache.any23.extractor.ExtractionException; -import org.apache.any23.extractor.ExtractionParameters; -import org.apache.any23.extractor.ExtractionResult; -import org.apache.any23.extractor.Extractor; -import org.apache.any23.extractor.ExtractorDescription; -import org.w3c.dom.Document; - -/** - * Extractor for the <a href="http://www.akomtantoso.org">Akoma Ntoso</a> - * XML Format. 
- * @author lewismc - * - */ -public class AKNExtractor implements Extractor.TagSoupDOMExtractor { - - @Override - public void run(ExtractionParameters extractionParameters, ExtractionContext context, Document in, - ExtractionResult out) throws IOException, ExtractionException { - // TODO Auto-generated method stub - - } - - @Override - public ExtractorDescription getDescription() { - // TODO Auto-generated method stub - return null; - } - -} http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/java/org/apache/any23/extractor/akn/AKNExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/akn/AKNExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/akn/AKNExtractorFactory.java deleted file mode 100644 index bbd0a87..0000000 --- a/core/src/main/java/org/apache/any23/extractor/akn/AKNExtractorFactory.java +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package org.apache.any23.extractor.akn; - -import java.util.Arrays; - -import org.apache.any23.extractor.ExtractorDescription; -import org.apache.any23.extractor.ExtractorFactory; -import org.apache.any23.extractor.SimpleExtractorFactory; -import org.apache.any23.rdf.PopularPrefixes; -import org.apache.any23.rdf.Prefixes; -import org.kohsuke.MetaInfServices; - -/** - * @author lewismc - * - */ -@MetaInfServices(ExtractorFactory.class) -public class AKNExtractorFactory extends SimpleExtractorFactory<AKNExtractor> implements - ExtractorFactory<AKNExtractor> { - - private static final ExtractorDescription descriptionInstance = new AKNExtractorFactory(); - private static final String NAME = "akomaNtoso"; - private static final Prefixes PREFIXES = PopularPrefixes.createSubset("akn", "AKN", "AKOMA"); - - public AKNExtractorFactory() { - super(AKNExtractorFactory.NAME, - AKNExtractorFactory.PREFIXES); - } - - @Override - public AKNExtractor createExtractor() { - return new AKNExtractor(); - } - - public static ExtractorDescription getDescriptionInstance() { - return descriptionInstance; - } -} http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/java/org/apache/any23/extractor/akn/AKNParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/akn/AKNParser.java b/core/src/main/java/org/apache/any23/extractor/akn/AKNParser.java deleted file mode 100644 index 2320da2..0000000 --- a/core/src/main/java/org/apache/any23/extractor/akn/AKNParser.java +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.any23.extractor.akn; - -/** - * This class provides utility methods for handling <b>Akoma Ntoso</b> - * nodes contained within a <i>DOM</i> document. - * @author lewismc - */ -public class AKNParser { - - enum ErrorMode { - /** This mode raises an exception at first encountered error. */ - StopAtFirstError, - /** This mode produces a full error report. */ - FullReport - } - -} http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/java/org/apache/any23/extractor/akn/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/akn/package-info.java b/core/src/main/java/org/apache/any23/extractor/akn/package-info.java deleted file mode 100644 index 508ee81..0000000 --- a/core/src/main/java/org/apache/any23/extractor/akn/package-info.java +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -/** - * <p>This package contains the definition of a specific - * {@link org.apache.any23.extractor.Extractor} for <i>AkomaNtoso</i> - * files.</p> - * <p>Akoma Ntoso is an emerging legal document standard for representing - * legislative and judicial documents in XML format. - * @see http://www.akomtantoso.org - * @see http://code.google.com/p/akomantoso - * @author lewismc - */ -package org.apache.any23.extractor.akn; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java index 6dda7a9..18a30ca 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/BaseRDFExtractor.java @@ -28,8 +28,6 @@ import org.openrdf.rio.RDFParseException; import org.openrdf.rio.RDFParser; import org.openrdf.rio.RioSetting; import org.openrdf.rio.helpers.BasicParserSettings; -import org.openrdf.rio.helpers.RDFParserBase; - import java.io.IOException; import java.io.InputStream; import java.util.HashSet; http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java ---------------------------------------------------------------------- diff --git 
a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java new file mode 100644 index 0000000..23a4d1e --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractor.java @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.extractor.rdf; + +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractorDescription; +import org.openrdf.rio.RDFParser; + +/** + * Concrete implementation of {@link org.apache.any23.extractor.Extractor.ContentExtractor} + * handling <a href="http://www.w3.org/TR/json-ld/">JSON-LD</a> format. 
+ * + */ +public class JSONLDExtractor extends BaseRDFExtractor { + + public JSONLDExtractor(boolean verifyDataType, boolean stopAtFirstError) { + super(verifyDataType, stopAtFirstError); + } + + public JSONLDExtractor() { + this(false, false); + } + + @Override + public ExtractorDescription getDescription() { + return JSONLDExtractorFactory.getDescriptionInstance(); + } + + @Override + protected RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult) { + return RDFParserFactory.getInstance().getJSONLDParser( + isVerifyDataType(), isStopAtFirstError(), extractionContext, extractionResult + ); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractorFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractorFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractorFactory.java new file mode 100644 index 0000000..bedd200 --- /dev/null +++ b/core/src/main/java/org/apache/any23/extractor/rdf/JSONLDExtractorFactory.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.extractor.rdf; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.Prefixes; +import org.kohsuke.MetaInfServices; + +/** + * + */ +@MetaInfServices(ExtractorFactory.class) +public class JSONLDExtractorFactory extends SimpleExtractorFactory<JSONLDExtractor> implements + ExtractorFactory<JSONLDExtractor> { + + public static final String NAME = "rdf-jsonld"; + + public static final Prefixes PREFIXES = null; + + private static final ExtractorDescription descriptionInstance = new JSONLDExtractorFactory(); + + public JSONLDExtractorFactory() { + super( + JSONLDExtractorFactory.NAME, + JSONLDExtractorFactory.PREFIXES, + Arrays.asList( + "application/ld+json;q=0.1" + ), + "example-jsonld.jsonld"); + } + + @Override + public JSONLDExtractor createExtractor() { + return new JSONLDExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java index 606364b..575cebb 100644 --- a/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java +++ b/core/src/main/java/org/apache/any23/extractor/rdf/RDFParserFactory.java @@ -157,6 +157,25 @@ public class RDFParserFactory { configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult); return parser; } + + /** + * Returns a new instance of a configured {@link SesameJSONLDParser}. + * @param verifyDataType data verification enable if <code>true</code>. 
+ * @param stopAtFirstError the parser stops at first error if <code>true</code>. + * @param extractionContext the extraction context where the parser is used. + * @param extractionResult the output extraction result. + * @return a new instance of a configured JSON-LD parser. + */ + public RDFParser getJSONLDParser( + final boolean verifyDataType, + final boolean stopAtFirstError, + final ExtractionContext extractionContext, + final ExtractionResult extractionResult + ) { + final RDFParser parser = Rio.createParser(RDFFormat.JSONLD); + configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult); + return parser; + } /** * Configures the given parser on the specified extraction result http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/main/resources/org/apache/any23/extractor/rdf/example-jsonld.jsonld ---------------------------------------------------------------------- diff --git a/core/src/main/resources/org/apache/any23/extractor/rdf/example-jsonld.jsonld b/core/src/main/resources/org/apache/any23/extractor/rdf/example-jsonld.jsonld new file mode 100644 index 0000000..8c25185 --- /dev/null +++ b/core/src/main/resources/org/apache/any23/extractor/rdf/example-jsonld.jsonld @@ -0,0 +1,17 @@ +{ + "@context": { + "name": "http://xmlns.com/foaf/0.1/name", + "knows": "http://xmlns.com/foaf/0.1/knows" + }, + "@id": "http://me.markus-lanthaler.com/", + "name": "Markus Lanthaler", + "knows": [ + { + "@id": "http://manu.sporny.org/about#manu", + "name": "Manu Sporny" + }, + { + "name": "Dave Longley" + } + ] +} http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java new file mode 100644 index 0000000..d6b42ea 
--- /dev/null +++ b/core/src/test/java/org/apache/any23/extractor/rdf/JSONLDExtractorTest.java @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.extractor.rdf; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; + +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractionResultImpl; +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.writer.JSONWriter; +import org.apache.any23.writer.RDFXMLWriter; +import org.apache.any23.writer.TripleHandler; +import org.apache.any23.writer.TripleHandlerException; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Ignore; +import org.junit.Test; +import org.openrdf.model.URI; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Test case for {@link JSONLDExtractor}. 
+ * + */ +public class JSONLDExtractorTest { + + private static final Logger logger = LoggerFactory.getLogger(JSONLDExtractorTest.class); + + private JSONLDExtractor extractor; + + @Before + public void setUp() throws Exception { + extractor = new JSONLDExtractor(); + } + + @After + public void tearDown() throws Exception { + extractor = null; + } + + @Test + public void testExtractFromJSONLDDocuement() + throws IOException, ExtractionException, TripleHandlerException { + final URI uri = RDFUtils.uri("http://host.com/place-example.jsonld"); + extract(uri, "/org/apache/any23/extractor/rdf/place-example.jsonld"); + } + + @Ignore("Need to verify if jsonld-java-sesame can extract from HTML") + @Test + public void testExtractFromHTMLDocument() + throws IOException, ExtractionException, TripleHandlerException { + final URI uri = RDFUtils.uri("http://host.com/embedded_json-ld.html"); + extract(uri, "/org/apache/any23/extractor/rdf/embedded_json-ld.html"); + } + + public void extract(URI uri, String filePath) + throws IOException, ExtractionException, TripleHandlerException { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + final TripleHandler tHandler = new JSONWriter(baos); + final ExtractionContext extractionContext = new ExtractionContext("json-ld-extractor", uri); + final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler); + extractor.setStopAtFirstError(false); + try { + extractor.run( + ExtractionParameters.newDefault(), + extractionContext, + this.getClass().getResourceAsStream(filePath), + result + ); + } finally { + logger.debug(baos.toString()); + tHandler.close(); + result.close(); + } + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/47278c16/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 44c1dfa..0a03abc 100644 --- a/pom.xml +++ b/pom.xml @@ -396,6 +396,14 @@ <version>0.3</version> </dependency> <!-- END: Sesame --> + + <!-- 
BEGIN: Misc --> + <dependency> + <groupId>com.github.jsonld-java</groupId> + <artifactId>jsonld-java-sesame</artifactId> + <version>0.3</version> + </dependency> + <!-- END: Misc --> <!-- BEGIN: Apache Commons --> <dependency>
