Repository: any23 Updated Branches: refs/heads/master 12640a953 -> 692c583f8
http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractorFactory.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractorFactory.java b/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractorFactory.java new file mode 100644 index 0000000..75d4c61 --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractorFactory.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.cli.flows; + +import org.apache.any23.configuration.Settings; +import org.apache.any23.writer.DecoratingWriterFactory; +import org.apache.any23.writer.TripleHandler; + +public class PeopleExtractorFactory implements DecoratingWriterFactory { + + @Override + public String getIdentifier() { + return "people"; + } + + @Override + public TripleHandler getTripleWriter(TripleHandler delegate, Settings settings) { + return new PeopleExtractor(delegate); + } + + @Override + public Settings getSupportedSettings() { + return Settings.of(); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/cli/src/test/resources/META-INF/services/org.apache.any23.writer.WriterFactory ---------------------------------------------------------------------- diff --git a/cli/src/test/resources/META-INF/services/org.apache.any23.writer.WriterFactory b/cli/src/test/resources/META-INF/services/org.apache.any23.writer.WriterFactory new file mode 100644 index 0000000..c595410 --- /dev/null +++ b/cli/src/test/resources/META-INF/services/org.apache.any23.writer.WriterFactory @@ -0,0 +1 @@ +org.apache.any23.cli.flows.PeopleExtractorFactory http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/JSONLDWriter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/JSONLDWriter.java b/core/src/main/java/org/apache/any23/writer/JSONLDWriter.java index 50d3900..4edf9be 100644 --- a/core/src/main/java/org/apache/any23/writer/JSONLDWriter.java +++ b/core/src/main/java/org/apache/any23/writer/JSONLDWriter.java @@ -16,18 +16,44 @@ */ package org.apache.any23.writer; +import org.apache.any23.configuration.Settings; +import org.eclipse.rdf4j.rio.WriterConfig; +import org.eclipse.rdf4j.rio.helpers.BasicWriterSettings; + import java.io.OutputStream; -import org.eclipse.rdf4j.rio.RDFFormat; -import org.eclipse.rdf4j.rio.Rio; 
/** - * Implementation of <i>JSON-LD</i> format writer. + * Implementation of <i>JSON-LD</i> {@link TripleWriter}. * * @author Julio Caguano + * @author Hans Brende ([email protected]) */ -public class JSONLDWriter extends RDFWriterTripleHandler implements FormatWriter { +public class JSONLDWriter extends RDFWriterTripleHandler { + + static class Internal { + private static final org.eclipse.rdf4j.rio.jsonld.JSONLDWriterFactory rdf4j + = new org.eclipse.rdf4j.rio.jsonld.JSONLDWriterFactory(); + + static final TripleFormat FORMAT = format(rdf4j); + + static final Settings SUPPORTED_SETTINGS = Settings.of( + WriterSettings.PRETTY_PRINT + ); + } + + @Override + void configure(WriterConfig config, Settings settings) { + config.set(BasicWriterSettings.PRETTY_PRINT, settings.get(WriterSettings.PRETTY_PRINT)); + } + + public JSONLDWriter(OutputStream os) { - super(Rio.createWriter(RDFFormat.JSONLD, os)); + this(os, Settings.of()); } + + public JSONLDWriter(OutputStream os, Settings settings) { + super(Internal.rdf4j, Internal.FORMAT, os, settings); + } + } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/JSONLDWriterFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/JSONLDWriterFactory.java b/core/src/main/java/org/apache/any23/writer/JSONLDWriterFactory.java index df20279..482b0a9 100644 --- a/core/src/main/java/org/apache/any23/writer/JSONLDWriterFactory.java +++ b/core/src/main/java/org/apache/any23/writer/JSONLDWriterFactory.java @@ -17,35 +17,37 @@ package org.apache.any23.writer; import java.io.OutputStream; -import org.eclipse.rdf4j.rio.RDFFormat; + +import org.apache.any23.configuration.Settings; /** * * @author Julio Caguano. 
+ * @author Hans Brende ([email protected]) */ -public class JSONLDWriterFactory implements WriterFactory { +public class JSONLDWriterFactory implements TripleWriterFactory { - public static final String MIME_TYPE = RDFFormat.JSONLD.getDefaultMIMEType(); + public static final String MIME_TYPE = JSONLDWriter.Internal.FORMAT.getMimeType(); public static final String IDENTIFIER = "jsonld"; @Override - public RDFFormat getRdfFormat() { - return RDFFormat.JSONLD; + public TripleFormat getTripleFormat() { + return JSONLDWriter.Internal.FORMAT; } @Override public String getIdentifier() { - return JSONLDWriterFactory.IDENTIFIER; + return IDENTIFIER; } @Override - public String getMimeType() { - return JSONLDWriterFactory.MIME_TYPE; + public TripleHandler getTripleWriter(OutputStream out, Settings settings) { + return new JSONLDWriter(out, settings); } @Override - public FormatWriter getRdfWriter(OutputStream os) { - return new JSONLDWriter(os); + public Settings getSupportedSettings() { + return JSONLDWriter.Internal.SUPPORTED_SETTINGS; } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/JSONWriter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/JSONWriter.java b/core/src/main/java/org/apache/any23/writer/JSONWriter.java index 70e2700..58d869a 100644 --- a/core/src/main/java/org/apache/any23/writer/JSONWriter.java +++ b/core/src/main/java/org/apache/any23/writer/JSONWriter.java @@ -22,7 +22,8 @@ import com.fasterxml.jackson.core.util.DefaultPrettyPrinter; import java.io.IOException; import java.io.OutputStream; import java.util.Optional; -import org.apache.any23.extractor.ExtractionContext; + +import org.apache.any23.configuration.Settings; import org.eclipse.rdf4j.model.BNode; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Literal; @@ -30,11 +31,11 @@ import org.eclipse.rdf4j.model.Resource; import 
org.eclipse.rdf4j.model.Value; /** - * Implementation of <i>JSON</i> format writer. + * Implementation of <i>JSON</i> {@link TripleWriter}. * * @author Michele Mostarda ([email protected]) */ -public class JSONWriter implements FormatWriter { +public class JSONWriter extends TripleWriterHandler implements FormatWriter { private JsonGenerator ps; private boolean documentStarted = false; @@ -46,18 +47,21 @@ public class JSONWriter implements FormatWriter { JsonFactory factory = new JsonFactory(); try { this.ps = factory.createGenerator(os) + .disable(JsonGenerator.Feature.AUTO_CLOSE_TARGET) + .enable(JsonGenerator.Feature.FLUSH_PASSED_TO_STREAM) .setPrettyPrinter(new DefaultPrettyPrinter()); } catch (IOException ex) { } } - @Override - public void startDocument(IRI documentIRI) throws TripleHandlerException { + private void start(boolean throwIfStarted) throws TripleHandlerException { if (documentStarted) { - throw new IllegalStateException("Document already started."); + if (throwIfStarted) { + throw new IllegalStateException("Document already started."); + } + return; } documentStarted = true; - try { ps.writeStartObject(); ps.writeFieldName("quads"); @@ -68,14 +72,14 @@ public class JSONWriter implements FormatWriter { } @Override - public void openContext(ExtractionContext context) throws TripleHandlerException { - // Empty. + public void startDocument(IRI documentIRI) throws TripleHandlerException { + start(true); } @Override - public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) + public void writeTriple(Resource s, IRI p, Value o, Resource g) throws TripleHandlerException { - validateDocumentStarted(); + start(false); try { ps.writeStartArray(); @@ -104,43 +108,28 @@ public class JSONWriter implements FormatWriter { } @Override - public void receiveNamespace(String prefix, String uri, ExtractionContext context) + public void writeNamespace(String prefix, String uri) throws TripleHandlerException { // Empty. 
} @Override - public void closeContext(ExtractionContext context) throws TripleHandlerException { - // Empty. - } - - @Override public void endDocument(IRI documentIRI) throws TripleHandlerException { validateDocumentStarted(); - - try { - ps.writeEndArray(); - ps.writeEndObject(); - documentStarted = false; - } catch (IOException ex) { - throw new TripleHandlerException("IO Error while closing document.", ex); - } - } - - @Override - public void setContentLength(long contentLength) { - // Empty. } @Override public void close() throws TripleHandlerException { - if (documentStarted) { - endDocument(null); - } + start(false); + try { + ps.writeEndArray(); + ps.writeEndObject(); ps.close(); } catch (IOException ex) { - throw new TripleHandlerException("IO Error while closing stream.", ex); + throw new TripleHandlerException("IO Error while closing document.", ex); + } finally { + ps = null; } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/JSONWriterFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/JSONWriterFactory.java b/core/src/main/java/org/apache/any23/writer/JSONWriterFactory.java index eea4def..8877a25 100644 --- a/core/src/main/java/org/apache/any23/writer/JSONWriterFactory.java +++ b/core/src/main/java/org/apache/any23/writer/JSONWriterFactory.java @@ -17,19 +17,24 @@ package org.apache.any23.writer; -import java.io.OutputStream; +import org.apache.any23.configuration.Settings; -import org.eclipse.rdf4j.rio.RDFFormat; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.util.Collections; /** * @author Peter Ansell [email protected] - * + * @author Hans Brende ([email protected]) */ -public class JSONWriterFactory implements WriterFactory { +public class JSONWriterFactory implements TripleWriterFactory { + public static final String MIME_TYPE = "text/json"; public static final String 
IDENTIFIER = "json"; + private static final TripleFormat FORMAT = TripleFormat.of("JSON", Collections.singleton(MIME_TYPE), + StandardCharsets.UTF_8, Collections.emptySet(), null, TripleFormat.QUADS); /** * */ @@ -37,24 +42,23 @@ public class JSONWriterFactory implements WriterFactory { } @Override - public RDFFormat getRdfFormat() { - throw new RuntimeException( - "TODO: Implement an RDFFormat for this RDF JSON serialisation format"); + public TripleFormat getTripleFormat() { + return FORMAT; } @Override - public String getIdentifier() { - return JSONWriterFactory.IDENTIFIER; + public Settings getSupportedSettings() { + return Settings.of(); } @Override - public String getMimeType() { - return JSONWriterFactory.MIME_TYPE; + public String getIdentifier() { + return JSONWriterFactory.IDENTIFIER; } @Override - public FormatWriter getRdfWriter(OutputStream os) { - return new JSONWriter(os); + public TripleHandler getTripleWriter(OutputStream out, Settings settings) { + return new JSONWriter(out); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/NQuadsWriter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/NQuadsWriter.java b/core/src/main/java/org/apache/any23/writer/NQuadsWriter.java index 359f62c..ebbd9c2 100644 --- a/core/src/main/java/org/apache/any23/writer/NQuadsWriter.java +++ b/core/src/main/java/org/apache/any23/writer/NQuadsWriter.java @@ -17,20 +17,43 @@ package org.apache.any23.writer; -import java.io.OutputStream; +import org.apache.any23.configuration.Settings; +import org.eclipse.rdf4j.rio.WriterConfig; +import org.eclipse.rdf4j.rio.helpers.NTriplesWriterSettings; -import org.eclipse.rdf4j.rio.RDFFormat; -import org.eclipse.rdf4j.rio.Rio; +import java.io.OutputStream; /** - * Implementation of an <i>NQuads</i> writer. + * Implementation of an <i>N-Quads</i> {@link TripleWriter}. 
* * @author Michele Mostarda ([email protected]) + * @author Hans Brende ([email protected]) */ -public class NQuadsWriter extends RDFWriterTripleHandler implements FormatWriter { +public class NQuadsWriter extends RDFWriterTripleHandler { + + static class Internal { + private static final org.eclipse.rdf4j.rio.nquads.NQuadsWriterFactory rdf4j + = new org.eclipse.rdf4j.rio.nquads.NQuadsWriterFactory(); + + static final TripleFormat FORMAT = format(rdf4j); + + static final Settings SUPPORTED_SETTINGS = Settings.of( + WriterSettings.PRINT_ASCII + ); + } + + @Override + void configure(WriterConfig config, Settings settings) { + config.set(NTriplesWriterSettings.ESCAPE_UNICODE, settings.get(WriterSettings.PRINT_ASCII)); + } + public NQuadsWriter(OutputStream os) { - super( Rio.createWriter(RDFFormat.NQUADS, os) ); + this(os, Settings.of()); + } + + public NQuadsWriter(OutputStream os, Settings settings) { + super(Internal.rdf4j, Internal.FORMAT, os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/NQuadsWriterFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/NQuadsWriterFactory.java b/core/src/main/java/org/apache/any23/writer/NQuadsWriterFactory.java index 964d53c..767f2ae 100644 --- a/core/src/main/java/org/apache/any23/writer/NQuadsWriterFactory.java +++ b/core/src/main/java/org/apache/any23/writer/NQuadsWriterFactory.java @@ -19,15 +19,15 @@ package org.apache.any23.writer; import java.io.OutputStream; -import org.eclipse.rdf4j.rio.RDFFormat; +import org.apache.any23.configuration.Settings; /** - * @author Peter Ansell [email protected] - * + * @author Peter Ansell ([email protected]) + * @author Hans Brende ([email protected]) */ -public class NQuadsWriterFactory implements WriterFactory { +public class NQuadsWriterFactory implements TripleWriterFactory { - public static final String MIME_TYPE = 
RDFFormat.NQUADS.getDefaultMIMEType(); + public static final String MIME_TYPE = NQuadsWriter.Internal.FORMAT.getMimeType(); public static final String IDENTIFIER = "nquads"; /** @@ -37,23 +37,23 @@ public class NQuadsWriterFactory implements WriterFactory { } @Override - public RDFFormat getRdfFormat() { - return RDFFormat.NQUADS; + public TripleFormat getTripleFormat() { + return NQuadsWriter.Internal.FORMAT; } @Override - public String getIdentifier() { - return NQuadsWriterFactory.IDENTIFIER; + public Settings getSupportedSettings() { + return NQuadsWriter.Internal.SUPPORTED_SETTINGS; } @Override - public String getMimeType() { - return NQuadsWriterFactory.MIME_TYPE; + public String getIdentifier() { + return IDENTIFIER; } @Override - public FormatWriter getRdfWriter(OutputStream os) { - return new NQuadsWriter(os); + public TripleHandler getTripleWriter(OutputStream os, Settings settings) { + return new NQuadsWriter(os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/NTriplesWriter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/NTriplesWriter.java b/core/src/main/java/org/apache/any23/writer/NTriplesWriter.java index 0d862ae..933b185 100644 --- a/core/src/main/java/org/apache/any23/writer/NTriplesWriter.java +++ b/core/src/main/java/org/apache/any23/writer/NTriplesWriter.java @@ -17,15 +17,40 @@ package org.apache.any23.writer; +import org.apache.any23.configuration.Settings; +import org.eclipse.rdf4j.rio.WriterConfig; +import org.eclipse.rdf4j.rio.helpers.NTriplesWriterSettings; + import java.io.OutputStream; /** - * <i>N3</i> triples writer. + * Implementation of an <i>N-Triples</i> {@link TripleWriter}. 
+ * @author Hans Brende ([email protected]) */ -public class NTriplesWriter extends RDFWriterTripleHandler implements FormatWriter { +public class NTriplesWriter extends RDFWriterTripleHandler { + + static class Internal { + private static final org.eclipse.rdf4j.rio.ntriples.NTriplesWriterFactory rdf4j + = new org.eclipse.rdf4j.rio.ntriples.NTriplesWriterFactory(); + + static final TripleFormat FORMAT = format(rdf4j); + + static final Settings SUPPORTED_SETTINGS = Settings.of( + WriterSettings.PRINT_ASCII + ); + } + + @Override + void configure(WriterConfig config, Settings settings) { + config.set(NTriplesWriterSettings.ESCAPE_UNICODE, settings.get(WriterSettings.PRINT_ASCII)); + } public NTriplesWriter(OutputStream out) { - super(new org.eclipse.rdf4j.rio.ntriples.NTriplesWriter(out)); + this(out, Settings.of()); + } + + public NTriplesWriter(OutputStream os, Settings settings) { + super(Internal.rdf4j, Internal.FORMAT, os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/NTriplesWriterFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/NTriplesWriterFactory.java b/core/src/main/java/org/apache/any23/writer/NTriplesWriterFactory.java index 91d5fed..a631347 100644 --- a/core/src/main/java/org/apache/any23/writer/NTriplesWriterFactory.java +++ b/core/src/main/java/org/apache/any23/writer/NTriplesWriterFactory.java @@ -19,15 +19,15 @@ package org.apache.any23.writer; import java.io.OutputStream; -import org.eclipse.rdf4j.rio.RDFFormat; +import org.apache.any23.configuration.Settings; /** - * @author Peter Ansell [email protected] - * + * @author Peter Ansell ([email protected]) + * @author Hans Brende ([email protected]) */ -public class NTriplesWriterFactory implements WriterFactory { +public class NTriplesWriterFactory implements TripleWriterFactory { - public static final String MIME_TYPE = 
RDFFormat.NTRIPLES.getDefaultMIMEType(); + public static final String MIME_TYPE = NTriplesWriter.Internal.FORMAT.getMimeType(); public static final String IDENTIFIER = "ntriples"; /** @@ -37,23 +37,23 @@ public class NTriplesWriterFactory implements WriterFactory { } @Override - public RDFFormat getRdfFormat() { - return RDFFormat.NTRIPLES; + public TripleFormat getTripleFormat() { + return NTriplesWriter.Internal.FORMAT; } @Override - public String getIdentifier() { - return NTriplesWriterFactory.IDENTIFIER; + public Settings getSupportedSettings() { + return NTriplesWriter.Internal.SUPPORTED_SETTINGS; } @Override - public String getMimeType() { - return NTriplesWriterFactory.MIME_TYPE; + public String getIdentifier() { + return NTriplesWriterFactory.IDENTIFIER; } @Override - public FormatWriter getRdfWriter(OutputStream os) { - return new NTriplesWriter(os); + public TripleHandler getTripleWriter(OutputStream os, Settings settings) { + return new NTriplesWriter(os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/RDFWriterTripleHandler.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/RDFWriterTripleHandler.java b/core/src/main/java/org/apache/any23/writer/RDFWriterTripleHandler.java index aaf4105..c237ff5 100644 --- a/core/src/main/java/org/apache/any23/writer/RDFWriterTripleHandler.java +++ b/core/src/main/java/org/apache/any23/writer/RDFWriterTripleHandler.java @@ -17,6 +17,7 @@ package org.apache.any23.writer; +import org.apache.any23.configuration.Settings; import org.apache.any23.extractor.ExtractionContext; import org.apache.any23.rdf.RDFUtils; import org.eclipse.rdf4j.model.Resource; @@ -24,6 +25,16 @@ import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Value; import org.eclipse.rdf4j.rio.RDFHandlerException; import org.eclipse.rdf4j.rio.RDFWriter; +import 
org.eclipse.rdf4j.rio.RDFWriterFactory; +import org.eclipse.rdf4j.rio.WriterConfig; + +import java.io.BufferedWriter; +import java.io.Flushable; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.nio.charset.Charset; +import java.util.Optional; /** * A {@link TripleHandler} that writes @@ -32,25 +43,56 @@ import org.eclipse.rdf4j.rio.RDFWriter; * * @author Richard Cyganiak ([email protected]) * @author Michele Mostarda ([email protected]) + * @author Hans Brende ([email protected]) */ -public abstract class RDFWriterTripleHandler implements FormatWriter, TripleHandler { - - protected final RDFWriter writer; +public abstract class RDFWriterTripleHandler extends TripleWriterHandler implements FormatWriter { - private boolean closed = false; + private RDFWriter _writer; + private boolean writerStarted; + private final Flushable out; + private final TripleFormat format; /** * The annotation flag. */ private boolean annotated = false; - protected RDFWriterTripleHandler(RDFWriter destination) { - writer = destination; - try { - writer.startRDF(); - } catch (RDFHandlerException e) { - throw new RuntimeException(e); + static TripleFormat format(RDFWriterFactory rdf4j) { + return TripleFormat.of(rdf4j.getRDFFormat()); + } + + RDFWriterTripleHandler(RDFWriterFactory rdf4j, TripleFormat format, OutputStream out, Settings settings) { + this.format = format; + Optional<Charset> charset = format.getCharset(); + RDFWriter w; + if (!charset.isPresent()) { + this.out = out; + w = _writer = rdf4j.getWriter(out); + } else { + //use buffered writer if format supports encoding + BufferedWriter buf = new BufferedWriter(new OutputStreamWriter(out, charset.get())); + this.out = buf; + w = _writer = rdf4j.getWriter(buf); + } + configure(w.getWriterConfig(), settings); + } + + abstract void configure(WriterConfig config, Settings settings); + + RDFWriter writer() throws TripleHandlerException { + RDFWriter w = _writer; + if (w == 
null) { + throw new TripleHandlerException("writer has been closed!"); + } + if (!writerStarted) { + writerStarted = true; + try { + w.startRDF(); + } catch (RDFHandlerException e) { + throw new TripleHandlerException("Error while starting document", e); + } } + return w; } /** @@ -77,7 +119,7 @@ public abstract class RDFWriterTripleHandler implements FormatWriter, TripleHand @Override public void startDocument(IRI documentIRI) throws TripleHandlerException { - handleComment("OUTPUT FORMAT: " + writer.getRDFFormat()); + handleComment("OUTPUT FORMAT: " + format); } @Override @@ -86,25 +128,23 @@ public abstract class RDFWriterTripleHandler implements FormatWriter, TripleHand } @Override - public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) + public void writeTriple(Resource s, IRI p, Value o, Resource g) throws TripleHandlerException { - final IRI graph = g == null ? context.getDocumentIRI() : g; try { - writer.handleStatement( - RDFUtils.quad(s, p, o, graph)); + writer().handleStatement(RDFUtils.quad(s, p, o, g)); } catch (RDFHandlerException ex) { throw new TripleHandlerException( - String.format("Error while receiving triple: %s %s %s %s", s, p, o, graph), + String.format("Error while receiving triple: %s %s %s %s", s, p, o, g), ex ); } } @Override - public void receiveNamespace(String prefix, String uri, ExtractionContext context) + public void writeNamespace(String prefix, String uri) throws TripleHandlerException { try { - writer.handleNamespace(prefix, uri); + writer().handleNamespace(prefix, uri); } catch (RDFHandlerException ex) { throw new TripleHandlerException(String.format("Error while receiving namespace: %s:%s", prefix, uri), ex @@ -119,32 +159,36 @@ public abstract class RDFWriterTripleHandler implements FormatWriter, TripleHand @Override public void close() throws TripleHandlerException { - if (closed) return; - closed = true; + RDFWriter writer = _writer; + if (writer == null) { + return; + } + _writer = null; 
try { - writer.endRDF(); + if (!writerStarted) { + writer.startRDF(); + } + writer.endRDF(); //calls flush() } catch (RDFHandlerException e) { - throw new TripleHandlerException("Error while closing the triple handler.", e); + throw new TripleHandlerException("Error closing writer", e); } } @Override public void endDocument(IRI documentIRI) throws TripleHandlerException { - // Empty. - } - - @Override - public void setContentLength(long contentLength) { - // Empty. + try { + out.flush(); + } catch (IOException e) { + throw new TripleHandlerException("Error ending document", e); + } } private void handleComment(String comment) throws TripleHandlerException { if( !annotated ) return; try { - writer.handleComment(comment); + writer().handleComment(comment); } catch (RDFHandlerException rdfhe) { throw new TripleHandlerException("Error while handing comment.", rdfhe); } } - } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/RDFXMLWriter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/RDFXMLWriter.java b/core/src/main/java/org/apache/any23/writer/RDFXMLWriter.java index ecbf6ed..1f8c127 100644 --- a/core/src/main/java/org/apache/any23/writer/RDFXMLWriter.java +++ b/core/src/main/java/org/apache/any23/writer/RDFXMLWriter.java @@ -17,15 +17,38 @@ package org.apache.any23.writer; +import org.apache.any23.configuration.Settings; +import org.eclipse.rdf4j.rio.WriterConfig; + import java.io.OutputStream; /** - * <i>RDF/XML</i> writer implementation. + * <i>RDF/XML</i> {@link TripleWriter} implementation. 
+ * @author Hans Brende ([email protected]) */ -public class RDFXMLWriter extends RDFWriterTripleHandler implements FormatWriter { +public class RDFXMLWriter extends RDFWriterTripleHandler { + + static class Internal { + private static final org.eclipse.rdf4j.rio.rdfxml.RDFXMLWriterFactory rdf4j + = new org.eclipse.rdf4j.rio.rdfxml.RDFXMLWriterFactory(); + + //TODO support pretty printing with RDFXMLPrettyWriterFactory + + static final TripleFormat FORMAT = format(rdf4j); + + static final Settings SUPPORTED_SETTINGS = Settings.of(); + } + + @Override + void configure(WriterConfig config, Settings settings) { + } + + public RDFXMLWriter(OutputStream os) { + this(os, Settings.of()); + } - public RDFXMLWriter(OutputStream out) { - super( new org.eclipse.rdf4j.rio.rdfxml.RDFXMLWriter(out) ); + public RDFXMLWriter(OutputStream os, Settings settings) { + super(Internal.rdf4j, Internal.FORMAT, os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/RDFXMLWriterFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/RDFXMLWriterFactory.java b/core/src/main/java/org/apache/any23/writer/RDFXMLWriterFactory.java index c40bca3..a3dceb6 100644 --- a/core/src/main/java/org/apache/any23/writer/RDFXMLWriterFactory.java +++ b/core/src/main/java/org/apache/any23/writer/RDFXMLWriterFactory.java @@ -19,15 +19,15 @@ package org.apache.any23.writer; import java.io.OutputStream; -import org.eclipse.rdf4j.rio.RDFFormat; +import org.apache.any23.configuration.Settings; /** - * @author Peter Ansell [email protected] - * + * @author Peter Ansell ([email protected]) + * @author Hans Brende ([email protected]) */ -public class RDFXMLWriterFactory implements WriterFactory { +public class RDFXMLWriterFactory implements TripleWriterFactory { - public static final String MIME_TYPE = RDFFormat.RDFXML.getDefaultMIMEType(); + public static final String 
MIME_TYPE = RDFXMLWriter.Internal.FORMAT.getMimeType(); public static final String IDENTIFIER = "rdfxml"; /** @@ -37,23 +37,23 @@ public class RDFXMLWriterFactory implements WriterFactory { } @Override - public RDFFormat getRdfFormat() { - return RDFFormat.RDFXML; + public TripleFormat getTripleFormat() { + return RDFXMLWriter.Internal.FORMAT; } @Override - public String getIdentifier() { - return RDFXMLWriterFactory.IDENTIFIER; + public Settings getSupportedSettings() { + return RDFXMLWriter.Internal.SUPPORTED_SETTINGS; } @Override - public String getMimeType() { - return RDFXMLWriterFactory.MIME_TYPE; + public String getIdentifier() { + return IDENTIFIER; } @Override - public FormatWriter getRdfWriter(OutputStream os) { - return new RDFXMLWriter(os); + public TripleHandler getTripleWriter(OutputStream os, Settings settings) { + return new RDFXMLWriter(os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/TriXWriter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/TriXWriter.java b/core/src/main/java/org/apache/any23/writer/TriXWriter.java index 6ae871b..e048c17 100644 --- a/core/src/main/java/org/apache/any23/writer/TriXWriter.java +++ b/core/src/main/java/org/apache/any23/writer/TriXWriter.java @@ -17,17 +17,38 @@ package org.apache.any23.writer; +import org.apache.any23.configuration.Settings; +import org.eclipse.rdf4j.rio.WriterConfig; + import java.io.OutputStream; /** - * <a href="http://www.w3.org/2004/03/trix/">TriX</a> format writer implementation. + * <a href="http://www.w3.org/2004/03/trix/">TriX</a> {@link TripleWriter} implementation. 
* * @author Michele Mostarda ([email protected]) + * @author Hans Brende ([email protected]) */ -public class TriXWriter extends RDFWriterTripleHandler implements FormatWriter { +public class TriXWriter extends RDFWriterTripleHandler { + + static class Internal { + private static final org.eclipse.rdf4j.rio.trix.TriXWriterFactory rdf4j + = new org.eclipse.rdf4j.rio.trix.TriXWriterFactory(); + + static final TripleFormat FORMAT = format(rdf4j); + + static final Settings SUPPORTED_SETTINGS = Settings.of(); + } + + @Override + void configure(WriterConfig config, Settings settings) { + } + + public TriXWriter(OutputStream os) { + this(os, Settings.of()); + } - public TriXWriter(OutputStream out) { - super( new org.eclipse.rdf4j.rio.trix.TriXWriter(out) ); + public TriXWriter(OutputStream os, Settings settings) { + super(Internal.rdf4j, Internal.FORMAT, os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/TriXWriterFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/TriXWriterFactory.java b/core/src/main/java/org/apache/any23/writer/TriXWriterFactory.java index 0facc59..09fbfb8 100644 --- a/core/src/main/java/org/apache/any23/writer/TriXWriterFactory.java +++ b/core/src/main/java/org/apache/any23/writer/TriXWriterFactory.java @@ -19,15 +19,15 @@ package org.apache.any23.writer; import java.io.OutputStream; -import org.eclipse.rdf4j.rio.RDFFormat; +import org.apache.any23.configuration.Settings; /** * @author Peter Ansell [email protected] - * + * @author Hans Brende ([email protected]) */ -public class TriXWriterFactory implements WriterFactory { +public class TriXWriterFactory implements TripleWriterFactory { - public static final String MIME_TYPE = RDFFormat.TRIX.getDefaultMIMEType(); + public static final String MIME_TYPE = TriXWriter.Internal.FORMAT.getMimeType(); public static final String IDENTIFIER = "trix"; 
/** @@ -37,23 +37,23 @@ public class TriXWriterFactory implements WriterFactory { } @Override - public RDFFormat getRdfFormat() { - return RDFFormat.TRIX; + public TripleFormat getTripleFormat() { + return TriXWriter.Internal.FORMAT; } @Override - public String getIdentifier() { - return TriXWriterFactory.IDENTIFIER; + public Settings getSupportedSettings() { + return TriXWriter.Internal.SUPPORTED_SETTINGS; } @Override - public String getMimeType() { - return TriXWriterFactory.MIME_TYPE; + public String getIdentifier() { + return IDENTIFIER; } @Override - public FormatWriter getRdfWriter(OutputStream os) { - return new TriXWriter(os); + public TripleHandler getTripleWriter(OutputStream os, Settings settings) { + return new TriXWriter(os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/TripleWriterHandler.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/TripleWriterHandler.java b/core/src/main/java/org/apache/any23/writer/TripleWriterHandler.java new file mode 100644 index 0000000..56fcdc3 --- /dev/null +++ b/core/src/main/java/org/apache/any23/writer/TripleWriterHandler.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.writer; + +import org.apache.any23.extractor.ExtractionContext; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Resource; +import org.eclipse.rdf4j.model.Value; + +/** + * This class connects a {@link TripleHandler} to a {@link TripleWriter} by writing received data. + * + * @author Hans Brende ([email protected]) + */ +public abstract class TripleWriterHandler implements TripleHandler, TripleWriter { + + /** + * Writers may override this method to handle a "receiveTriple" extraction event. + * The default implementation calls: + * <pre> + * {@code this.writeTriple(s, p, o, context == null || g != null ? g : context.getDocumentIRI())} + * </pre> + * @param s the subject received + * @param p the predicate received + * @param o the object received + * @param g the graph name received, or null + * @param context the extraction context + * @throws TripleHandlerException if there was an error responding to a received triple + */ + @Override + public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) throws TripleHandlerException { + writeTriple(s, p, o, context == null || g != null ? g : context.getDocumentIRI()); + } + + /** + * Writers may override this method to handle a "receiveNamespace" extraction event. + * The default implementation calls: + * <pre> + * {@code this.writeNamespace(prefix, uri)} + * </pre> + * @param prefix namespace prefix. + * @param uri namespace <i>IRI</i>. + * @param context the extraction context + * @throws TripleHandlerException if there was an error responding to the received namespace. + */ + @Override + public void receiveNamespace(String prefix, String uri, ExtractionContext context) throws TripleHandlerException { + writeNamespace(prefix, uri); + } + + /** + * Writers may override this method to handle a "startDocument" extraction event. 
+ * The default implementation does nothing. + * @param documentIRI the name of the document that was started + * @throws TripleHandlerException if an error occurred while responding to a "startDocument" + * extraction event. + */ + @Override + public void startDocument(IRI documentIRI) throws TripleHandlerException { } + + /** + * Writers may override this method to handle an "openContext" extraction event. + * The default implementation does nothing. + * @param context the context that was opened + * @throws TripleHandlerException if an error occurred while responding to an "openContext" + * extraction event. + */ + @Override + public void openContext(ExtractionContext context) throws TripleHandlerException { } + + /** + * Writers may override this method to handle a "closeContext" extraction event. + * The default implementation does nothing. + * @param context the context to be closed. + * @throws TripleHandlerException if an error occurred while responding to a "closeContext" + * extraction event. + */ + @Override + public void closeContext(ExtractionContext context) throws TripleHandlerException { } + + /** + * Writers may override this method to handle an "endDocument" extraction event. + * The default implementation does nothing. + * @param documentIRI the document IRI. + * @throws TripleHandlerException if an error occurred while responding to an "endDocument" + * extraction event. + */ + @Override + public void endDocument(IRI documentIRI) throws TripleHandlerException { } + + /** + * Writers may override this method to handle a "setContentLength" extraction event. + * The default implementation does nothing. + * @param contentLength length of the content being processed. 
+ */ + @Override + public void setContentLength(long contentLength) { } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/TurtleWriter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/TurtleWriter.java b/core/src/main/java/org/apache/any23/writer/TurtleWriter.java index 0771fb4..31559c2 100644 --- a/core/src/main/java/org/apache/any23/writer/TurtleWriter.java +++ b/core/src/main/java/org/apache/any23/writer/TurtleWriter.java @@ -17,12 +17,57 @@ package org.apache.any23.writer; +import org.apache.any23.configuration.Settings; +import org.eclipse.rdf4j.common.net.ParsedIRI; +import org.eclipse.rdf4j.rio.RDFWriter; +import org.eclipse.rdf4j.rio.WriterConfig; +import org.eclipse.rdf4j.rio.helpers.BasicWriterSettings; + import java.io.OutputStream; +import java.io.Writer; +import java.net.URISyntaxException; /** - * <i>N3</i> notation writer. + * <i>N3</i> notation {@link TripleWriter} implementation. + * @author Hans Brende ([email protected]) */ -public class TurtleWriter extends RDFWriterTripleHandler implements FormatWriter { +public class TurtleWriter extends RDFWriterTripleHandler { + + static class Internal { + // rdf4j-internal ArrangedWriter + -ea causes AssertionError + // when writing example output of html-mf-hlisting extractor! + // Override to return rdf4j TurtleWriter instances instead. 
+ private static final org.eclipse.rdf4j.rio.turtle.TurtleWriterFactory rdf4j + = new org.eclipse.rdf4j.rio.turtle.TurtleWriterFactory() { + @Override + public RDFWriter getWriter(OutputStream out) { + return new org.eclipse.rdf4j.rio.turtle.TurtleWriter(out); + } + @Override + public RDFWriter getWriter(OutputStream out, String baseURI) throws URISyntaxException { + return new org.eclipse.rdf4j.rio.turtle.TurtleWriter(out, new ParsedIRI(baseURI)); + } + @Override + public RDFWriter getWriter(Writer writer) { + return new org.eclipse.rdf4j.rio.turtle.TurtleWriter(writer); + } + @Override + public RDFWriter getWriter(Writer writer, String baseURI) throws URISyntaxException { + return new org.eclipse.rdf4j.rio.turtle.TurtleWriter(writer, new ParsedIRI(baseURI)); + } + }; + + static final TripleFormat FORMAT = format(rdf4j); + + static final Settings SUPPORTED_SETTINGS = Settings.of( + WriterSettings.PRETTY_PRINT + ); + } + + @Override + void configure(WriterConfig config, Settings settings) { + config.set(BasicWriterSettings.PRETTY_PRINT, settings.get(WriterSettings.PRETTY_PRINT)); + } /** * Constructor. @@ -30,7 +75,11 @@ public class TurtleWriter extends RDFWriterTripleHandler implements FormatWriter * @param out stream to write on. 
*/ public TurtleWriter(OutputStream out) { - super(new org.eclipse.rdf4j.rio.turtle.TurtleWriter(out)); + this(out, Settings.of()); + } + + public TurtleWriter(OutputStream os, Settings settings) { + super(Internal.rdf4j, Internal.FORMAT, os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/TurtleWriterFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/TurtleWriterFactory.java b/core/src/main/java/org/apache/any23/writer/TurtleWriterFactory.java index 6a04e28..a0db985 100644 --- a/core/src/main/java/org/apache/any23/writer/TurtleWriterFactory.java +++ b/core/src/main/java/org/apache/any23/writer/TurtleWriterFactory.java @@ -19,15 +19,15 @@ package org.apache.any23.writer; import java.io.OutputStream; -import org.eclipse.rdf4j.rio.RDFFormat; +import org.apache.any23.configuration.Settings; /** * @author Peter Ansell [email protected] - * + * @author Hans Brende ([email protected]) */ -public class TurtleWriterFactory implements WriterFactory { +public class TurtleWriterFactory implements TripleWriterFactory { - public static final String MIME_TYPE = RDFFormat.TURTLE.getDefaultMIMEType(); + public static final String MIME_TYPE = TurtleWriter.Internal.FORMAT.getMimeType(); public static final String IDENTIFIER = "turtle"; /** @@ -37,23 +37,23 @@ public class TurtleWriterFactory implements WriterFactory { } @Override - public RDFFormat getRdfFormat() { - return RDFFormat.TURTLE; + public TripleFormat getTripleFormat() { + return TurtleWriter.Internal.FORMAT; } @Override - public String getIdentifier() { - return TurtleWriterFactory.IDENTIFIER; + public Settings getSupportedSettings() { + return TurtleWriter.Internal.SUPPORTED_SETTINGS; } @Override - public String getMimeType() { - return TurtleWriterFactory.MIME_TYPE; + public String getIdentifier() { + return IDENTIFIER; } @Override - public FormatWriter 
getRdfWriter(OutputStream os) { - return new TurtleWriter(os); + public TripleHandler getTripleWriter(OutputStream os, Settings settings) { + return new TurtleWriter(os, settings); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/URIListWriter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/URIListWriter.java b/core/src/main/java/org/apache/any23/writer/URIListWriter.java index f8faca5..ae3aecf 100644 --- a/core/src/main/java/org/apache/any23/writer/URIListWriter.java +++ b/core/src/main/java/org/apache/any23/writer/URIListWriter.java @@ -17,71 +17,70 @@ package org.apache.any23.writer; -import org.apache.any23.extractor.ExtractionContext; import org.eclipse.rdf4j.model.Resource; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Value; +import java.io.BufferedWriter; import java.io.OutputStream; -import java.io.PrintStream; -import java.util.ArrayList; -import java.util.List; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.TreeSet; /** * This writer simply produces a list of unique <i>IRI</i> present in the * subject or in the object of every single extracted <i>RDF Statement</i>. 
* * @author Davide Palmisano ([email protected]) + * @author Hans Brende ([email protected]) */ -public class URIListWriter implements FormatWriter { +public class URIListWriter extends TripleWriterHandler implements FormatWriter { - private List<Resource> resources; + private static final Charset charset = StandardCharsets.UTF_8; - private PrintStream printStream; + static final TripleFormat FORMAT = TripleFormat.of("URIList", + Collections.singleton(URIListWriterFactory.MIME_TYPE), charset, Collections.singleton("txt"), null, + TripleFormat.NONSTANDARD); - private ExtractionContext extractionContext; + private final TreeSet<String> resources = new TreeSet<>(); - private long contentLength; + private PrintWriter writer; public URIListWriter(OutputStream outputStream) { - this.resources = new ArrayList<Resource>(); - this.printStream = new PrintStream(outputStream); + writer = new PrintWriter(new BufferedWriter( + new OutputStreamWriter(outputStream, charset))); } - public void startDocument(IRI documentIRI) throws TripleHandlerException {} - - public void openContext(ExtractionContext context) throws TripleHandlerException { - this.extractionContext = context; - } - - public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) + @Override + public void writeTriple(Resource s, IRI p, Value o, Resource g) throws TripleHandlerException { - if(!this.resources.contains(s)) { - this.resources.add(s); - this.printStream.println(s.stringValue()); + String string; + if (s instanceof IRI && resources.add(string = s.stringValue())) { + writer.println(string); } - if(o instanceof Resource && !this.resources.contains(o)) { - this.resources.add((Resource) o); - this.printStream.println(o.stringValue()); + if (o instanceof IRI && resources.add(string = o.stringValue())) { + writer.println(string); } } - public void receiveNamespace(String prefix, String uri, ExtractionContext context) + @Override + public void writeNamespace(String prefix, String uri) 
throws TripleHandlerException { } - public void closeContext(ExtractionContext context) throws TripleHandlerException { - } - + @Override public void endDocument(IRI documentIRI) throws TripleHandlerException { + writer.flush(); } - public void setContentLength(long contentLength) { - this.contentLength = contentLength; - } - + @Override public void close() throws TripleHandlerException { - this.printStream.close(); + writer.flush(); + writer = null; + resources.clear(); } @Override @@ -93,4 +92,5 @@ public class URIListWriter implements FormatWriter { public void setAnnotated(boolean f) { // Empty. } + } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/URIListWriterFactory.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/URIListWriterFactory.java b/core/src/main/java/org/apache/any23/writer/URIListWriterFactory.java index 9defefc..84836aa 100644 --- a/core/src/main/java/org/apache/any23/writer/URIListWriterFactory.java +++ b/core/src/main/java/org/apache/any23/writer/URIListWriterFactory.java @@ -19,17 +19,18 @@ package org.apache.any23.writer; import java.io.OutputStream; -import org.eclipse.rdf4j.rio.RDFFormat; +import org.apache.any23.configuration.Settings; /** * @author Peter Ansell [email protected] - * + * @author Hans Brende ([email protected]) */ -public class URIListWriterFactory implements WriterFactory { +public class URIListWriterFactory implements TripleWriterFactory { public static final String MIME_TYPE = "text/plain"; public static final String IDENTIFIER = "uri"; + /** * */ @@ -37,22 +38,22 @@ public class URIListWriterFactory implements WriterFactory { } @Override - public RDFFormat getRdfFormat() { - throw new RuntimeException("This writer does not print RDF triples"); + public TripleFormat getTripleFormat() { + return URIListWriter.FORMAT; } @Override - public String getIdentifier() { - return 
URIListWriterFactory.IDENTIFIER; + public Settings getSupportedSettings() { + return Settings.of(); } @Override - public String getMimeType() { - return URIListWriterFactory.MIME_TYPE; + public String getIdentifier() { + return IDENTIFIER; } @Override - public FormatWriter getRdfWriter(OutputStream os) { + public TripleHandler getTripleWriter(OutputStream os, Settings settings) { return new URIListWriter(os); } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/WriterSettings.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/WriterSettings.java b/core/src/main/java/org/apache/any23/writer/WriterSettings.java new file mode 100644 index 0000000..40e3b26 --- /dev/null +++ b/core/src/main/java/org/apache/any23/writer/WriterSettings.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.writer; + +import org.apache.any23.configuration.Setting; + + +/** + * + * This class encapsulates commonly supported settings for {@link TripleWriter} implementations. 
+ * + * @author Hans Brende ([email protected]) + */ +public class WriterSettings { + private WriterSettings() { + throw new AssertionError(); + } + + // Keep identifiers short & sweet for ease of user's CLI usage! + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // (Since each WriterFactory must maintain its own pool of "supported settings", + // we don't need to worry about identifiers being globally unique. + // A single identifier could theoretically map to different keys--and + // therefore to different semantics--under different WriterFactory instances. + // Note that it is the *memory-based identity of the key*, not the + // key's textual identifier, that denotes the semantics for a given setting. + // However, since each Settings object is guaranteed to contain only one setting + // per identifier, we can be assured that identifiers will be unique on a + // per-WriterFactory basis.) + + /** + * Directive to writer that output should be printed in a way to maximize human readability. + */ + public static final Setting<Boolean> PRETTY_PRINT = Setting.newKey("pretty", Boolean.class) + .withValue(Boolean.TRUE); + + /** + * Directive to writer that at least the non-ASCII characters should be escaped. 
+ */ + public static final Setting<Boolean> PRINT_ASCII = Setting.newKey("ascii", Boolean.class) + .withValue(Boolean.FALSE); + + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/main/java/org/apache/any23/writer/package-info.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/writer/package-info.java b/core/src/main/java/org/apache/any23/writer/package-info.java index b49fd88..c245efb 100644 --- a/core/src/main/java/org/apache/any23/writer/package-info.java +++ b/core/src/main/java/org/apache/any23/writer/package-info.java @@ -17,6 +17,6 @@ /** * This package collects a set of {@link org.apache.any23.writer.TripleHandler} - * decorators and specific <i>RDF</i> format writers. + * decorators and specific <i>RDF</i> format {@link org.apache.any23.writer.TripleWriter} implementations. */ package org.apache.any23.writer; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/test/java/org/apache/any23/writer/JSONWriterTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/writer/JSONWriterTest.java b/core/src/test/java/org/apache/any23/writer/JSONWriterTest.java index 4c4fffd..0099d9f 100644 --- a/core/src/test/java/org/apache/any23/writer/JSONWriterTest.java +++ b/core/src/test/java/org/apache/any23/writer/JSONWriterTest.java @@ -35,7 +35,7 @@ public class JSONWriterTest { @Test public void testJSONWriting() throws TripleHandlerException, IOException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); - writeContent(new JSONWriter(baos)); + writeContentComplicated(new JSONWriter(baos)); final String expected = "{\n" @@ -64,12 +64,16 @@ public class JSONWriterTest { + " }, null ] ]\n" + "}"; Assert.assertEquals(expected, baos.toString()); + + baos.reset(); + writeContentSimple(new JSONWriter(baos)); + Assert.assertEquals(expected, baos.toString()); } 
@Test public void testJSONLDWriting() throws TripleHandlerException { ByteArrayOutputStream baos = new ByteArrayOutputStream(); - writeContent(new JSONLDWriter(baos)); + writeContentComplicated(new JSONLDWriter(baos)); final String expected = "[ {\n" + " \"@graph\" : [ {\n" + @@ -99,9 +103,37 @@ public class JSONWriterTest { " \"@id\" : \"http://graph/2\"\n" + "} ]"; Assert.assertEquals(expected, baos.toString()); + + baos.reset(); + writeContentSimple(new JSONLDWriter(baos)); + Assert.assertEquals(expected, baos.toString()); + } + + private void writeContentSimple(TripleWriter writer) throws TripleHandlerException { + writer.writeTriple(SimpleValueFactory.getInstance().createBNode("bn1"), + SimpleValueFactory.getInstance().createIRI("http://pred/1"), + SimpleValueFactory.getInstance().createIRI("http://value/1"), + SimpleValueFactory.getInstance().createIRI("http://graph/1")); + + writer.writeTriple(SimpleValueFactory.getInstance().createIRI("http://sub/2"), + SimpleValueFactory.getInstance().createIRI("http://pred/2"), + SimpleValueFactory.getInstance().createLiteral("language literal", "en"), + SimpleValueFactory.getInstance().createIRI("http://graph/2")); + + writer.writeTriple( + SimpleValueFactory.getInstance().createIRI("http://sub/3"), + SimpleValueFactory.getInstance().createIRI("http://pred/3"), + SimpleValueFactory.getInstance().createLiteral("123", + SimpleValueFactory.getInstance().createIRI("http://datatype")), + writer instanceof JSONLDWriter ? SimpleValueFactory.getInstance().createIRI("http://any23.org/tmp/") : null); + + writer.close(); + } - private void writeContent(FormatWriter writer) throws TripleHandlerException { + private void writeContentComplicated(TripleHandler writer) throws TripleHandlerException { + //creating a fake document uri in order to write triples is terrible. + //see improved solution in "writeContentSimple"! 
final IRI documentIRI = SimpleValueFactory.getInstance().createIRI("http://fake/uri"); writer.startDocument(documentIRI); writer.receiveTriple( @@ -127,6 +159,8 @@ public class JSONWriterTest { null ); } else if (writer instanceof JSONLDWriter) { + //creating a fake extraction context in order to write triples is terrible. + //see improved solution in "writeContentSimple"! ExtractionContext extractionContext = new ExtractionContext("rdf-nq", SimpleValueFactory.getInstance().createIRI("http://any23.org/tmp/")); writer.receiveTriple( SimpleValueFactory.getInstance().createIRI("http://sub/3"), http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/core/src/test/java/org/apache/any23/writer/WriterRegistryTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/writer/WriterRegistryTest.java b/core/src/test/java/org/apache/any23/writer/WriterRegistryTest.java index ec0ccf0..fc0f09a 100644 --- a/core/src/test/java/org/apache/any23/writer/WriterRegistryTest.java +++ b/core/src/test/java/org/apache/any23/writer/WriterRegistryTest.java @@ -22,11 +22,13 @@ import java.util.Collection; import java.util.HashSet; import java.util.List; import java.util.Set; + +import org.apache.any23.configuration.Settings; import org.junit.Assert; import org.junit.Test; /** - * Test case for {@link WriterRegistry}. + * Test case for {@link WriterFactoryRegistry}. 
* * @author Michele Mostarda ([email protected]) */ @@ -71,8 +73,16 @@ public class WriterRegistryTest { public void testGetWriterInstanceByIdentifier() { final List<String> ids = target.getIdentifiers(); final ByteArrayOutputStream baos = new ByteArrayOutputStream(); - for(String id : ids) { - Assert.assertNotNull( target.getWriterInstanceByIdentifier(id, baos) ); + final CompositeTripleHandler delegate = new CompositeTripleHandler(); + for (String id : ids) { + WriterFactory f = target.getWriterByIdentifier(id); + if (f instanceof TripleWriterFactory) { + Assert.assertNotNull(((TripleWriterFactory) f).getTripleWriter(baos, Settings.of())); + } else if (f instanceof DecoratingWriterFactory) { + Assert.assertNotNull(((DecoratingWriterFactory) f).getTripleWriter(delegate, Settings.of())); + } else { + Assert.fail(id + " is not a valid writer factory"); + } } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/service/src/main/java/org/apache/any23/servlet/WebResponder.java ---------------------------------------------------------------------- diff --git a/service/src/main/java/org/apache/any23/servlet/WebResponder.java b/service/src/main/java/org/apache/any23/servlet/WebResponder.java index 024bf70..9640b17 100644 --- a/service/src/main/java/org/apache/any23/servlet/WebResponder.java +++ b/service/src/main/java/org/apache/any23/servlet/WebResponder.java @@ -25,10 +25,12 @@ import java.security.cert.CertificateException; import java.util.ArrayList; import java.util.Collection; import java.util.List; +import java.util.stream.Collectors; import javax.servlet.ServletOutputStream; import javax.servlet.http.HttpServletResponse; import org.apache.any23.Any23; import org.apache.any23.ExtractionReport; +import org.apache.any23.configuration.Settings; import org.apache.any23.extractor.ExtractionException; import org.apache.any23.extractor.ExtractionParameters; import org.apache.any23.extractor.Extractor; @@ -41,6 +43,7 @@ import 
org.apache.any23.validator.XMLValidationReportSerializer; import org.apache.any23.writer.CompositeTripleHandler; import org.apache.any23.writer.CountingTripleHandler; import org.apache.any23.writer.FormatWriter; +import org.apache.any23.writer.TripleWriterFactory; import org.apache.any23.writer.ReportingTripleHandler; import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandlerException; @@ -315,19 +318,24 @@ class WebResponder { private boolean initRdfWriter(String format, boolean report, boolean annotate) throws IOException { final WriterFactory factory = getFormatWriter(format); - if (factory == null) { + if (!(factory instanceof TripleWriterFactory)) { sendError( 400, - "Invalid format '" + format + "', try one of: [rdfxml, turtle, ntriples, nquads, trix, json]", + "Invalid format '" + format + "', try one of: " + + writerRegistry.getWriters().stream() + .filter(f -> f instanceof TripleWriterFactory) + .map(WriterFactory::getIdentifier).collect(Collectors.toList()), null, null, report ); return false; } - FormatWriter fw = factory.getRdfWriter(byteOutStream); - fw.setAnnotated(annotate); - outputMediaType = factory.getMimeType(); + TripleHandler fw = ((TripleWriterFactory) factory).getTripleWriter(byteOutStream, Settings.of()); + if (fw instanceof FormatWriter) { + ((FormatWriter)fw).setAnnotated(annotate); + } + outputMediaType = ((TripleWriterFactory) factory).getTripleFormat().getMimeType(); List<TripleHandler> tripleHandlers = new ArrayList<>(); tripleHandlers.add(new IgnoreAccidentalRDFa(fw)); tripleHandlers.add(new CountingTripleHandler()); http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/test-resources/src/test/resources/cli/basic-with-stylesheet.html ---------------------------------------------------------------------- diff --git a/test-resources/src/test/resources/cli/basic-with-stylesheet.html b/test-resources/src/test/resources/cli/basic-with-stylesheet.html new file mode 100644 index 0000000..6348000 --- 
/dev/null +++ b/test-resources/src/test/resources/cli/basic-with-stylesheet.html @@ -0,0 +1,29 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<html xml:lang="en" xmlns="http://www.w3.org/1999/xhtml"> +<head> + <link rel="stylesheet" href="https://www.data.gov/app/plugins/simple-tooltips/zebra_tooltips.css?ver=4.9.1"> +</head> +<body> +<div xmlns:dc="http://purl.org/dc/terms/" xmlns:fake="http://fake.org/"> + <h2 property="dc:title">The trouble with Bob</h2> + <h3 property="dc:creator">Alice</h3> + <h3 property="fake:prop">Mary</h3> + ... +</div> +</body> +</html> \ No newline at end of file
