ANY23-396 Overhaul WriterFactory API
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/692c583f Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/692c583f Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/692c583f Branch: refs/heads/master Commit: 692c583f848c5b7ae5a7940c857bfb0a9542c0d5 Parents: 12640a9 Author: Hans <[email protected]> Authored: Fri Sep 14 10:29:33 2018 -0500 Committer: Hans <[email protected]> Committed: Tue Oct 23 14:14:26 2018 -0500 ---------------------------------------------------------------------- api/pom.xml | 6 + .../org/apache/any23/configuration/Setting.java | 269 ++++++++++++++++++ .../apache/any23/configuration/Settings.java | 156 +++++++++++ .../any23/writer/DecoratingWriterFactory.java | 45 +++ .../org/apache/any23/writer/TripleFormat.java | 239 ++++++++++++++++ .../org/apache/any23/writer/TripleWriter.java | 60 ++++ .../any23/writer/TripleWriterFactory.java | 128 +++++++++ .../org/apache/any23/writer/WriterFactory.java | 45 ++- .../any23/writer/WriterFactoryRegistry.java | 271 ++++++++++++------- .../any23/configuration/SettingsTest.java | 227 ++++++++++++++++ .../apache/any23/writer/TripleFormatTest.java | 57 ++++ .../main/java/org/apache/any23/cli/Rover.java | 80 +++++- .../apache/any23/cli/ExtractorsFlowTest.java | 93 +++++++ .../java/org/apache/any23/cli/RoverTest.java | 39 +++ .../apache/any23/cli/flows/PeopleExtractor.java | 113 ++++++++ .../any23/cli/flows/PeopleExtractorFactory.java | 39 +++ .../org.apache.any23.writer.WriterFactory | 1 + .../org/apache/any23/writer/JSONLDWriter.java | 36 ++- .../any23/writer/JSONLDWriterFactory.java | 22 +- .../org/apache/any23/writer/JSONWriter.java | 57 ++-- .../apache/any23/writer/JSONWriterFactory.java | 30 +- .../org/apache/any23/writer/NQuadsWriter.java | 35 ++- .../any23/writer/NQuadsWriterFactory.java | 26 +- .../org/apache/any23/writer/NTriplesWriter.java | 31 ++- .../any23/writer/NTriplesWriterFactory.java | 26 +- 
.../any23/writer/RDFWriterTripleHandler.java | 104 +++++-- .../org/apache/any23/writer/RDFXMLWriter.java | 31 ++- .../any23/writer/RDFXMLWriterFactory.java | 26 +- .../org/apache/any23/writer/TriXWriter.java | 29 +- .../apache/any23/writer/TriXWriterFactory.java | 24 +- .../any23/writer/TripleWriterHandler.java | 114 ++++++++ .../org/apache/any23/writer/TurtleWriter.java | 55 +++- .../any23/writer/TurtleWriterFactory.java | 24 +- .../org/apache/any23/writer/URIListWriter.java | 66 ++--- .../any23/writer/URIListWriterFactory.java | 21 +- .../org/apache/any23/writer/WriterSettings.java | 59 ++++ .../org/apache/any23/writer/package-info.java | 2 +- .../org/apache/any23/writer/JSONWriterTest.java | 40 ++- .../apache/any23/writer/WriterRegistryTest.java | 16 +- .../org/apache/any23/servlet/WebResponder.java | 18 +- .../resources/cli/basic-with-stylesheet.html | 29 ++ 41 files changed, 2446 insertions(+), 343 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/pom.xml ---------------------------------------------------------------------- diff --git a/api/pom.xml b/api/pom.xml index ae275bd..748db36 100644 --- a/api/pom.xml +++ b/api/pom.xml @@ -43,6 +43,12 @@ <groupId>org.eclipse.rdf4j</groupId> <artifactId>rdf4j-rio-api</artifactId> </dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> </dependencies> <build> http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/main/java/org/apache/any23/configuration/Setting.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/configuration/Setting.java b/api/src/main/java/org/apache/any23/configuration/Setting.java new file mode 100644 index 0000000..6932afd --- /dev/null +++ b/api/src/main/java/org/apache/any23/configuration/Setting.java @@ -0,0 +1,269 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.configuration; + +import java.lang.reflect.GenericArrayType; +import java.lang.reflect.ParameterizedType; +import java.lang.reflect.Type; +import java.lang.reflect.TypeVariable; +import java.util.HashMap; +import java.util.Optional; +import java.util.regex.Pattern; + +/** + * Represents a {@link Setting.Key Key} paired with a compatible value. + * + * @author Hans Brende ([email protected]) + */ +public final class Setting<V> { + + /** + * Convenience method for creating a new setting key with the specified identifier and value class. + * If the desired value type is a {@link ParameterizedType} such as {@code List<String>}, + * or custom value-checking is required, then this method is not appropriate; instead, + * extend the {@link Key} class directly. 
+ * + * @param identifier a unique identifier for this key + * @param valueType the type of value allowed by this key + * @return a new {@link Key} instance initialized with the specified identifier and value type + * @throws IllegalArgumentException if the identifier or value type is invalid + */ + public static <V> Key<V> newKey(String identifier, Class<V> valueType) { + return new Key<V>(identifier, valueType) {}; + } + + /** + * Represents the key for a {@link Setting}. + */ + public static abstract class Key<V> { + private final String identifier; + private final Type valueType; + + private Key(String identifier, Class<V> valueType) { + this.identifier = checkIdentifier(identifier); + if ((this.valueType = valueType) == null) { + throw new IllegalArgumentException("value type cannot be null"); + } + + if (valueType.isArray()) { + throw new IllegalArgumentException(identifier + " value class must be immutable"); + } else if (valueType.getTypeParameters().length != 0) { + throw new IllegalArgumentException(identifier + " setting key must fill in type parameters for " + valueType.toGenericString()); + } else if (valueType.isPrimitive()) { + //ensure using primitive wrapper classes + //so that Class.isInstance(), etc. will work as expected + throw new IllegalArgumentException(identifier + " value class cannot be primitive"); + } + } + + private static final Pattern identifierPattern = Pattern.compile("[a-z][0-9a-z]*(\\.[a-z][0-9a-z]*)*"); + private static String checkIdentifier(String identifier) { + if (identifier == null) { + throw new IllegalArgumentException("identifier cannot be null"); + } + if (!identifierPattern.matcher(identifier).matches()) { + throw new IllegalArgumentException("identifier does not match " + identifierPattern.pattern()); + } + return identifier; + } + + /** + * Constructs a new key with the specified identifier. 
+ * @param identifier the identifier for this key + * @throws IllegalArgumentException if the identifier is invalid, or the value type was determined to be invalid + */ + protected Key(String identifier) { + this.identifier = checkIdentifier(identifier); + + Type type = valueType = getValueType(); + + if (type instanceof Class) { + if (((Class) type).isArray()) { + throw new IllegalArgumentException(identifier + " value class must be immutable"); + } else if (((Class) type).getTypeParameters().length != 0) { + throw new IllegalArgumentException(identifier + " setting key must fill in type parameters for " + ((Class) type).toGenericString()); + } + } else if (type instanceof GenericArrayType) { + throw new IllegalArgumentException(identifier + " value class must be immutable"); + } else if (type instanceof TypeVariable) { + throw new IllegalArgumentException("Invalid setting key type 'Key<" + type.getTypeName() + ">' for identifier " + identifier); + } else if (!(type instanceof ParameterizedType)) { + throw new IllegalArgumentException(identifier + " invalid key type " + type + " (" + type.getClass().getName() + ")"); + } + } + + private Type getValueType() { + HashMap<TypeVariable<?>, Type> mapping = new HashMap<>(); + Class<?> rawType = getClass(); + assert rawType != Key.class; + for (;;) { + Type superclass = rawType.getGenericSuperclass(); + if (superclass instanceof ParameterizedType) { + rawType = (Class)((ParameterizedType) superclass).getRawType(); + Type[] args = ((ParameterizedType) superclass).getActualTypeArguments(); + if (Key.class.equals(rawType)) { + Type t = args[0]; + return mapping.getOrDefault(t, t); + } + TypeVariable<?>[] vars = rawType.getTypeParameters(); + for (int i = 0, len = vars.length; i < len; i++) { + Type t = args[i]; + mapping.put(vars[i], t instanceof TypeVariable ? 
mapping.get(t) : t); + } + } else { + rawType = (Class<?>)superclass; + if (Key.class.equals(rawType)) { + throw new IllegalArgumentException(getClass() + " does not supply type arguments"); + } + } + } + } + + /** + * Subclasses may override this method to check that new settings for this key are valid. + * The default implementation of this method throws a {@link NullPointerException} if the new value is null and the initial value was non-null. + * + * @param initial the setting containing the initial value for this key, or null if the setting has not yet been initialized + * @param newValue the new value for this setting + * @throws Exception if the new value for this setting was invalid + */ + protected void checkValue(Setting<V> initial, V newValue) throws Exception { + if (newValue == null && initial != null && initial.value != null) { + throw new NullPointerException(); + } + } + + private Setting<V> checked(Setting<V> origin, V value) { + try { + checkValue(origin, value); + } catch (Exception e) { + throw new IllegalArgumentException("invalid value for key '" + identifier + "': " + value, e); + } + return new Setting<>(this, value); + } + + /** + * @return a new {@link Setting} object with this key and the supplied value. 
+ * + * @throws IllegalArgumentException if the new value was invalid, as determined by: + * <pre> + * {@code this.checkValue(null, value)} + * </pre> + * + * @see #checkValue(Setting, V) + */ + public final Setting<V> withValue(V value) { + return checked(null, value); + } + + /** + * @param o the object to check for equality + * @return {@code this == o} + */ + public final boolean equals(Object o) { + return super.equals(o); + } + + /** + * @return the identity-based hashcode of this key + */ + public final int hashCode() { + return super.hashCode(); + } + + public String toString() { + return identifier + ": " + valueType.getTypeName(); + } + } + + private final Key<V> key; + private final V value; + + private Setting(Key<V> key, V value) { + this.key = key; + this.value = value; + } + + /** + * @return the identifier for this setting + */ + public String getIdentifier() { + return key.identifier; + } + + /** + * @return the value for this setting + */ + public V getValue() { + return value; + } + + /** + * @return the type of value supported for this setting + */ + public Type getValueType() { + return key.valueType; + } + + /** + * @return the supplied setting, if it has the same key as this setting + */ + @SuppressWarnings("unchecked") + public final Optional<Setting<V>> cast(Setting<?> setting) { + return setting == null || setting.key != this.key ? Optional.empty() : Optional.of((Setting<V>)setting); + } + + /** + * @return a new {@link Setting} object with this setting's {@link Key Key} and the supplied value. + * + * @throws IllegalArgumentException if the new value was invalid, as determined by: + * <pre> + * {@code this.key.checkValue(this, newValue)} + * </pre> + * + * @see Key#checkValue(Setting, V) + */ + public Setting<V> withValue(V newValue) { + return key.checked(this, newValue); + } + + /** + * @return true if the supplied object is an instance of {@link Setting} and has the same key and value as this object. 
+ */ + @Override + public boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Setting)) return false; + + Setting<?> setting = (Setting<?>) o; + + if (key != setting.key) return false; + return value != null ? value.equals(setting.value) : setting.value == null; + } + + @Override + public int hashCode() { + return 31 * key.hashCode() + (value != null ? value.hashCode() : 0); + } + + @Override + public String toString() { + return key.identifier + "=" + value; + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/main/java/org/apache/any23/configuration/Settings.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/configuration/Settings.java b/api/src/main/java/org/apache/any23/configuration/Settings.java new file mode 100644 index 0000000..1289be3 --- /dev/null +++ b/api/src/main/java/org/apache/any23/configuration/Settings.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.configuration; + +import java.util.AbstractSet; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.Map; +import java.util.Optional; +import java.util.Set; + +/** + * This class represents an <i>immutable</i> {@link Set} of {@link Setting} objects, + * with the additional property that no two settings having the same {@link Setting#getIdentifier() identifier} + * can be simultaneously present in a {@code Settings} object. + * + * @author Hans Brende ([email protected]) + */ +public final class Settings extends AbstractSet<Setting<?>> { + + private static final Settings EMPTY_SETTINGS = new Settings(Collections.emptyMap()); + + private final Map<String, Setting<?>> values; + + private Settings(Map<String, Setting<?>> values) { + this.values = values; + } + + /** + * Returns the setting with the same {@link Setting.Key Key} as the supplied setting, if present. + */ + public <E> Optional<Setting<E>> find(Setting<E> setting) { + return setting.cast(values.get(setting.getIdentifier())); + } + + /** + * Returns the value set for {@code defaultSetting}'s {@link Setting.Key Key}, if present. + * Otherwise, returns {@code defaultSetting}'s value. 
+ * <br><br> + * This method is semantically equivalent to: + * <br><br> + * <pre> + * {@code find(defaultSetting).orElse(defaultSetting).getValue()} + * </pre> + */ + public <E> E get(Setting<E> defaultSetting) { + return find(defaultSetting).orElse(defaultSetting).getValue(); + } + + + /////////////////////////////////////// + // AbstractSet overrides + /////////////////////////////////////// + + @Override + public boolean contains(Object o) { + if (!(o instanceof Setting<?>)) { + return false; + } + return o.equals(values.get(((Setting<?>) o).getIdentifier())); + } + + @Override + public int size() { + return values.size(); + } + + @Override + public Iterator<Setting<?>> iterator() { + return values.values().iterator(); + } + + /////////////////////////////////////// + // public constructors + /////////////////////////////////////// + + /** + * Returns an empty {@link Settings} object. + */ + public static Settings of() { + return EMPTY_SETTINGS; + } + + /** + * Returns a singleton {@link Settings} object, containing only the supplied setting. + */ + public static Settings of(Setting<?> s) { + return new Settings(Collections.singletonMap(s.getIdentifier(), s)); + } + + /** + * Returns a {@link Settings} object containing the supplied settings. + * Settings are never overwritten: supplying two settings with the same identifier is rejected. + * @throws IllegalArgumentException if any two settings have the same identifier + */ + public static Settings of(Setting<?>... settings) { + Map<String, Setting<?>> map = mapForSize(settings.length); + for (Setting<?> s : settings) put(map, s); + return ofModifiable(map); + } + + /** + * Returns a {@link Settings} object containing the supplied settings. + * @throws IllegalArgumentException if any two settings have the same identifier + */ + public static Settings of(Collection<? 
extends Setting<?>> c) { + if (c instanceof Settings) { + return (Settings)c; + } + int size = c.size(); + if (size == 0) { + return EMPTY_SETTINGS; + } + Map<String, Setting<?>> map = mapForSize(size); + for (Setting<?> s : c) put(map, s); + return ofModifiable(map); + } + + /////////////////////////////////////// + // Private static helpers + /////////////////////////////////////// + + private static Settings ofModifiable(Map<String, Setting<?>> map) { + return new Settings(Collections.unmodifiableMap(map)); + } + + private static void put(Map<String, Setting<?>> map, Setting<?> setting) { + Setting<?> existing = map.put(setting.getIdentifier(), setting); + if (existing != null) { + throw new IllegalArgumentException(setting.getIdentifier() + " is already defined"); + } + } + + private static final float loadFactor = 0.75f; + private static Map<String, Setting<?>> mapForSize(int size) { + return new HashMap<>((int)(size / loadFactor) + 1, loadFactor); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/main/java/org/apache/any23/writer/DecoratingWriterFactory.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/writer/DecoratingWriterFactory.java b/api/src/main/java/org/apache/any23/writer/DecoratingWriterFactory.java new file mode 100644 index 0000000..cc66372 --- /dev/null +++ b/api/src/main/java/org/apache/any23/writer/DecoratingWriterFactory.java @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.writer; + +import org.apache.any23.configuration.Settings; + +/** + * Base interface used for constructors of decorating {@link TripleHandler} implementations. + * @author Hans Brende ([email protected]) + */ +public interface DecoratingWriterFactory extends BaseWriterFactory<TripleHandler> { + + /** + * + * @return the settings supported by handlers produced by this factory + */ + @Override + Settings getSupportedSettings(); + + /** + * @param delegate the {@link TripleHandler} to delegate input to + * @param settings the settings with which to configure the returned handler + * @return a {@link TripleHandler} which writes to the specified delegate + * @throws NullPointerException if the delegate or settings is null + * @throws IllegalArgumentException if the settings are not correctly configured + */ + @Override + TripleHandler getTripleWriter(TripleHandler delegate, Settings settings); + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/main/java/org/apache/any23/writer/TripleFormat.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/writer/TripleFormat.java b/api/src/main/java/org/apache/any23/writer/TripleFormat.java new file mode 100644 index 0000000..01292eb --- /dev/null +++ b/api/src/main/java/org/apache/any23/writer/TripleFormat.java @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.writer; + +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.ValueFactory; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; +import org.eclipse.rdf4j.rio.RDFFormat; + +import java.nio.charset.Charset; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Locale; +import java.util.Optional; +import java.util.stream.Collectors; + +/** + * @author Hans Brende ([email protected]) + */ +public class TripleFormat { + private final String name; + private final IRI standardIRI; + private final List<String> mimeTypes; + private final Charset charset; + private final List<String> fileExtensions; + private final Capabilities capabilities; + RDFFormat rdfFormat; + + private static final ValueFactory vf = SimpleValueFactory.getInstance(); + + private static final int WRITES_TRIPLES = 1; + private static final int WRITES_GRAPHS = 1 << 1; + private static final int WRITES_NAMESPACES = 1 << 2; + + public static final Capabilities NONSTANDARD = new Capabilities(0); + public static final Capabilities TRIPLES = new Capabilities(WRITES_TRIPLES); + public static final Capabilities QUADS = new Capabilities(WRITES_TRIPLES | WRITES_GRAPHS); + public static final Capabilities TRIPLES_AND_NAMESPACES = 
TRIPLES.withNamespaces(); + public static final Capabilities QUADS_AND_NAMESPACES = QUADS.withNamespaces(); + + public static class Capabilities { + private final int raw; + + private Capabilities(int raw) { + this.raw = raw; + } + + public boolean has(Capabilities other) { + int oraw = other.raw; + return (raw & oraw) == oraw; + } + + private Capabilities withNamespaces() { + return new Capabilities(raw | WRITES_NAMESPACES); + } + + //TODO: add "supportsComments()" + } + + private static IllegalArgumentException mimeTypeErr(String mt) { + return new IllegalArgumentException(mt + " is not a valid mimetype"); + } + + private static IllegalArgumentException extensionErr(String ext) { + return new IllegalArgumentException(ext + " is not a valid extension"); + } + + private static <E> E checkNonNull(E object, String name) { + if (object == null) { + throw new IllegalArgumentException(name + " must not be null"); + } + return object; + } + + //see https://tools.ietf.org/html/rfc2045#section-5.1 + private static void checkMimeTypes(List<String> mts) { + if (checkNonNull(mts, "mimetypes").isEmpty()) { + throw new IllegalArgumentException("mimetypes must not be empty"); + } + for (String mt : mts) { + boolean slash = false; + for (int i = 0, len = checkNonNull(mt, "mimetype").length(); i < len; i++) { + char ch = mt.charAt(i); + if (ch <= ' ' || ch >= 127 || ch == '(' || ch == ')' || + ch == '<' || ch == '>' || ch == '@' || ch == ',' || + ch == ';' || ch == ':' || ch == '\\' || ch == '"' || + ch == '[' || ch == ']' || ch == '?' 
|| ch == '=' + //also disallow wildcards: + || ch == '*') { + throw mimeTypeErr(mt); + } else if (ch == '/') { + if (slash || i == 0 || i + 1 == len) { + throw mimeTypeErr(mt); + } + slash = true; + } + } + if (!slash) { + throw mimeTypeErr(mt); + } + } + } + + private static void checkExtensions(List<String> exts) { + for (String ext : checkNonNull(exts, "extensions")) { + int illegalDot = 0; + for (int i = 0, len = checkNonNull(ext, "extension").length(); i < len; i++) { + char ch = ext.charAt(i); + if (ch <= ' ' || ch >= 127 || ch == '<' || ch == '>' || + ch == ':' || ch == '"' || ch == '/' || ch == '\\' || + ch == '|' || ch == '?' || ch == '*') { + throw extensionErr(ext); + } else if (ch == '.') { + int next = i + 1; + if (i == illegalDot || next == len) { + throw extensionErr(ext); + } + illegalDot = next; + } + } + } + } + + private static String normalizeMimeType(String mt) { + return mt.toLowerCase(Locale.ENGLISH); + } + + private static String normalizeExtension(String ext) { + return ext.toLowerCase(Locale.ENGLISH); + } + + private TripleFormat(String name, Collection<String> mimeTypes, Charset charset, + Collection<String> fileExtensions, String standardIRI, Capabilities capabilities) { + this.name = checkNonNull(name, "display name"); + checkMimeTypes(this.mimeTypes = Collections.unmodifiableList(mimeTypes.stream() + .map(TripleFormat::normalizeMimeType).distinct().collect(Collectors.toList()))); + if ((this.charset = charset) != null && !charset.canEncode()) { + throw new IllegalArgumentException(charset + " does not allow encoding"); + } + checkExtensions(this.fileExtensions = Collections.unmodifiableList(fileExtensions.stream() + .map(TripleFormat::normalizeExtension).distinct().collect(Collectors.toList()))); + this.standardIRI = standardIRI == null ? 
null : vf.createIRI(standardIRI); + this.capabilities = checkNonNull(capabilities, "capabilities"); + } + + public static TripleFormat of(String displayName, Collection<String> mimeTypes, Charset defaultCharset, + Collection<String> fileExtensions, String standardIRI, Capabilities capabilities) { + return new TripleFormat(displayName, mimeTypes, defaultCharset, fileExtensions, standardIRI, capabilities); + } + + public Optional<Charset> getCharset() { + return Optional.ofNullable(charset); + } + + static Capabilities capabilities(RDFFormat format) { + if (format.supportsContexts()) { + return format.supportsNamespaces() ? QUADS_AND_NAMESPACES : QUADS; + } else { + return format.supportsNamespaces() ? TRIPLES_AND_NAMESPACES : TRIPLES; + } + } + + private static String iri(IRI iri) { + return iri == null ? null : iri.stringValue(); + } + + static TripleFormat of(RDFFormat format) { + TripleFormat f = of(format.getName(), format.getMIMETypes(), + format.getCharset(), format.getFileExtensions(), iri(format.getStandardURI()), + capabilities(format)); + f.rdfFormat = format; + return f; + } + + RDFFormat toRDFFormat() { + RDFFormat fmt = rdfFormat; + if (fmt != null) { + return fmt; + } + Capabilities capabilities = this.capabilities; + if (!capabilities.has(TRIPLES)) { + throw new UnsupportedOperationException("This format does not print RDF triples"); + } + return rdfFormat = new RDFFormat(name, mimeTypes, charset, fileExtensions, standardIRI, + capabilities.has(TRIPLES_AND_NAMESPACES), capabilities.has(QUADS)); + } + + public Optional<IRI> getStandardIRI() { + return Optional.ofNullable(standardIRI); + } + + public List<String> getMimeTypes() { + return mimeTypes; + } + + public String getMimeType() { + return mimeTypes.get(0); + } + + public List<String> getExtensions() { + return fileExtensions; + } + + public Optional<String> getExtension() { + return fileExtensions.isEmpty() ? 
Optional.empty() : Optional.of(fileExtensions.get(0)); + } + + public Capabilities getCapabilities() { + return capabilities; + } + + public String getDisplayName() { + return name; + } + + public String toString() { + return name + mimeTypes.stream().collect( + Collectors.joining(", ", " (mimeTypes=", "; ")) + + fileExtensions.stream().collect( + Collectors.joining(", ", "ext=", ")")); + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/main/java/org/apache/any23/writer/TripleWriter.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/writer/TripleWriter.java b/api/src/main/java/org/apache/any23/writer/TripleWriter.java new file mode 100644 index 0000000..3800045 --- /dev/null +++ b/api/src/main/java/org/apache/any23/writer/TripleWriter.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.writer; + +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Resource; +import org.eclipse.rdf4j.model.Value; + +/** + * Base interface for triple writers that don't need an extraction context to write triples + * + * @author Hans Brende ([email protected]) + */ +public interface TripleWriter extends AutoCloseable { + + /** + * Writes a triple and, optionally, a graph resource name. + * @param s the subject to write + * @param p the predicate to write + * @param o the object to write + * @param g the graph name to write, or null + * @throws TripleHandlerException if there is an error writing the triple + */ + void writeTriple(Resource s, IRI p, Value o, Resource g) throws TripleHandlerException; + + /** + * Writes a prefix-namespace mapping. <br><b>NOTE:</b> this method should be called + * <b>before</b> writing out any triples. Calling this method <b>after</b> writing + * out a triple may result in the prefix-namespace mapping being ignored. + * @param prefix the namespace prefix + * @param uri the namespace uri + * @throws TripleHandlerException if there was an error writing out the prefix-namespace mapping + */ + void writeNamespace(String prefix, String uri) throws TripleHandlerException; + + /** + * Releases resources associated with this {@link TripleWriter}, and flushes (but by default does not close) + * any underlying {@link java.io.OutputStream}s. Future invocations of methods of this writer + * produce <b>undefined behavior</b> after this method has been called. 
+ * @throws TripleHandlerException if there was an error closing this {@link TripleWriter} + */ + @Override + void close() throws TripleHandlerException; + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/main/java/org/apache/any23/writer/TripleWriterFactory.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/writer/TripleWriterFactory.java b/api/src/main/java/org/apache/any23/writer/TripleWriterFactory.java new file mode 100644 index 0000000..20d4995 --- /dev/null +++ b/api/src/main/java/org/apache/any23/writer/TripleWriterFactory.java @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.writer; + +import org.apache.any23.configuration.Settings; +import org.apache.any23.extractor.ExtractionContext; +import org.eclipse.rdf4j.common.lang.FileFormat; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Resource; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.rio.RDFFormat; + +import java.io.OutputStream; + +/** + * Base interface for constructors of {@link TripleHandler} implementations + * that write to an {@link OutputStream} using a particular {@link FileFormat}. + * @author Hans Brende ([email protected]) + */ +public interface TripleWriterFactory extends BaseWriterFactory<OutputStream> { + + /** + * @deprecated since 2.3. Use {@link #getTripleFormat()} instead. + */ + @Override + @Deprecated + default RDFFormat getRdfFormat() { + return getTripleFormat().toRDFFormat(); + } + + /** + * @return the format used to write to {@link OutputStream}s + */ + TripleFormat getTripleFormat(); + + /** + * @deprecated since 2.3. Use {@link #getTripleFormat()}.{@link TripleFormat#getMimeType() getMimeType()} instead. + */ + @Override + @Deprecated + default String getMimeType() { + return getTripleFormat().getMimeType(); + } + + /** + * @deprecated since 2.3. Use {@link #getTripleWriter(OutputStream, Settings)} instead. + */ + @Override + @Deprecated + default FormatWriter getRdfWriter(OutputStream os) { + TripleHandler th = getTripleWriter(os, Settings.of()); + return th instanceof FormatWriter ? 
(FormatWriter)th : new FormatWriter() { + @Override + public boolean isAnnotated() { + return false; + } + @Override + public void setAnnotated(boolean f) {} + @Override + public void startDocument(IRI documentIRI) throws TripleHandlerException { + th.startDocument(documentIRI); + } + @Override + public void openContext(ExtractionContext context) throws TripleHandlerException { + th.openContext(context); + } + @Override + public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) throws TripleHandlerException { + th.receiveTriple(s, p, o, g, context); + } + @Override + public void receiveNamespace(String prefix, String uri, ExtractionContext context) throws TripleHandlerException { + th.receiveNamespace(prefix, uri, context); + } + @Override + public void closeContext(ExtractionContext context) throws TripleHandlerException { + th.closeContext(context); + } + @Override + public void endDocument(IRI documentIRI) throws TripleHandlerException { + th.endDocument(documentIRI); + } + @Override + public void setContentLength(long contentLength) { + th.setContentLength(contentLength); + } + @Override + public void close() throws TripleHandlerException { + th.close(); + } + }; + } + + + /** + * + * @return the settings supported by writers produced by this factory + */ + @Override + Settings getSupportedSettings(); + + + /** + * @param out the {@link OutputStream} to write to + * @param settings the settings with which to configure the writer + * @return a {@link TripleHandler} which writes to the specified {@link OutputStream} + * @throws NullPointerException if the output stream or settings is null + * @throws IllegalArgumentException if the settings are not correctly configured + */ + @Override + TripleHandler getTripleWriter(OutputStream out, Settings settings); + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/main/java/org/apache/any23/writer/WriterFactory.java 
---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/writer/WriterFactory.java b/api/src/main/java/org/apache/any23/writer/WriterFactory.java index 3012beb..060177b 100644 --- a/api/src/main/java/org/apache/any23/writer/WriterFactory.java +++ b/api/src/main/java/org/apache/any23/writer/WriterFactory.java @@ -19,18 +19,59 @@ package org.apache.any23.writer; import java.io.OutputStream; +import org.apache.any23.configuration.Settings; import org.eclipse.rdf4j.rio.RDFFormat; /** - * @author Peter Ansell [email protected] - * + * The superinterface of all {@link TripleHandler} factory interfaces. + * Do not implement this interface directly. Instead, implement one of the subinterfaces {@link TripleWriterFactory} or {@link DecoratingWriterFactory}. + * @author Peter Ansell ([email protected]) + * @author Hans Brende ([email protected]) */ public interface WriterFactory { + + /** + * @deprecated since 2.3. Use {@link TripleWriterFactory#getTripleFormat()} instead. + */ + @Deprecated RDFFormat getRdfFormat(); String getIdentifier(); + /** + * @deprecated since 2.3. Use {@link TripleWriterFactory#getTripleFormat()}.{@link TripleFormat#getMimeType() getMimeType()} instead. + */ + @Deprecated String getMimeType(); + /** + * @deprecated since 2.3. Use {@link TripleWriterFactory#getTripleWriter(OutputStream, Settings)} instead. 
+ */ + @Deprecated FormatWriter getRdfWriter(OutputStream os); } + +interface BaseWriterFactory<Output> extends WriterFactory { + + Settings getSupportedSettings(); + + TripleHandler getTripleWriter(Output output, Settings settings); + + @Override + @Deprecated + default FormatWriter getRdfWriter(OutputStream os) { + throw new UnsupportedOperationException("this class does not support getRdfWriter()"); + } + + @Override + @Deprecated + default String getMimeType() { + throw new UnsupportedOperationException("this class does not support getMimeType()"); + } + + @Override + @Deprecated + default RDFFormat getRdfFormat() { + throw new UnsupportedOperationException("this class does not support getRdfFormat()"); + } +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/main/java/org/apache/any23/writer/WriterFactoryRegistry.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/writer/WriterFactoryRegistry.java b/api/src/main/java/org/apache/any23/writer/WriterFactoryRegistry.java index cbe5f9a..64830d8 100644 --- a/api/src/main/java/org/apache/any23/writer/WriterFactoryRegistry.java +++ b/api/src/main/java/org/apache/any23/writer/WriterFactoryRegistry.java @@ -19,15 +19,21 @@ package org.apache.any23.writer; import java.io.OutputStream; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.ServiceConfigurationError; import java.util.ServiceLoader; +import java.util.concurrent.CopyOnWriteArrayList; +import java.util.concurrent.CopyOnWriteArraySet; +import org.apache.any23.configuration.Settings; +import org.eclipse.rdf4j.rio.RDFFormat; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -35,6 +41,7 @@ import org.slf4j.LoggerFactory; * Registry class for 
{@link WriterFactory}s. * * @author Michele Mostarda ([email protected]) + * @author Hans Brende ([email protected]) */ public class WriterFactoryRegistry { @@ -43,54 +50,54 @@ public class WriterFactoryRegistry { /** * Singleton instance. */ - private static WriterFactoryRegistry instance; + private static class InstanceHolder { + private static final WriterFactoryRegistry instance = new WriterFactoryRegistry(); + } + + private static final WriterFactory[] EMPTY_WRITERS = new WriterFactory[0]; /** * List of registered writers. */ - private final List<WriterFactory> writers = - new ArrayList<>(); + private final List<WriterFactory> writers = new CopyOnWriteArrayList<>(); /** - * MIME Type to {@link FormatWriter} class. + * MIME Type to {@link WriterFactory} class. */ - private final Map<String,List<WriterFactory>> mimeToWriter = - new HashMap<>(); + private final Map<String, List<WriterFactory>> mimeToWriter = Collections.synchronizedMap(new HashMap<>()); /** - * Identifier to {@link FormatWriter} class. + * Identifier to {@link WriterFactory} class. 
*/ - private final Map<String,WriterFactory> idToWriter = - new HashMap<>(); + private final Map<String, WriterFactory> idToWriter = new HashMap<>(); - private List<String> identifiers = new ArrayList<>(); + private final List<String> identifiers = new CopyOnWriteArrayList<>(); + + private final Collection<String> mimeTypes = new CopyOnWriteArraySet<>(); public WriterFactoryRegistry() { - ServiceLoader<WriterFactory> serviceLoader = java.util.ServiceLoader.load(WriterFactory.class, this.getClass().getClassLoader()); - - Iterator<WriterFactory> iterator = serviceLoader.iterator(); + ServiceLoader<WriterFactory> serviceLoader = java.util.ServiceLoader.load(WriterFactory.class, this.getClass().getClassLoader()); + + Iterator<WriterFactory> iterator = serviceLoader.iterator(); // use while(true) loop so that we can isolate all service loader errors from .next and .hasNext to a single service - while(true) - { - try - { - if(!iterator.hasNext()) - break; - - WriterFactory factory = iterator.next(); - - this.register(factory); - } - catch(ServiceConfigurationError error) - { - LOG.error("Found error loading a WriterFactory", error); - } - } + + ArrayList<WriterFactory> factories = new ArrayList<>(); + while (true) { + try { + if (!iterator.hasNext()) + break; + factories.add(iterator.next()); + } catch(ServiceConfigurationError error) { + LOG.error("Found error loading a WriterFactory", error); + } + } + + registerAll(factories.toArray(EMPTY_WRITERS)); } /** - * Reads the identifier specified for the given {@link FormatWriter}. + * Reads the identifier specified for the given {@link WriterFactory}. * * @param writerClass writer class. * @return identifier. @@ -100,97 +107,182 @@ public class WriterFactoryRegistry { } /** - * Reads the <i>MIME Type</i> specified for the given {@link FormatWriter}. + * Reads the <i>MIME Type</i> specified for the given {@link WriterFactory}. * * @param writerClass writer class. * @return MIME type. 
*/ public static String getMimeType(WriterFactory writerClass) { - return writerClass.getMimeType(); + if (writerClass instanceof TripleWriterFactory) { + return ((TripleWriterFactory)writerClass).getTripleFormat().getMimeType(); + } else if (writerClass instanceof DecoratingWriterFactory) { + return null; + } else { + return reportAndGetCompatFormat(writerClass).getMimeType(); + } } /** * @return the {@link WriterFactoryRegistry} singleton instance. */ - public static synchronized WriterFactoryRegistry getInstance() { - if(instance == null) { - instance = new WriterFactoryRegistry(); + public static WriterFactoryRegistry getInstance() { + return InstanceHolder.instance; + } + + @SuppressWarnings("deprecation") + private static TripleFormat reportAndGetCompatFormat(WriterFactory f) { + LOG.warn("{} must implement either {} or {}.", f.getClass(), TripleWriterFactory.class, DecoratingWriterFactory.class); + final String mimeType = f.getMimeType(); + RDFFormat fmt; + try { + fmt = f.getRdfFormat(); + } catch (RuntimeException e) { + return TripleFormat.of(mimeType, Collections.singleton(mimeType), null, + Collections.emptySet(), null, TripleFormat.NONSTANDARD); } - return instance; + if (mimeType == null || fmt.hasDefaultMIMEType(mimeType)) { + return TripleFormat.of(fmt); + } + //override default MIME type on mismatch + return TripleFormat.of(fmt.getName(), Collections.singleton(mimeType), fmt.getCharset(), + fmt.getFileExtensions(), fmt.getStandardURI().stringValue(), TripleFormat.capabilities(fmt)); + } + + private static TripleWriterFactory getCompatFactory(WriterFactory f) { + final TripleFormat format = reportAndGetCompatFormat(f); + return new TripleWriterFactory() { + @Override + public TripleFormat getTripleFormat() { + return format; + } + + @Override + @SuppressWarnings("deprecation") + public TripleHandler getTripleWriter(OutputStream os, Settings settings) { + return f.getRdfWriter(os); + } + + @Override + public Settings getSupportedSettings() { + return 
Settings.of(); + } + + @Override + public String getIdentifier() { + return f.getIdentifier(); + } + }; } /** * Registers a new {@link WriterFactory} to the registry. * - * @param writerClass the class of the writer to be registered. + * @param f the writer factory to be registered. * @throws IllegalArgumentException if the id or the mimetype are null * or empty strings or if the identifier has been already defined. */ - public synchronized void register(WriterFactory writerClass) { - if(writerClass == null) + public void register(WriterFactory f) { + if (f == null) throw new NullPointerException("writerClass cannot be null."); - final String id = writerClass.getIdentifier(); - final String mimeType = writerClass.getMimeType(); - if(id == null || id.trim().length() == 0) { - throw new IllegalArgumentException("Invalid identifier returned by writer " + writerClass); + registerAll(new WriterFactory[]{f}); + } + + private void registerAll(WriterFactory[] factories) { + final int count = factories.length; + if (count == 0) { + return; } - if(mimeType == null || mimeType.trim().length() == 0) { - throw new IllegalArgumentException("Invalid MIME type returned by writer " + writerClass); + final HashMap<String, ArrayList<WriterFactory>> mimes = new HashMap<>(); + final String[] ids = new String[count]; + + for (int i = 0; i < count; i++) { + WriterFactory f = factories[i]; + if (!(f instanceof BaseWriterFactory<?>)) { + //backwards compatibility: view vanilla WriterFactory as TripleWriterFactory + f = factories[i] = getCompatFactory(f); + } + final String id = ids[i] = f.getIdentifier(); + if (id == null || id.trim().isEmpty()) { + throw new IllegalArgumentException("Invalid identifier returned by writer " + f); + } + if (f instanceof TripleWriterFactory) { + String mimeType = ((TripleWriterFactory)f).getTripleFormat().getMimeType(); + if (mimeType == null || mimeType.trim().isEmpty()) { + throw new IllegalArgumentException("Invalid MIME type returned by writer " + f); + 
} + mimes.computeIfAbsent(mimeType, k -> new ArrayList<>()).add(f); + } + } + + final List<String> idList = Arrays.asList(ids); + final List<WriterFactory> factoryList = Arrays.asList(factories); + final Map<String, WriterFactory> idToWriter; + synchronized (idToWriter = this.idToWriter) { + for (int i = 0; i < count; i++) { + String id = ids[i]; + if (idToWriter.putIfAbsent(id, factories[i]) != null) { + idToWriter.keySet().removeAll(idList.subList(0, i)); + throw new IllegalArgumentException("The writer identifier is already declared: " + id); + } + } } - if(idToWriter.containsKey(id)) - throw new IllegalArgumentException("The writer identifier is already declared."); - - writers.add(writerClass); - identifiers.add(writerClass.getIdentifier()); - List<WriterFactory> writerClasses = mimeToWriter.get(mimeType); - if(writerClasses == null) { - writerClasses = new ArrayList<>(); - mimeToWriter.put(mimeType, writerClasses); + //add in bulk to reduce writes to CopyOnWriteArrayList + writers.addAll(factoryList); + identifiers.addAll(idList); + for (Map.Entry<String, ArrayList<WriterFactory>> entry : mimes.entrySet()) { + String mimeType = entry.getKey(); + mimeTypes.add(mimeType); + mimeToWriter.computeIfAbsent(mimeType, k -> new CopyOnWriteArrayList<>()).addAll(entry.getValue()); } - writerClasses.add(writerClass); - idToWriter.put(id, writerClass); } /** - * Verifies if a {@link FormatWriter} with given <code>id</code> identifier has been registered. + * Verifies if a {@link WriterFactory} with given <code>id</code> identifier has been registered. * * @param id identifier. * @return <code>true</code> if the identifier has been registered, <code>false</code> otherwise. */ - public synchronized boolean hasIdentifier(String id) { - return idToWriter.containsKey(id); + public boolean hasIdentifier(String id) { + synchronized (idToWriter) { + return idToWriter.containsKey(id); + } } /** * @return the list of all the specified identifiers. 
*/ - public synchronized List<String> getIdentifiers() { + public List<String> getIdentifiers() { + //no synchronized block needed for CopyOnWriteArrayList return Collections.unmodifiableList(identifiers); } /** - * @return the list of MIME types covered by the registered {@link FormatWriter}s. + * @return the list of MIME types covered by the registered {@link WriterFactory} instances. */ - public synchronized Collection<String> getMimeTypes() { - return Collections.unmodifiableCollection(mimeToWriter.keySet()); + public Collection<String> getMimeTypes() { + //no synchronized block needed for CopyOnWriteArraySet + return Collections.unmodifiableCollection(mimeTypes); } /** - * @return the list of all the registered {@link FormatWriter}s. + * @return the list of all the registered {@link WriterFactory} instances. */ - public synchronized List<WriterFactory> getWriters() { + public List<WriterFactory> getWriters() { + //no synchronized block needed for CopyOnWriteArrayList return Collections.unmodifiableList(writers); } /** - * Returns the {@link FormatWriter} identified by <code>id</code>. + * Returns the {@link WriterFactory} identified by <code>id</code>. * * @param id the writer identifier. - * @return the class of the {@link FormatWriter} matching the <code>id</code> - * or <code>null</code> if not found.s + * @return the {@link WriterFactory} matching the <code>id</code> + * or <code>null</code> if not found. */ - public synchronized WriterFactory getWriterByIdentifier(String id) { - return idToWriter.get(id); + public WriterFactory getWriterByIdentifier(String id) { + synchronized (idToWriter) { + return idToWriter.get(id); + } } /** @@ -199,42 +291,29 @@ public class WriterFactoryRegistry { * @param mimeType a MIMEType. * @return a list of matching writers or an empty list. 
*/ - public synchronized Collection<WriterFactory> getWritersByMimeType(String mimeType) { - return mimeToWriter.get(mimeType); + public Collection<WriterFactory> getWritersByMimeType(String mimeType) { + //no synchronized block needed for synchronized map + //return CopyOnWriteArrayList to avoid ConcurrentModificationExceptions on iteration + List<WriterFactory> list = mimeToWriter.get(mimeType); + return list != null ? Collections.unmodifiableList(list) : Collections.emptyList(); } /** - * Returns an instance of {@link FormatWriter} ready to write on the given <code>os</code> + * Returns an instance of {@link FormatWriter} ready to write on the given * {@link OutputStream}. * - * @param id the identifier of the {@link FormatWriter} to crate an instance. + * @param id the identifier of the {@link FormatWriter} to instantiate. * @param os the output stream. * @return the not <code>null</code> {@link FormatWriter} instance. * @throws NullPointerException if the <code>id</code> doesn't match any registered writer. - */ - public synchronized FormatWriter getWriterInstanceByIdentifier(String id, OutputStream os) { - final WriterFactory writerClazz = getWriterByIdentifier(id); - if(writerClazz == null) - throw new NullPointerException( - String.format("Cannot find writer with id '%s' .", id) - ); - return createWriter(writerClazz, os); - } - - /** - * Crates a writer instance. * - * @param clazz class to instantiate. - * @param os output stream to pass as constructor argument. - * @return created instance. - * @throws IllegalArgumentException if an error occurs during instantiation. + * @deprecated since 2.3. Use {@link #getWriterByIdentifier(String)} + * in combination with {@link TripleWriterFactory#getTripleWriter(OutputStream, Settings)} instead. 
*/ - private FormatWriter createWriter(WriterFactory clazz, OutputStream os) { - try { - return clazz.getRdfWriter(os); - } catch (Exception e) { - throw new IllegalArgumentException("Error while initializing format writer " + clazz + " .", e); - } + @Deprecated + public FormatWriter getWriterInstanceByIdentifier(String id, OutputStream os) { + return Objects.requireNonNull(getWriterByIdentifier(id), + "Cannot find writer with id " + id).getRdfWriter(os); } } http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/test/java/org/apache/any23/configuration/SettingsTest.java ---------------------------------------------------------------------- diff --git a/api/src/test/java/org/apache/any23/configuration/SettingsTest.java b/api/src/test/java/org/apache/any23/configuration/SettingsTest.java new file mode 100644 index 0000000..a5a7b6e --- /dev/null +++ b/api/src/test/java/org/apache/any23/configuration/SettingsTest.java @@ -0,0 +1,227 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.configuration; + +import org.junit.Test; + +import java.lang.reflect.ParameterizedType; +import java.lang.reflect.Type; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertSame; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; + +@SuppressWarnings("ResultOfMethodCallIgnored") +public class SettingsTest { + + @Test + public void testNonNullSetting() { + Setting<String> nonNull = Setting.newKey("nulltest", String.class).withValue("A nonnull string"); + try { + nonNull.withValue(null); + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + } + + @Test + public void testNullableSetting() { + Setting<String> nullable = Setting.newKey("nulltest", String.class).withValue(null); + assertNull(nullable.withValue(null).getValue()); + } + + @Test + public void testDuplicateIdentifiers() { + try { + Setting<String> first = Setting.newKey("foo", String.class).withValue(""); + Setting<String> second = Setting.newKey("foo", String.class).withValue(""); + + Settings.of(first, second); + + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + } + + @Test + public void testFind() { + Setting<String> key = Setting.newKey("foo", String.class).withValue("key"); + Setting<String> element = key.withValue("element"); + + Settings settings = Settings.of(element); + + Optional<Setting<String>> actual = settings.find(key); + + assertTrue(actual.isPresent()); + + assertSame(element, actual.get()); + + assertTrue(settings.contains(element)); + assertFalse(settings.contains(key)); + } + + @Test + public void testGetPresentSetting() { + Setting<String> key = Setting.newKey("foo", String.class).withValue("key"); + + Setting<String> actual = 
key.withValue("actual"); + Settings settings = Settings.of(actual); + + assertSame(actual.getValue(), settings.get(key)); + } + + @Test + public void testGetAbsentSetting() { + Setting<String> key = Setting.newKey("foo", String.class).withValue("key"); + + Setting<String> actual = Setting.newKey("foo", String.class).withValue("actual"); + Settings settings = Settings.of(actual); + + assertSame(key.getValue(), settings.get(key)); + } + + @Test + public void testGetNullSetting() { + Setting.Key<String> baseKey = Setting.newKey("foo", String.class); + + Settings settings = Settings.of(baseKey.withValue(null)); + assertNull(settings.get(baseKey.withValue("not null"))); + } + + @Test + public void testSettingType() { + assertEquals(CharSequence.class, Setting.newKey("foo", CharSequence.class).withValue("").getValueType()); + assertEquals(CharSequence.class, new Setting.Key<CharSequence>("foo"){}.withValue("").getValueType()); + + Type mapType = new Setting.Key<Map<String, Integer>>( + "foo"){}.withValue(Collections.emptyMap()).getValueType(); + + assertTrue(mapType instanceof ParameterizedType); + assertEquals("java.util.Map<java.lang.String, java.lang.Integer>", mapType.getTypeName()); + + class Key0<Bar, V> extends Setting.Key<V> { + Key0() { + super("foo"); + } + } + + class Key2<Baz, V, Bar> extends Key0<V, Bar> { } + + class Key3<V> extends Key2<Boolean, Integer, List<Optional<String>>> { } + + class Key4 extends Key3<Boolean> { } + + Type complicatedType = new Key4().withValue(Collections.emptyList()).getValueType(); + + assertTrue(complicatedType instanceof ParameterizedType); + assertEquals("java.util.List<java.util.Optional<java.lang.String>>", complicatedType.getTypeName()); + + class Key3Simple<V> extends Key2<Boolean, Integer, String> { } + + class Key4Simple extends Key3Simple<Boolean> { } + + Type simpleType = new Key4Simple().withValue("").getValueType(); + + assertEquals(String.class, simpleType); + } + + + + @Test + public void testBadSetting() { + try 
{ + new Setting.Key("foo") {}; + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + + try { + Setting.newKey("foo", null); + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + + try { + Setting.newKey(null, Integer.class); + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + + try { + Setting.newKey(" ", Integer.class); + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + + try { + Setting.newKey("foo", boolean.class); + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + + try { + Setting.newKey("foo", Integer[].class); + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + + try { + new Setting.Key<Integer[]>("foo") {}; + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + + try { + new Setting.Key<List<Integer>[]>("foo") {}; + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + + class BadKeyCreator { + private <V> void badKey() { + new Setting.Key<V>("foo") {}; + } + } + + try { + new BadKeyCreator().badKey(); + fail(); + } catch (IllegalArgumentException e) { + //test passes; ignore + } + } + + +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/api/src/test/java/org/apache/any23/writer/TripleFormatTest.java ---------------------------------------------------------------------- diff --git a/api/src/test/java/org/apache/any23/writer/TripleFormatTest.java b/api/src/test/java/org/apache/any23/writer/TripleFormatTest.java new file mode 100644 index 0000000..f91202c --- /dev/null +++ b/api/src/test/java/org/apache/any23/writer/TripleFormatTest.java @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.any23.writer; + +import org.eclipse.rdf4j.rio.RDFFormat; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotSame; +import static org.junit.Assert.assertSame; + +public class TripleFormatTest { + + @Test + public void testRdf4jRoundTripping() { + + RDFFormat[] formats = { + RDFFormat.TRIX, RDFFormat.NQUADS, RDFFormat.RDFA, RDFFormat.TRIG, + RDFFormat.N3, RDFFormat.RDFXML, RDFFormat.TURTLE, RDFFormat.JSONLD, + RDFFormat.NTRIPLES, RDFFormat.BINARY, RDFFormat.RDFJSON + }; + + for (RDFFormat expected : formats) { + TripleFormat tf = TripleFormat.of(expected); + + RDFFormat actual = tf.toRDFFormat(); + assertSame(expected, actual); + + tf.rdfFormat = null; + actual = tf.toRDFFormat(); + assertNotSame(expected, actual); + + assertEquals(expected.getName(), actual.getName()); + assertEquals(expected.getStandardURI(), actual.getStandardURI()); + assertEquals(expected.getCharset(), actual.getCharset()); + assertEquals(expected.getFileExtensions(), actual.getFileExtensions()); + assertEquals(expected.supportsContexts(), actual.supportsContexts()); + assertEquals(expected.supportsNamespaces(), actual.supportsNamespaces()); + } + + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/cli/src/main/java/org/apache/any23/cli/Rover.java ---------------------------------------------------------------------- 
diff --git a/cli/src/main/java/org/apache/any23/cli/Rover.java b/cli/src/main/java/org/apache/any23/cli/Rover.java index 5b49b39..ef912f7 100644 --- a/cli/src/main/java/org/apache/any23/cli/Rover.java +++ b/cli/src/main/java/org/apache/any23/cli/Rover.java @@ -25,13 +25,18 @@ import com.beust.jcommander.converters.FileConverter; import org.apache.any23.Any23; import org.apache.any23.configuration.Configuration; import org.apache.any23.configuration.DefaultConfiguration; +import org.apache.any23.configuration.Setting; +import org.apache.any23.configuration.Settings; import org.apache.any23.extractor.ExtractionParameters; import org.apache.any23.extractor.ExtractionParameters.ValidationMode; import org.apache.any23.filter.IgnoreAccidentalRDFa; import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments; import org.apache.any23.source.DocumentSource; import org.apache.any23.writer.BenchmarkTripleHandler; +import org.apache.any23.writer.DecoratingWriterFactory; +import org.apache.any23.writer.TripleWriterFactory; import org.apache.any23.writer.LoggingTripleHandler; +import org.apache.any23.writer.NTriplesWriterFactory; import org.apache.any23.writer.ReportingTripleHandler; import org.apache.any23.writer.TripleHandler; import org.apache.any23.writer.TripleHandlerException; @@ -41,12 +46,16 @@ import org.slf4j.LoggerFactory; import java.io.File; import java.io.FileNotFoundException; +import java.io.OutputStream; import java.io.PrintStream; import java.io.PrintWriter; import java.net.MalformedURLException; import java.net.URL; +import java.util.Collections; import java.util.LinkedList; import java.util.List; +import java.util.ListIterator; +import java.util.Objects; import static java.lang.String.format; @@ -57,15 +66,42 @@ import static java.lang.String.format; * @author Michele Mostarda ([email protected]) * @author Richard Cyganiak ([email protected]) * @author Gabriele Renzi + * @author Hans Brende ([email protected]) */ @Parameters(commandNames = { "rover" }, 
commandDescription = "Any23 Command Line Tool.") public class Rover extends BaseTool { - private static final List<String> FORMATS = WriterFactoryRegistry.getInstance().getIdentifiers(); + private static final Logger logger = LoggerFactory.getLogger(Rover.class); - private static final int DEFAULT_FORMAT_INDEX = 0; + private static final WriterFactoryRegistry registry = WriterFactoryRegistry.getInstance(); + private static final String DEFAULT_WRITER_IDENTIFIER = NTriplesWriterFactory.IDENTIFIER; + + static { + final Setting<Boolean> ALWAYS_SUPPRESS_CSS_TRIPLES = Setting.newKey( + "alwayssuppresscsstriples", Boolean.class) + .withValue(Boolean.TRUE); + final Settings supportedSettings = Settings.of(ALWAYS_SUPPRESS_CSS_TRIPLES); + + registry.register(new DecoratingWriterFactory() { + + @Override + public TripleHandler getTripleWriter(TripleHandler delegate, Settings settings) { + boolean always = settings.get(ALWAYS_SUPPRESS_CSS_TRIPLES); + return new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(delegate), always); + } + + @Override + public Settings getSupportedSettings() { + return supportedSettings; + } + + @Override + public String getIdentifier() { + return "notrivial"; + } + }); + } - private static final Logger logger = LoggerFactory.getLogger(Rover.class); @Parameter( names = { "-o", "--output" }, @@ -80,8 +116,10 @@ public class Rover extends BaseTool { @Parameter(names = { "-e", "--extractors" }, description = "a comma-separated list of extractors, e.g. rdf-xml,rdf-turtle") private List<String> extractors = new LinkedList<>(); - @Parameter(names = { "-f", "--format" }, description = "the output format") - private String format = FORMATS.get(DEFAULT_FORMAT_INDEX); + @Parameter(names = { "-f", "--format" }, description = "a comma-separated list of writer factories, e.g. 
notrivial,nquads") + private List<String> formats = new LinkedList<String>() {{ + add(DEFAULT_WRITER_IDENTIFIER); + }}; @Parameter( names = { "-l", "--log" }, @@ -93,7 +131,7 @@ public class Rover extends BaseTool { @Parameter(names = { "-s", "--stats" }, description = "Print out extraction statistics.") private boolean statistics; - @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones).") + @Parameter(names = { "-t", "--notrivial" }, description = "Filter trivial statements (e.g. CSS related ones). [DEPRECATED: As of version 2.3, use --format instead.]") private boolean noTrivial; @Parameter(names = { "-p", "--pedantic" }, description = "Validate and fixes HTML content detecting commons issues.") @@ -127,16 +165,28 @@ public class Rover extends BaseTool { outputStream = out; } + private static TripleHandler getWriter(String id, OutputStream os) { + TripleWriterFactory f = (TripleWriterFactory)registry.getWriterByIdentifier(id); + Objects.requireNonNull(f, () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers()); + return f.getTripleWriter(os, Settings.of()); //TODO parse TripleWriter settings from format list + } + + private static TripleHandler getWriter(String id, TripleHandler delegate) { + DecoratingWriterFactory f = (DecoratingWriterFactory)registry.getWriterByIdentifier(id); + Objects.requireNonNull(f, () -> "Invalid writer id '" + id + "'; admitted values: " + registry.getIdentifiers()); + return f.getTripleWriter(delegate, Settings.of()); //TODO parse delegate settings from format list + } + protected void configure() { - try { - tripleHandler = WriterFactoryRegistry.getInstance().getWriterInstanceByIdentifier(format, outputStream); - } catch (Exception e) { - throw new NullPointerException( - format("Invalid output format '%s', admitted values: %s", - format, - FORMATS - ) - ); + List<String> formats = this.formats; + if (formats.isEmpty()) { + formats = 
Collections.singletonList(DEFAULT_WRITER_IDENTIFIER); + } + ListIterator<String> l = formats.listIterator(formats.size()); + tripleHandler = getWriter(l.previous(), outputStream); + + while (l.hasPrevious()) { + tripleHandler = getWriter(l.previous(), tripleHandler); } if (logFile != null) { http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/cli/src/test/java/org/apache/any23/cli/ExtractorsFlowTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/ExtractorsFlowTest.java b/cli/src/test/java/org/apache/any23/cli/ExtractorsFlowTest.java new file mode 100644 index 0000000..0b75f57 --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/ExtractorsFlowTest.java @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.any23.cli; + +import org.apache.any23.cli.flows.PeopleExtractor; +import org.apache.any23.rdf.RDFUtils; +import org.apache.commons.io.FileUtils; +import org.eclipse.rdf4j.model.Model; +import org.eclipse.rdf4j.model.impl.TreeModel; +import org.eclipse.rdf4j.rio.Rio; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.BufferedInputStream; +import java.io.File; +import java.io.FileInputStream; +import java.lang.invoke.MethodHandles; +import java.util.Arrays; +import java.util.stream.Stream; + +/** + * This is an example for task ANY23-396 + * + * @author Jacek Grzebyta ([email protected]) + * @author Hans Brende ([email protected]) + */ +public class ExtractorsFlowTest extends ToolTestBase { + + private static final String testingDatafile = "/org/apache/any23/extractor/csv/test-comma.csv"; + private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + public ExtractorsFlowTest() { + super(Rover.class); + } + + /** + * Emulates action described in ANY23-396. 
+ */ + @Test + public void runTestFor396() throws Exception { + File outputFile = File.createTempFile("mockdata-", ".ttl", tempDirectory); + File logFile = File.createTempFile("log-exec-", ".txt", tempDirectory); + + runTool(String.format("-l %s -o %s -f people,turtle -e csv -d %s %s", + logFile.getAbsolutePath(), + outputFile.getAbsolutePath(), + PeopleExtractor.RAW_NS, + copyResourceToTempFile(testingDatafile).getAbsolutePath())); + + // populate expected model + Model expected = new TreeModel(); + Stream.of("Davide Palmisano", "Michele Mostarda", "Giovanni Tummarello") + .map(PeopleExtractor::createPerson).forEach(expected::addAll); + + if (log.isDebugEnabled()) { + log.debug("\n\nlog file content:\n{}", FileUtils.readFileToString(logFile, "utf-8")); + log.debug("\n\nData file: \n{}", FileUtils.readFileToString(outputFile, "utf-8")); + } + + Assert.assertTrue(assertCompareModels(expected, outputFile)); + } + + /** + * Compare expected model and received from input File. + */ + private boolean assertCompareModels(Model expected, File received) throws Exception { + Model receivedModel = new TreeModel(); + receivedModel.addAll(Arrays.asList(RDFUtils.parseRDF( + Rio.getParserFormatForFileName(received.getName()).orElseThrow(AssertionError::new), + new BufferedInputStream(new FileInputStream(received)), + received.toURI().toString() + ))); + + return receivedModel.containsAll(expected); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/cli/src/test/java/org/apache/any23/cli/RoverTest.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/RoverTest.java b/cli/src/test/java/org/apache/any23/cli/RoverTest.java index 15054e4..c2b7a86 100644 --- a/cli/src/test/java/org/apache/any23/cli/RoverTest.java +++ b/cli/src/test/java/org/apache/any23/cli/RoverTest.java @@ -90,6 +90,45 @@ public class RoverTest extends ToolTestBase { Assert.assertEquals(0, graphCounter); } + @Test + public void 
testDelegatingWriterFactory() throws Exception { + final File outFile = File.createTempFile("rover-test", "out", tempDirectory); + final String DEFAULT_GRAPH = "http://test/default/ns"; + final String stylesheet = "http://www.w3.org/1999/xhtml/vocab#stylesheet"; + + Assert.assertEquals("Unexpected exit code.", 0, runTool( + String.format( + "-o %s -f nquads %s -d %s", + outFile.getAbsolutePath(), + copyResourceToTempFile("/cli/basic-with-stylesheet.html").getAbsolutePath(), + DEFAULT_GRAPH + ) + )); + + String content = FileUtils.readFileContent(outFile); + + Assert.assertTrue(content.contains(stylesheet)); + + final int lineCountWithStylesheet = content.split("\\n").length; + + Assert.assertEquals("Unexpected exit code.", 0, runTool( + String.format( + "-o %s -f notrivial,nquads %s -d %s", + outFile.getAbsolutePath(), + copyResourceToTempFile("/cli/basic-with-stylesheet.html").getAbsolutePath(), + DEFAULT_GRAPH + ) + )); + + content = FileUtils.readFileContent(outFile); + + Assert.assertTrue(!content.contains(stylesheet)); + + final int lineCountWithoutStylesheet = content.split("\\n").length; + + Assert.assertEquals(lineCountWithStylesheet - 1, lineCountWithoutStylesheet); + } + /* BEGIN: online tests. */ @Test http://git-wip-us.apache.org/repos/asf/any23/blob/692c583f/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractor.java ---------------------------------------------------------------------- diff --git a/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractor.java b/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractor.java new file mode 100644 index 0000000..d1f31c0 --- /dev/null +++ b/cli/src/test/java/org/apache/any23/cli/flows/PeopleExtractor.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.cli.flows; + +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.vocab.CSV; +import org.apache.any23.writer.CompositeTripleHandler; +import org.apache.any23.writer.TripleHandler; +import org.apache.any23.writer.TripleHandlerException; +import org.apache.commons.codec.digest.DigestUtils; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Literal; +import org.eclipse.rdf4j.model.Model; +import org.eclipse.rdf4j.model.Resource; +import org.eclipse.rdf4j.model.Statement; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.ValueFactory; +import org.eclipse.rdf4j.model.impl.SimpleValueFactory; +import org.eclipse.rdf4j.model.impl.TreeModel; +import org.eclipse.rdf4j.model.util.Models; +import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.model.vocabulary.XMLSchema; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.lang.invoke.MethodHandles; +import java.util.Collections; +import java.util.Set; +import java.util.stream.Collectors; + +/** + * Proof of concept for ANY23-396 example. 
+ */ +public class PeopleExtractor extends CompositeTripleHandler { + + private Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private static final CSV csv = CSV.getInstance(); + private static final ValueFactory vf = SimpleValueFactory.getInstance(); + public static final String RAW_NS = "urn:dataser:raw/"; + private static final IRI RAW_FIRST_NAME = vf.createIRI(RAW_NS, "FirstName"); + private static final IRI RAW_LAST_NAME = vf.createIRI(RAW_NS, "LastName"); + + private static final String NAMESPACE = "http://supercustom.net/ontology/"; + private static final IRI PERSON = vf.createIRI(NAMESPACE, "Person"); + private static final IRI FULL_NAME = vf.createIRI(NAMESPACE, "fullName"); + private static final IRI HASH = vf.createIRI(NAMESPACE, "hash"); + + public static Model createPerson(String fullName) { + IRI s = vf.createIRI("http://rdf.supercustom.net/data/", DigestUtils.sha1Hex(fullName)); + Model model = new TreeModel(); + model.add(s, RDF.TYPE, PERSON); + model.add(s, FULL_NAME, vf.createLiteral(fullName)); + model.add(s, HASH, vf.createLiteral(s.getLocalName(), XMLSchema.HEXBINARY)); + return model; + }; + + private final Model csvModel = new TreeModel(); + + public PeopleExtractor(TripleHandler delegate) { + super(Collections.singletonList(delegate)); + } + + @Override + public void receiveTriple(Resource s, IRI p, Value o, IRI g, ExtractionContext context) throws TripleHandlerException { + if ("csv".equals(context.getExtractorName())) { + csvModel.add(s, p, o, vf.createIRI(context.getUniqueID())); + } else { + super.receiveTriple(s, p, o, g, context); + } + } + + @Override + public void closeContext(ExtractionContext context) throws TripleHandlerException { + Set<Resource> subjects = csvModel.filter(null, RDF.TYPE, csv.rowType) + .stream().map(Statement::getSubject).collect(Collectors.toSet()); + + log.debug("List of rows: {}", subjects); + + for (Resource rowId : subjects) { + String firstName = 
Models.objectLiteral(csvModel.filter(rowId, RAW_FIRST_NAME, null)) + .map(Literal::getLabel).orElse(""); + + String lastName = Models.objectLiteral(csvModel.filter(rowId, RAW_LAST_NAME, null)) + .map(Literal::getLabel).orElse(""); + + String fullName = firstName + " " + lastName; + + for (Statement s : createPerson(fullName)) { + super.receiveTriple(s.getSubject(), s.getPredicate(), s.getObject(), null, context); + } + } + + csvModel.clear(); + + super.closeContext(context); + } + +} \ No newline at end of file
