ANY23-304 Add extractor for OpenIE
Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/2ecfbff1 Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/2ecfbff1 Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/2ecfbff1 Branch: refs/heads/master Commit: 2ecfbff1dddaf57689b725feddba47c7921f726d Parents: bc46c72 Author: Lewis John McGibbney <[email protected]> Authored: Thu Feb 23 17:26:03 2017 -0800 Committer: Lewis John McGibbney <[email protected]> Committed: Thu Feb 23 17:26:03 2017 -0800 ---------------------------------------------------------------------- .../configuration/DefaultConfiguration.java | 23 +- .../DefaultModifiableConfiguration.java | 4 +- .../java/org/apache/any23/vocab/Vocabulary.java | 26 +- .../resources/default-configuration.properties | 4 + .../extractor/SingleDocumentExtraction.java | 6 +- .../extractor/html/EmbeddedJSONLDExtractor.java | 4 +- .../any23/extractor/html/GeoExtractor.java | 7 +- .../any23/extractor/html/TagSoupParser.java | 2 - .../any23/extractor/xpath/XPathExtractor.java | 3 +- .../any23/extractor/yaml/YAMLExtractor.java | 58 +- .../java/org/apache/any23/rdf/RDFUtils.java | 50 +- .../java/org/apache/any23/util/StreamUtils.java | 69 +- .../any23/extractor/yaml/YAMLExtractorTest.java | 1 - openie/pom.xml | 154 +++++ .../apache/any23/openie/OpenIEExtractor.java | 129 ++++ .../any23/openie/OpenIEExtractorFactory.java | 52 ++ .../any23/openie/OpenIEExtractorTest.java | 87 +++ pom.xml | 1 + .../any23/extractor/openie/example-openie.html | 638 +++++++++++++++++++ 19 files changed, 1230 insertions(+), 88 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java 
b/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java index 6edaf34..170548e 100644 --- a/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java +++ b/api/src/main/java/org/apache/any23/configuration/DefaultConfiguration.java @@ -48,6 +48,14 @@ public class DefaultConfiguration implements Configuration { protected final Properties properties; + protected DefaultConfiguration(Properties properties) { + this.properties = properties; + } + + private DefaultConfiguration() { + this( loadDefaultProperties() ); + } + /** * @return the singleton configuration instance. * Such instance is unmodifiable. @@ -74,22 +82,17 @@ public class DefaultConfiguration implements Configuration { return properties; } - protected DefaultConfiguration(Properties properties) { - this.properties = properties; - } - - private DefaultConfiguration() { - this( loadDefaultProperties() ); - } - + @Override public synchronized String[] getProperties() { return properties.keySet().toArray( new String[properties.size()] ); } + @Override public synchronized boolean defineProperty(String propertyName) { return properties.containsKey(propertyName); } + @Override public synchronized String getProperty(String propertyName, String defaultValue) { final String value = getPropertyValue(propertyName); if(value == null) { @@ -98,6 +101,7 @@ public class DefaultConfiguration implements Configuration { return value; } + @Override public synchronized String getPropertyOrFail(String propertyName) { final String propertyValue = getPropertyValue(propertyName); if(propertyValue == null) { @@ -111,6 +115,7 @@ public class DefaultConfiguration implements Configuration { return propertyValue; } + @Override public synchronized int getPropertyIntOrFail(String propertyName) { final String value = getPropertyOrFail(propertyName); final String trimValue = value.trim(); @@ -121,6 +126,7 @@ public class DefaultConfiguration implements Configuration { } } + @Override public 
synchronized boolean getFlagProperty(final String propertyName) { final String value = getPropertyOrFail(propertyName); if(value == null) { @@ -140,6 +146,7 @@ public class DefaultConfiguration implements Configuration { ); } + @Override public synchronized String getConfigurationDump() { final String[] defaultProperties = getProperties(); final StringBuilder sb = new StringBuilder(); http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java b/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java index 82ceaad..055d39c 100644 --- a/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java +++ b/api/src/main/java/org/apache/any23/configuration/DefaultModifiableConfiguration.java @@ -30,8 +30,10 @@ public class DefaultModifiableConfiguration extends DefaultConfiguration impleme super(properties); } + @Override public synchronized String setProperty(String propertyName, String propertyValue) { - if( ! defineProperty(propertyName) ) throw new IllegalArgumentException( + if( ! 
defineProperty(propertyName) ) + throw new IllegalArgumentException( String.format("Property '%s' is not defined in configuration.", propertyName) ); return (String) properties.setProperty(propertyName, propertyValue); http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/java/org/apache/any23/vocab/Vocabulary.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/vocab/Vocabulary.java b/api/src/main/java/org/apache/any23/vocab/Vocabulary.java index 8c8204f..718f514 100644 --- a/api/src/main/java/org/apache/any23/vocab/Vocabulary.java +++ b/api/src/main/java/org/apache/any23/vocab/Vocabulary.java @@ -157,8 +157,8 @@ public abstract class Vocabulary { if(classes == null) { return new IRI[0]; } - final Collection<IRI> IRIs = classes.values(); - return IRIs.toArray( new IRI[ IRIs.size() ] ); + final Collection<IRI> iris = classes.values(); + return iris.toArray( new IRI[ iris.size() ] ); } /** @@ -168,8 +168,8 @@ public abstract class Vocabulary { if(properties == null) { return new IRI[0]; } - final Collection<IRI> IRIs = properties.values(); - return IRIs.toArray( new IRI[ IRIs.size() ] ); + final Collection<IRI> iris = properties.values(); + return iris.toArray( new IRI[ iris.size() ] ); } /** @@ -197,11 +197,11 @@ public abstract class Vocabulary { /** * Creates a IRI. * - * @param IRIStr the IRI string + * @param iriStr the IRI string * @return the IRI instance. 
*/ - protected IRI createIRI(String IRIStr) { - return SimpleValueFactory.getInstance().createIRI(IRIStr); + protected IRI createIRI(String iriStr) { + return SimpleValueFactory.getInstance().createIRI(iriStr); } /** @@ -214,7 +214,7 @@ public abstract class Vocabulary { protected IRI createClass(String namespace, String resource) { IRI res = createIRI(namespace, resource); if(classes == null) { - classes = new HashMap<String, IRI>(10); + classes = new HashMap<>(10); } classes.put(resource, res); return res; @@ -230,7 +230,7 @@ public abstract class Vocabulary { protected IRI createProperty(String namespace, String property) { IRI res = createIRI(namespace, property); if(properties == null) { - properties = new HashMap<String, IRI>(10); + properties = new HashMap<>(10); } properties.put(property, res); return res; @@ -248,14 +248,16 @@ public abstract class Vocabulary { } private void fillResourceToCommentMap() { - if(resourceToCommentMap != null) return; - final Map<IRI,String> newMap = new HashMap<IRI, String>(); + if(resourceToCommentMap != null) + return; + final Map<IRI,String> newMap = new HashMap<>(); for (Field field : this.getClass().getFields()) { try { final Object value = field.get(this); if(value instanceof IRI) { final Comment comment = field.getAnnotation(Comment.class); - if(comment != null) newMap.put((IRI) value, comment.value()); + if(comment != null) + newMap.put((IRI) value, comment.value()); } } catch (IllegalAccessException iae) { throw new RuntimeException("Error while creating resource to comment map.", iae); http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/api/src/main/resources/default-configuration.properties ---------------------------------------------------------------------- diff --git a/api/src/main/resources/default-configuration.properties b/api/src/main/resources/default-configuration.properties index d047a83..4f68586 100644 --- a/api/src/main/resources/default-configuration.properties +++ 
b/api/src/main/resources/default-configuration.properties @@ -72,3 +72,7 @@ any23.extraction.head.meta=on # Allows to specify a CSV file separator and comment delimeter any23.extraction.csv.field=, any23.extraction.csv.comment=# + +# A confidence threshold for the OpenIE extractions +# Any extractions below this value will not be processed. +any23.extraction.openie.confidence.threshold=0.5 http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java index d88edf7..295f4e9 100644 --- a/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java +++ b/core/src/main/java/org/apache/any23/extractor/SingleDocumentExtraction.java @@ -231,10 +231,10 @@ public class SingleDocumentExtraction { log.debug(sb.toString()); } - final List<ResourceRoot> resourceRoots = new ArrayList<ResourceRoot>(); - final List<PropertyPath> propertyPaths = new ArrayList<PropertyPath>(); + final List<ResourceRoot> resourceRoots = new ArrayList<>(); + final List<PropertyPath> propertyPaths = new ArrayList<>(); final Map<String,Collection<IssueReport.Issue>> extractorToIssues = - new HashMap<String,Collection<IssueReport.Issue>>(); + new HashMap<>(); // Invoke all extractors. 
try { http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java index 818fc98..db58586 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/EmbeddedJSONLDExtractor.java @@ -56,7 +56,7 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor { private IRI profile; - private Map<String, IRI> prefixes = new HashMap<String, IRI>(); + private Map<String, IRI> prefixes = new HashMap<>(); private String documentLang; @@ -137,7 +137,7 @@ public class EmbeddedJSONLDExtractor implements Extractor.TagSoupDOMExtractor { ExtractionContext extractionContext, ExtractionResult out) throws IOException, ExtractionException { List<Node> scriptNodes = DomUtils.findAll(in, "/HTML/HEAD/SCRIPT"); - Set<JSONLDScript> result = new HashSet<JSONLDScript>(); + Set<JSONLDScript> result = new HashSet<>(); extractor = new JSONLDExtractorFactory().createExtractor(); for (Node jsonldNode : scriptNodes) { NamedNodeMap attributes = jsonldNode.getAttributes(); http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java b/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java index d85af79..ed7e5d3 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/html/GeoExtractor.java @@ -50,7 +50,8 @@ public class GeoExtractor extends 
EntityBasedMicroformatExtractor { } protected boolean extractEntity(Node node, ExtractionResult out) { - if (null == node) return false; + if (null == node) + return false; //try lat & lon final HTMLDocument document = new HTMLDocument(node); HTMLDocument.TextField latNode = document.getSingularTextField("latitude" ); @@ -59,13 +60,13 @@ public class GeoExtractor extends EntityBasedMicroformatExtractor { String lon = lonNode.value(); if ("".equals(lat) || "".equals(lon)) { String[] both = document.getSingularUrlField("geo").value().split(";"); - if (both.length != 2) return false; + if (both.length != 2) + return false; lat = both[0]; lon = both[1]; } BNode geo = getBlankNodeFor(node); out.writeTriple(geo, RDF.TYPE, vVCARD.Location); - final String extractorName = getDescription().getExtractorName(); conditionallyAddStringProperty( latNode.source(), geo, vVCARD.latitude , lat http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java index e6eb9cd..9ef72f4 100644 --- a/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java +++ b/core/src/main/java/org/apache/any23/extractor/html/TagSoupParser.java @@ -25,8 +25,6 @@ import org.apache.xerces.xni.QName; import org.apache.xerces.xni.XMLAttributes; import org.apache.xerces.xni.XNIException; import org.cyberneko.html.parsers.DOMParser; -import org.eclipse.rdf4j.model.IRI; -import org.eclipse.rdf4j.model.impl.SimpleValueFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java ---------------------------------------------------------------------- diff 
--git a/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java b/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java index b04533c..1fe1b02 100644 --- a/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/xpath/XPathExtractor.java @@ -39,9 +39,10 @@ import java.util.List; */ public class XPathExtractor implements Extractor.TagSoupDOMExtractor { - private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<XPathExtractionRule>(); + private final List<XPathExtractionRule> xPathExtractionRules = new ArrayList<>(); public XPathExtractor() { + //default constructor } public XPathExtractor(List<XPathExtractionRule> rules) { http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java index 64548f1..19bccd1 100644 --- a/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java +++ b/core/src/main/java/org/apache/any23/extractor/yaml/YAMLExtractor.java @@ -17,8 +17,6 @@ package org.apache.any23.extractor.yaml; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; -import java.util.Arrays; import java.util.Iterator; import java.util.List; import java.util.Map; @@ -29,9 +27,7 @@ import org.apache.any23.extractor.ExtractionResult; import org.apache.any23.extractor.Extractor; import org.apache.any23.extractor.ExtractorDescription; import org.apache.any23.rdf.RDFUtils; -import org.apache.any23.util.StringUtils; import org.apache.any23.vocab.YAML; -import org.apache.commons.lang.WordUtils; import org.eclipse.rdf4j.model.Resource; import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Value; @@ -64,10 +60,10 @@ 
public class YAMLExtractor implements Extractor.ContentExtractor { public void run(ExtractionParameters extractionParameters, ExtractionContext context, InputStream in, ExtractionResult out) throws IOException, ExtractionException { - IRI documentURI = context.getDocumentIRI(); - documentRoot = RDFUtils.uri(documentURI.toString() + "root"); + IRI documentIRI = context.getDocumentIRI(); + documentRoot = RDFUtils.iri(documentIRI.toString() + "root"); - log.debug("process: {}", documentURI.toString()); + log.debug("Processing: {}", documentIRI.toString()); out.writeNamespace(vocab.PREFIX, vocab.NS); out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE); out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE); @@ -77,10 +73,10 @@ public class YAMLExtractor implements Extractor.ContentExtractor { // Iterate over page(s) for (Object p : docIterate) { - Resource pageNode = YAMLExtractor.this.makeUri("document", documentURI); + Resource pageNode = RDFUtils.makeIRI("document", documentIRI, true); out.writeTriple(documentRoot, vocab.contains, pageNode); out.writeTriple(pageNode, RDF.TYPE, vocab.document); - out.writeTriple(pageNode, vocab.contains, buildNode(documentURI, p, out)); + out.writeTriple(pageNode, vocab.contains, buildNode(documentIRI, p, out)); } } @@ -99,9 +95,9 @@ public class YAMLExtractor implements Extractor.ContentExtractor { if (treeData == null) { return RDF.NIL; } else if (treeData instanceof Map) { - return processMap(fileURI, (Map) treeData, out); + return processMap(fileURI, (Map<String, Object>) treeData, out); } else if (treeData instanceof List) { - return processList(fileURI, (List) treeData, out); + return processList(fileURI, (List<?>) treeData, out); } else if (treeData instanceof Long) { return RDFUtils.literal(((Long) treeData)); } else if (treeData instanceof Integer) { @@ -120,9 +116,9 @@ public class YAMLExtractor implements Extractor.ContentExtractor { } private Value processMap(IRI file, Map<String, Object> node, ExtractionResult out) { - Resource 
nodeURI = YAMLExtractor.this.makeUri(file); + Resource nodeURI = RDFUtils.makeIRI(file); for (String k : node.keySet()) { - Resource predicate = makeUri(k, file, false); + Resource predicate = RDFUtils.makeIRI(k, file, true); Value value = buildNode(file, node.get(k), out); out.writeTriple(nodeURI, RDF.TYPE, vocab.node); out.writeTriple(nodeURI, (IRI) predicate, value); @@ -132,13 +128,13 @@ public class YAMLExtractor implements Extractor.ContentExtractor { return nodeURI; } - private Value processList(IRI fileURI, Iterable iter, ExtractionResult out) { + private Value processList(IRI fileURI, Iterable<?> iter, ExtractionResult out) { Resource node = YAMLExtractor.this.makeUri(); out.writeTriple(node, RDF.TYPE, RDF.LIST); Resource pList = null; // previous RDF iter node Resource cList = node; // cutternt RDF iter node - Iterator listIter = iter.iterator(); + Iterator<?> listIter = iter.iterator(); while (listIter.hasNext()) { // If previous RDF iter node is given lint with current one if (pList != null) { @@ -161,36 +157,4 @@ public class YAMLExtractor implements Extractor.ContentExtractor { nodeId++; return bnode; } - - private Resource makeUri(IRI docUri) { - return makeUri("node", docUri); - } - - private Resource makeUri(String type, IRI docUri) { - return makeUri(type, docUri, true); - } - - private Resource makeUri(String type, IRI docUri, boolean addId) { - - // preprocess string: converts - -> _ - // converts <space>: word1 word2 -> word1Word2 - String newType = StringUtils.implementJavaNaming(type); - - String uriString; - if (docUri.toString().endsWith("/")) { - uriString = docUri.toString() + newType; - } else { - uriString = docUri.toString() + "#" + newType; - } - - if (addId) { - uriString = uriString + "_" + Integer.toString(nodeId); - } - - Resource node = RDFUtils.uri(uriString); - if (addId) { - nodeId++; - } - return node; - } } http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/rdf/RDFUtils.java 
---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java index bbfe5ec..f6e3a8c 100644 --- a/core/src/main/java/org/apache/any23/rdf/RDFUtils.java +++ b/core/src/main/java/org/apache/any23/rdf/RDFUtils.java @@ -18,7 +18,9 @@ package org.apache.any23.rdf; import org.apache.any23.util.MathUtils; +import org.apache.any23.util.StringUtils; import org.eclipse.rdf4j.model.BNode; +import org.eclipse.rdf4j.model.IRI; import org.eclipse.rdf4j.model.Literal; import org.eclipse.rdf4j.model.Resource; import org.eclipse.rdf4j.model.Statement; @@ -60,6 +62,8 @@ import java.util.Optional; */ public class RDFUtils { + private static int nodeId = 0; + private static final ValueFactory valueFactory = SimpleValueFactory.getInstance(); /** @@ -71,7 +75,8 @@ public class RDFUtils { */ public static String fixAbsoluteIRI(String uri) { String fixed = fixIRIWithException(uri); - if (!fixed.matches("[a-zA-Z0-9]+:/.*")) throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri); + if (!fixed.matches("[a-zA-Z0-9]+:/.*")) + throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri); // Add trailing slash if org.eclipse.rdf4j.model.IRI has only authority but no path. if (fixed.matches("https?://[a-zA-Z0-9.-]+(:[0-9+])?")) { fixed = fixed + "/"; @@ -129,7 +134,8 @@ public class RDFUtils { * @return the unescaped string. 
*/ public static String fixIRIWithException(String unescapedIRI) { - if (unescapedIRI == null) throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null"); + if (unescapedIRI == null) + throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null"); // Remove starting and ending whitespace String escapedIRI = unescapedIRI.trim(); @@ -141,7 +147,8 @@ public class RDFUtils { escapedIRI = escapedIRI.replaceAll("\n", ""); //'Remove starting "\" or '"' - if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\"")) escapedIRI = escapedIRI.substring(1); + if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\"")) + escapedIRI = escapedIRI.substring(1); //Remove ending "\" or '"' if (escapedIRI.endsWith("\\") || escapedIRI.endsWith("\"")) escapedIRI = escapedIRI.substring(0, escapedIRI.length() - 1); @@ -406,7 +413,8 @@ public class RDFUtils { * @return a value instance. */ public static Value toValue(String s) { - if ("a".equals(s)) return RDF.TYPE; + if ("a".equals(s)) + return RDF.TYPE; if (s.matches("[a-z0-9]+:.*")) { return PopularPrefixes.get().expand(s); } @@ -466,7 +474,8 @@ public class RDFUtils { * @throws IllegalArgumentException if no extension matches. */ public static Optional<RDFFormat> getFormatByExtension(String ext) { - if( ! ext.startsWith(".") ) ext = "." + ext; + if( ! ext.startsWith(".") ) + ext = "." 
+ ext; return Rio.getParserFormatForFileName(ext); } @@ -564,6 +573,37 @@ public class RDFUtils { } } + public static Resource makeIRI(IRI docUri) { + return makeIRI("node", docUri); + } + + public static Resource makeIRI(String type, IRI docIRI) { + return makeIRI(type, docIRI, false); + } + + public static Resource makeIRI(String type, IRI docIRI, boolean addId) { + + // preprocess string: converts - -> _ + // converts <space>: word1 word2 -> word1Word2 + String newType = StringUtils.implementJavaNaming(type); + + String iriString; + if (docIRI.toString().endsWith("/")) { + iriString = docIRI.toString() + newType; + } else { + iriString = docIRI.toString() + "#" + newType; + } + + if (addId) { + iriString = iriString + "_" + Integer.toString(nodeId); + } + + Resource node = RDFUtils.iri(iriString); + if (addId) { + nodeId++; + } + return node; + } private RDFUtils() {} } http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/main/java/org/apache/any23/util/StreamUtils.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/util/StreamUtils.java b/core/src/main/java/org/apache/any23/util/StreamUtils.java index 2022f0e..a456655 100644 --- a/core/src/main/java/org/apache/any23/util/StreamUtils.java +++ b/core/src/main/java/org/apache/any23/util/StreamUtils.java @@ -17,10 +17,17 @@ package org.apache.any23.util; +import org.apache.commons.io.ByteOrderMark; +import org.apache.commons.io.input.BOMInputStream; +import org.apache.xerces.impl.io.MalformedByteSequenceException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.xml.sax.SAXException; import java.io.BufferedReader; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.IOException; import java.io.InputStream; @@ -28,6 +35,18 @@ import java.io.InputStreamReader; import java.util.ArrayList; import java.util.List; +import 
javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.transform.Result; +import javax.xml.transform.Source; +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerException; +import javax.xml.transform.TransformerFactory; +import javax.xml.transform.TransformerFactoryConfigurationError; +import javax.xml.transform.dom.DOMSource; +import javax.xml.transform.stream.StreamResult; + /** * Contains general utility functions for handling streams. * @@ -93,9 +112,9 @@ public class StreamUtils { * @return the string content. * @throws IOException if an error occurs while consuming the <code>is</code> stream. */ - public static String asString(InputStream is) throws IOException { - return asString(is, false); - } + public static String asString(InputStream is) throws IOException { + return asString(is, false); + } /** * Closes the closable interface and reports error if any. 
@@ -112,4 +131,48 @@ public class StreamUtils { } } + /** + * Converts a {@link org.w3c.dom.Document} to an + * {@link java.io.InputStream} + * @throws TransformerFactoryConfigurationError + * @throws TransformerConfigurationException + */ + public static InputStream documentToInputStream(Document doc) + throws TransformerConfigurationException, TransformerFactoryConfigurationError { + ByteArrayOutputStream outputStream = new ByteArrayOutputStream(); + Source xmlSource = new DOMSource(doc); + Result outputTarget = new StreamResult(outputStream); + try { + TransformerFactory.newInstance().newTransformer().transform(xmlSource, outputTarget); + } catch (TransformerException e) { + logger.error("Error during transformation: {}", e); + } + return new ByteArrayInputStream(outputStream.toByteArray()); + } + + public static Document inputStreamToDocument(InputStream is) throws MalformedByteSequenceException { + DocumentBuilderFactory factory = null; + DocumentBuilder builder = null; + Document doc = null; + + try { + factory = DocumentBuilderFactory.newInstance(); + builder = factory.newDocumentBuilder(); + } catch (ParserConfigurationException e) { + logger.error("Error converting InputStream to Document: {}", e); + } + + try { + BOMInputStream bomIn = new BOMInputStream(is, ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, + ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, ByteOrderMark.UTF_32LE); + if (bomIn.hasBOM()) { + @SuppressWarnings("unused") + int firstNonBOMByte = bomIn.read(); // Skips BOM + } + doc = builder.parse(bomIn); + } catch (SAXException | IOException e) { + logger.error("Error converting InputStream to Document: {}", e); + } + return doc; + } } http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java ---------------------------------------------------------------------- diff --git a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java 
b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java index 0cf8d14..f2c85ba 100644 --- a/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java +++ b/core/src/test/java/org/apache/any23/extractor/yaml/YAMLExtractorTest.java @@ -27,7 +27,6 @@ import org.eclipse.rdf4j.model.Statement; import org.eclipse.rdf4j.model.vocabulary.RDF; import org.eclipse.rdf4j.model.vocabulary.RDFS; import org.eclipse.rdf4j.repository.RepositoryResult; -import org.semarglproject.vocab.XSD; import org.slf4j.Logger; import org.slf4j.LoggerFactory; http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/pom.xml ---------------------------------------------------------------------- diff --git a/openie/pom.xml b/openie/pom.xml new file mode 100644 index 0000000..799684d --- /dev/null +++ b/openie/pom.xml @@ -0,0 +1,154 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <artifactId>apache-any23</artifactId> + <groupId>org.apache.any23</groupId> + <version>2.1-SNAPSHOT</version> + <relativePath></relativePath> + </parent> + + <repositories> + <repository> + <snapshots> + <enabled>false</enabled> + </snapshots> + <id>bintray-allenai-maven</id> + <name>bintray</name> + <url>http://allenai.bintray.com/maven</url> + </repository> + </repositories> + <pluginRepositories> + <pluginRepository> + <snapshots> + <enabled>false</enabled> + </snapshots> + <id>bintray-allenai-maven</id> + <name>bintray-plugins</name> + <url>http://allenai.bintray.com/maven</url> + </pluginRepository> + </pluginRepositories> + + <artifactId>apache-any23-openie</artifactId> + + <name>Apache Any23 :: OpenIE</name> + <description>Open Information Extraction module.</description> + + <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>apache-any23-test-resources</artifactId> + <version>${project.version}</version> + <scope>test</scope> + <type>test-jar</type> + </dependency> + <dependency> + <groupId>org.allenai.openie</groupId> + <artifactId>openie_2.11</artifactId> + <version>4.2.6</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.allenai.openie</groupId> + <artifactId>openie_2.11</artifactId> + <version>4.2.6</version> + <scope>compile</scope> + <type>pom</type> + </dependency> + <dependency> + <groupId>edu.washington.cs.knowitall</groupId> + <artifactId>openregex</artifactId> + <version>1.1.1</version> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>junit</groupId> + 
<artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <resources> + <resource> + <directory>${basedir}/../</directory> + <targetPath>META-INF</targetPath> + <includes> + <include>LICENSE.txt</include> + <include>NOTICE.txt</include> + </includes> + </resource> + </resources> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <version>${maven-assembly-plugin.version}</version> + <executions> + <execution> + <id>assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + </execution> + </executions> + <configuration> + <attach>true</attach> + <skipAssembly>true</skipAssembly> + <tarLongFileMode>gnu</tarLongFileMode> + </configuration> + </plugin> + </plugins> + </pluginManagement> + </build> + + <profiles> + <profile> + <id>release</id> + <build> + <resources> + <resource> + <directory>${basedir}/../</directory> + <targetPath>${project.build.directory}/apidocs/META-INF</targetPath> + <includes> + <include>LICENSE.txt</include> + <include>NOTICE.txt</include> + </includes> + </resource> + </resources> + </build> + </profile> + + </profiles> + +</project> http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java ---------------------------------------------------------------------- diff --git a/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java new file mode 100644 index 0000000..b8fda29 --- /dev/null +++ b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractor.java @@ -0,0 +1,129 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.openie; + +import java.io.IOException; +import java.util.List; + +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactoryConfigurationError; + +import org.apache.any23.extractor.Extractor; +import org.apache.any23.configuration.Configuration; +import org.apache.any23.configuration.DefaultConfiguration; +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.util.StreamUtils; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Resource; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.model.vocabulary.RDFS; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionResult; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; + +import edu.knowitall.openie.Argument; +import edu.knowitall.openie.Instance; +import edu.knowitall.openie.OpenIE; +import edu.knowitall.tool.parse.ClearParser; +import 
edu.knowitall.tool.postag.ClearPostagger;
+import edu.knowitall.tool.srl.ClearSrl;
+import edu.knowitall.tool.tokenize.ClearTokenizer;
+import scala.collection.JavaConversions;
+import scala.collection.Seq;
+
+/**
+ * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a>
+ * extractor able to generate <i>RDF</i> statements from
+ * sentences representing relations in the text.
+ */
+public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {
+
+    private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);
+
+    /** Configuration key of the confidence value an extraction must exceed to be emitted. */
+    private static final String CONFIDENCE_THRESHOLD_KEY = "any23.extraction.openie.confidence.threshold";
+
+    /** Fallback value handed to the configuration lookup for the key above. */
+    private static final String DEFAULT_CONFIDENCE_THRESHOLD = "0.5";
+
+    // Assigned on every run() call; nothing in this class reads it back.
+    private IRI documentRoot;
+
+    /**
+     * default constructor
+     */
+    OpenIEExtractor() {
+        // default constructor
+    }
+
+    /**
+     * @see org.apache.any23.extractor.Extractor#getDescription()
+     */
+    @Override
+    public ExtractorDescription getDescription() {
+        return OpenIEExtractorFactory.getDescriptionInstance();
+    }
+
+    /**
+     * Converts the document to text with Tika, runs OpenIE over that text and
+     * writes one triple per (arg1, rel, arg2) combination of every extraction
+     * whose confidence is strictly greater than the configured threshold.
+     *
+     * <p>If the document cannot be converted to text the failure is logged and
+     * the method returns without writing any triple.</p>
+     *
+     * @param extractionParameters not used by this extractor
+     * @param context supplies the document IRI used to build subject and predicate IRIs
+     * @param in the DOM of the document to process
+     * @param out receives the namespaces and the generated triples
+     * @throws IOException declared by the extractor contract
+     * @throws ExtractionException declared by the extractor contract
+     */
+    @Override
+    public void run(ExtractionParameters extractionParameters,
+            ExtractionContext context, Document in, ExtractionResult out)
+                    throws IOException, ExtractionException {
+
+        IRI documentIRI = context.getDocumentIRI();
+        documentRoot = RDFUtils.iri(documentIRI.toString() + "root");
+        out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
+        out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
+        LOG.debug("Processing: {}", documentIRI.toString());
+
+        OpenIE openIE = new OpenIE(
+                new ClearParser(
+                        new ClearPostagger(
+                                new ClearTokenizer())), new ClearSrl(), false, false);
+
+        final Seq<Instance> extractions;
+        Tika tika = new Tika();
+        try {
+            extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in)));
+        } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
+            LOG.error("Encountered error during OpenIE extraction.", e);
+            // Nothing was extracted; continuing would dereference a null sequence.
+            return;
+        } catch (TikaException e) {
+            LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
+            // Nothing was extracted; continuing would dereference a null sequence.
+            return;
+        }
+        if (extractions == null) {
+            return;
+        }
+
+        // The threshold does not depend on the instance, so read and parse it once.
+        final Configuration immutableConf = DefaultConfiguration.singleton();
+        final double confidenceThreshold = Double.parseDouble(
+                immutableConf.getProperty(CONFIDENCE_THRESHOLD_KEY, DEFAULT_CONFIDENCE_THRESHOLD));
+
+        List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
+        // for each extraction instance we can obtain a number of extraction elements
+        // instance.confidence() - a confidence value for the extraction itself
+        // instance.extr().context() - an optional representation of the context for this extraction
+        // instance.extr().arg1().text() - subject
+        // instance.extr().rel().text() - predicate
+        // instance.extr().arg2s().text() - object
+        for (Instance instance : listExtractions) {
+            if (instance.confidence() > confidenceThreshold) {
+                List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
+                for (Argument argument : listArg2s) {
+                    Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
+                    IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
+                    Value object = RDFUtils.toValue(argument.text());
+                    out.writeTriple(subject, predicate, object);
+                }
+            }
+        }
+    }
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
----------------------------------------------------------------------
diff --git a/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
new file mode 100644
index 0000000..4a1696a
--- /dev/null
+++ b/openie/src/main/java/org/apache/any23/openie/OpenIEExtractorFactory.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.util.Arrays;
+
+import org.apache.any23.extractor.ExtractorDescription;
+import org.apache.any23.extractor.ExtractorFactory;
+import org.apache.any23.extractor.SimpleExtractorFactory;
+import org.apache.any23.rdf.Prefixes;
+
+/**
+ * Factory that creates {@link OpenIEExtractor} instances and also serves as
+ * the shared {@link ExtractorDescription} returned by
+ * {@link #getDescriptionInstance()}.
+ *
+ * @author lewismc
+ */
+public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor>
+        implements ExtractorFactory<OpenIEExtractor> {
+
+    /** Name passed to the superclass constructor for this extractor. */
+    public static final String NAME = "openie";
+
+    /** No prefixes are declared for this extractor. */
+    public static final Prefixes prefixes = null;
+
+    /** Single factory instance handed out as the extractor description. */
+    private static final ExtractorDescription descriptionInstance = new OpenIEExtractorFactory();
+
+    /**
+     * Builds the factory from {@link #NAME}, {@link #prefixes}, two MIME type
+     * strings (HTML and XHTML, each with {@code q=0.1}) and the file name
+     * {@code example-openie.html}.
+     * NOTE(review): the meaning of each superclass argument is inferred from
+     * its value; confirm against SimpleExtractorFactory.
+     */
+    public OpenIEExtractorFactory() {
+        super(
+                NAME,
+                prefixes,
+                Arrays.asList(
+                        "text/html;q=0.1",
+                        "application/xhtml+xml;q=0.1"),
+                "example-openie.html");
+    }
+
+    /**
+     * @return a newly constructed {@link OpenIEExtractor}
+     */
+    @Override
+    public OpenIEExtractor createExtractor() {
+        return new OpenIEExtractor();
+    }
+
+    /**
+     * @return the shared description instance created when this class is initialized
+     */
+    public static ExtractorDescription getDescriptionInstance() {
+        return descriptionInstance;
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
----------------------------------------------------------------------
diff --git a/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java b/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
new file mode 100644
index 0000000..3561bdd
--- /dev/null
+++ b/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.any23.openie;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.apache.any23.extractor.ExtractionContext;
+import org.apache.any23.extractor.ExtractionException;
+import org.apache.any23.extractor.ExtractionParameters;
+import org.apache.any23.extractor.ExtractionResult;
+import org.apache.any23.extractor.ExtractionResultImpl;
+import org.apache.any23.rdf.RDFUtils;
+import org.apache.any23.util.StreamUtils;
+import org.apache.any23.writer.RDFXMLWriter;
+import org.apache.any23.writer.TripleHandler;
+import org.apache.any23.writer.TripleHandlerException;
+import org.eclipse.rdf4j.model.IRI;
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Runs {@link OpenIEExtractor} against an HTML fixture loaded from the
+ * classpath and logs the RDF/XML it produces. The test makes no assertions
+ * on the output; it only checks that extraction completes without throwing.
+ *
+ * @author lewismc
+ */
+public class OpenIEExtractorTest {
+
+    private static final Logger logger = LoggerFactory.getLogger(OpenIEExtractorTest.class);
+
+    private OpenIEExtractor extractor;
+
+    @Before
+    public void setUp() throws Exception {
+        extractor = new OpenIEExtractor();
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        extractor = null;
+    }
+
+    //@Ignore("This typically results in a JVM crash... disabled for the time being.")
+    @Test
+    public void testExtractFromHTMLDocument()
+            throws IOException, ExtractionException, TripleHandlerException {
+        final IRI uri = RDFUtils.iri("http://podaac.jpl.nasa.gov/aquarius");
+        extract(uri, "/org/apache/any23/extractor/openie/example-openie.html");
+    }
+
+    /**
+     * Runs the extractor over the classpath resource {@code filePath} and
+     * serializes the result as RDF/XML into an in-memory buffer, which is
+     * written to the debug log once the handlers have been closed.
+     *
+     * @param uri document IRI placed in the extraction context
+     * @param filePath classpath location of the HTML fixture
+     * @throws IOException if the fixture is missing or cannot be read
+     * @throws ExtractionException if the extractor fails
+     * @throws TripleHandlerException if closing the triple handler fails
+     */
+    public void extract(IRI uri, String filePath)
+            throws IOException, ExtractionException, TripleHandlerException {
+        ByteArrayOutputStream baos = new ByteArrayOutputStream();
+        final TripleHandler tHandler = new RDFXMLWriter(baos);
+        final ExtractionContext extractionContext = new ExtractionContext("rdf-openie", uri);
+        final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler);
+        // Fully qualified to avoid touching the import list; the stream is closed when the block exits.
+        try (java.io.InputStream fixture = this.getClass().getResourceAsStream(filePath)) {
+            if (fixture == null) {
+                // getResourceAsStream signals a missing resource with null; fail with a clear message.
+                throw new IOException("Test resource not found on classpath: " + filePath);
+            }
+            extractor.run(
+                    ExtractionParameters.newDefault(),
+                    extractionContext,
+                    StreamUtils.inputStreamToDocument(fixture),
+                    result
+            );
+        } finally {
+            try {
+                tHandler.close();
+                result.close();
+            } finally {
+                // Log only after the close attempt so the buffer holds everything the writer emitted.
+                logger.debug(baos.toString());
+            }
+        }
+    }
+
+}
http://git-wip-us.apache.org/repos/asf/any23/blob/2ecfbff1/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 23ab57f..fffc7b5 100644
--- a/pom.xml
+++ b/pom.xml
@@ -204,6 +204,7 @@
     <module>encoding</module>
     <module>core</module>
     <module>cli</module>
+    <module>openie</module>
     <module>plugins/basic-crawler</module>
     <module>plugins/html-scraper</module>
     <module>plugins/office-scraper</module>
