Repository: any23 Updated Branches: refs/heads/master 66ce1241a -> 394d36a0c
ANY23-321 Add openie toggle functionality to service Project: http://git-wip-us.apache.org/repos/asf/any23/repo Commit: http://git-wip-us.apache.org/repos/asf/any23/commit/706e891c Tree: http://git-wip-us.apache.org/repos/asf/any23/tree/706e891c Diff: http://git-wip-us.apache.org/repos/asf/any23/diff/706e891c Branch: refs/heads/master Commit: 706e891cf582736f90cfbe83bc1ef5d629e6dfd7 Parents: 0613280 Author: Lewis John McGibbney <[email protected]> Authored: Wed Jan 3 00:05:39 2018 +0000 Committer: Lewis John McGibbney <[email protected]> Committed: Wed Jan 3 00:05:39 2018 +0000 ---------------------------------------------------------------------- .../apache/any23/extractor/ExtractorGroup.java | 1 + .../apache/any23/plugin/Any23PluginManager.java | 23 +-- core/src/main/java/org/apache/any23/Any23.java | 8 +- .../any23/extractor/ExtractorRegistryImpl.java | 11 +- openie/pom.xml | 152 ----------------- .../any23/extractor/openie/OpenIEExtractor.java | 130 --------------- .../openie/OpenIEExtractorFactory.java | 52 ------ .../org.apache.any23.extractor.ExtractorFactory | 1 - .../any23/openie/OpenIEExtractorTest.java | 88 ---------- .../htmlscraper/HTMLScraperExtractor.java | 12 +- plugins/integration-test/pom.xml | 5 + .../java/org/apache/any23/plugin/PluginIT.java | 11 +- plugins/openie/pom.xml | 165 +++++++++++++++++++ .../extractor/openie/OpenIEExtractor.java | 137 +++++++++++++++ .../openie/OpenIEExtractorFactory.java | 52 ++++++ .../org.apache.any23.extractor.ExtractorFactory | 1 + .../any23/openie/OpenIEExtractorTest.java | 88 ++++++++++ pom.xml | 6 +- service/README.md | 49 ++++++ service/README.txt | 50 ------ service/pom.xml | 84 +++++++++- .../java/org/apache/any23/servlet/Servlet.java | 48 +++++- service/src/main/resources/form.html | 59 ++++++- .../main/webapp/resources/js/bootstrap-modal.js | 22 ++- src/site/apt/any23-plugins.apt | 9 +- 25 files changed, 734 insertions(+), 530 deletions(-) 
---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java b/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java index 9242ea6..4e77690 100644 --- a/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java +++ b/api/src/main/java/org/apache/any23/extractor/ExtractorGroup.java @@ -61,6 +61,7 @@ public class ExtractorGroup implements Iterable<ExtractorFactory<?>> { return new ExtractorGroup(matching); } + @Override public Iterator<ExtractorFactory<?>> iterator() { return factories.iterator(); } http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java ---------------------------------------------------------------------- diff --git a/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java b/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java index 5898210..3cd0829 100644 --- a/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java +++ b/api/src/main/java/org/apache/any23/plugin/Any23PluginManager.java @@ -116,7 +116,7 @@ public class Any23PluginManager { * @return list of exceptions raised during the loading. */ public synchronized Throwable[] loadJARs(File... jars) { - final List<Throwable> result = new ArrayList<Throwable>(); + final List<Throwable> result = new ArrayList<>(); for (File jar : jars) { try { loadJAR(jar); @@ -158,7 +158,7 @@ public class Any23PluginManager { * @return list of exceptions raised during the loading. */ public synchronized Throwable[] loadClassDirs(File... 
classDirs) { - final List<Throwable> result = new ArrayList<Throwable>(); + final List<Throwable> result = new ArrayList<>(); for (File classDir : classDirs) { try { loadClassDir(classDir); @@ -178,14 +178,15 @@ public class Any23PluginManager { * Loads all the JARs detected in a given directory. * * @param jarDir directory containing the JARs to be loaded. + * Example '/usr/local/apache-tomcat-7.0.72/webapps/apache-any23-service-2.2-SNAPSHOT/WEB-INF/lib/apache-any23-openie' * @return <code>true</code> if all JARs in dir are loaded. */ public synchronized boolean loadJARDir(File jarDir) { if(jarDir == null) throw new NullPointerException("JAR dir must be not null."); - if( ! jarDir.exists() ) + if(!jarDir.exists() ) throw new IllegalArgumentException("Given directory doesn't exist:" + jarDir.getAbsolutePath()); - if(! jarDir.isDirectory() ) + if(!jarDir.isDirectory() ) throw new IllegalArgumentException( "given file exists and it is not a directory: " + jarDir.getAbsolutePath() ); @@ -210,7 +211,7 @@ public class Any23PluginManager { * @return list of errors occurred during loading. */ public synchronized Throwable[] loadFiles(File... files) { - final List<Throwable> errors = new ArrayList<Throwable>(); + final List<Throwable> errors = new ArrayList<>(); for(File file : files) { try { if (file.isFile() && file.getName().endsWith(".jar")) { @@ -263,6 +264,7 @@ public class Any23PluginManager { * @return not <code>null</code> list of plugin classes. * @throws IOException if there is an error obtaining Extractors. 
*/ + @SuppressWarnings("rawtypes") public synchronized Iterator<ExtractorFactory> getExtractors() throws IOException { return getPlugins(ExtractorFactory.class); } @@ -312,7 +314,8 @@ public class Any23PluginManager { final StringBuilder report = new StringBuilder(); try { - final List<ExtractorFactory<?>> newFactoryList = new ArrayList<ExtractorFactory<?>>(); + final List<ExtractorFactory<?>> newFactoryList = new ArrayList<>(); + @SuppressWarnings("rawtypes") Iterator<ExtractorFactory> extractors = getExtractors(); while (extractors.hasNext()) { ExtractorFactory<?> factory = extractors.next(); @@ -386,7 +389,7 @@ public class Any23PluginManager { */ private File[] getPluginLocations(String pluginDirsList) { final String[] locationsStr = pluginDirsList.split(PLUGIN_DIRS_LIST_SEPARATOR); - final List<File> locations = new ArrayList<File>(); + final List<File> locations = new ArrayList<>(); for(String locationStr : locationsStr) { final File location = new File(locationStr); if( ! location.exists()) { @@ -404,7 +407,7 @@ public class Any23PluginManager { */ private static final class DynamicClassLoader extends URLClassLoader { - private final Set<String> addedURLs = new HashSet<String>(); + private final Set<String> addedURLs = new HashSet<>(); private final List<File> jars; @@ -412,8 +415,8 @@ public class Any23PluginManager { public DynamicClassLoader(URL[] urls) { super(urls, Any23PluginManager.class.getClassLoader()); - jars = new ArrayList<File>(); - dirs = new ArrayList<File>(); + jars = new ArrayList<>(); + dirs = new ArrayList<>(); } public DynamicClassLoader() { http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/core/src/main/java/org/apache/any23/Any23.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/Any23.java b/core/src/main/java/org/apache/any23/Any23.java index 9be8a28..cba13d8 100644 --- a/core/src/main/java/org/apache/any23/Any23.java +++ 
b/core/src/main/java/org/apache/any23/Any23.java @@ -98,7 +98,8 @@ public class Any23 { * @param extractorGroup the group of extractors to be applied. */ public Any23(Configuration configuration, ExtractorGroup extractorGroup) { - if(configuration == null) throw new NullPointerException("configuration must be not null."); + if(configuration == null) + throw new NullPointerException("configuration must be not null."); this.configuration = configuration; logger.debug( configuration.getConfigurationDump() ); @@ -259,7 +260,8 @@ public class Any23 { * @throws IOException if an error occurs while initializing the internal {@link org.apache.any23.http.HTTPClient}. */ public DocumentSource createDocumentSource(String documentIRI) throws URISyntaxException, IOException { - if(documentIRI == null) throw new NullPointerException("documentIRI cannot be null."); + if(documentIRI == null) + throw new NullPointerException("documentIRI cannot be null."); if (documentIRI.toLowerCase().startsWith("file:")) { return new FileDocumentSource( new File(new URI(documentIRI)) ); } @@ -453,7 +455,7 @@ public class Any23 { } private String getAcceptHeader() { - Collection<MIMEType> mimeTypes = new ArrayList<MIMEType>(); + Collection<MIMEType> mimeTypes = new ArrayList<>(); for (ExtractorFactory<?> factory : factories) { mimeTypes.addAll(factory.getSupportedMIMETypes()); } http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java index 86dc982..ca3bb98 100644 --- a/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java +++ b/core/src/main/java/org/apache/any23/extractor/ExtractorRegistryImpl.java @@ -30,9 +30,15 @@ import java.util.List; * Singleton class acting as a 
register for all the various * {@link Extractor}. */ +@SuppressWarnings("rawtypes") public class ExtractorRegistryImpl extends org.eclipse.rdf4j.common.lang.service.ServiceRegistry<String, ExtractorFactory> implements ExtractorRegistry { /** + * The instance. + */ + private static ExtractorRegistry instance = null; + + /** * Public constructor for ExtractorRegistryImpl. Should normally call getInstance. */ public ExtractorRegistryImpl() { @@ -40,11 +46,6 @@ public class ExtractorRegistryImpl extends org.eclipse.rdf4j.common.lang.service } /** - * The instance. - */ - private static ExtractorRegistry instance = null; - - /** * @return returns the {@link ExtractorRegistry} instance. */ public static ExtractorRegistry getInstance() { http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/pom.xml ---------------------------------------------------------------------- diff --git a/openie/pom.xml b/openie/pom.xml deleted file mode 100644 index 7440812..0000000 --- a/openie/pom.xml +++ /dev/null @@ -1,152 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
---> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - - <parent> - <artifactId>apache-any23</artifactId> - <groupId>org.apache.any23</groupId> - <version>2.2-SNAPSHOT</version> - <relativePath>../</relativePath> - </parent> - - <repositories> - <repository> - <snapshots> - <enabled>false</enabled> - </snapshots> - <id>bintray-allenai-maven</id> - <name>bintray</name> - <url>http://allenai.bintray.com/maven</url> - </repository> - </repositories> - <pluginRepositories> - <pluginRepository> - <snapshots> - <enabled>false</enabled> - </snapshots> - <id>bintray-allenai-maven</id> - <name>bintray-plugins</name> - <url>http://allenai.bintray.com/maven</url> - </pluginRepository> - </pluginRepositories> - - <artifactId>apache-any23-openie</artifactId> - - <name>Apache Any23 :: OpenIE</name> - <description>Open Information Extraction module.</description> - - <dependencies> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>apache-any23-core</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>${project.groupId}</groupId> - <artifactId>apache-any23-test-resources</artifactId> - <version>${project.version}</version> - <scope>test</scope> - <type>test-jar</type> - </dependency> - <dependency> - <groupId>org.allenai.openie</groupId> - <artifactId>openie_2.11</artifactId> - <version>4.2.6</version> - <scope>compile</scope> - </dependency> - <dependency> - <groupId>org.allenai.openie</groupId> - <artifactId>openie_2.11</artifactId> - <version>4.2.6</version> - <scope>compile</scope> - <type>pom</type> - </dependency> - <dependency> - <groupId>edu.washington.cs.knowitall</groupId> - <artifactId>openregex</artifactId> - <version>1.1.1</version> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>junit</groupId> - 
<artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-log4j12</artifactId> - <scope>test</scope> - </dependency> - </dependencies> - - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-surefire-plugin</artifactId> - <configuration> - <skipTests>true</skipTests> - </configuration> - </plugin> - </plugins> - <pluginManagement> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-assembly-plugin</artifactId> - <version>${maven-assembly-plugin.version}</version> - <executions> - <execution> - <id>assembly</id> - <phase>package</phase> - <goals> - <goal>single</goal> - </goals> - </execution> - </executions> - <configuration> - <attach>true</attach> - <skipAssembly>true</skipAssembly> - <tarLongFileMode>gnu</tarLongFileMode> - </configuration> - </plugin> - </plugins> - </pluginManagement> - </build> - - <profiles> - <profile> - <id>release</id> - <build> - <resources> - <resource> - <directory>${basedir}/../</directory> - <targetPath>${project.build.directory}/apidocs/META-INF</targetPath> - <includes> - <include>LICENSE.txt</include> - <include>NOTICE.txt</include> - </includes> - </resource> - </resources> - </build> - </profile> - - </profiles> - -</project> http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java ---------------------------------------------------------------------- diff --git a/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java b/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java deleted file mode 100644 index 812ed9c..0000000 --- a/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractor.java +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.any23.extractor.openie; - -import java.io.IOException; -import java.util.List; - -import javax.xml.transform.TransformerConfigurationException; -import javax.xml.transform.TransformerFactoryConfigurationError; - -import org.apache.any23.extractor.Extractor; -import org.apache.any23.configuration.Configuration; -import org.apache.any23.configuration.DefaultConfiguration; -import org.apache.any23.extractor.ExtractionContext; -import org.apache.any23.extractor.ExtractorDescription; -import org.apache.any23.rdf.RDFUtils; -import org.apache.any23.util.StreamUtils; -import org.apache.tika.Tika; -import org.apache.tika.exception.TikaException; -import org.eclipse.rdf4j.model.IRI; -import org.eclipse.rdf4j.model.Resource; -import org.eclipse.rdf4j.model.Value; -import org.eclipse.rdf4j.model.vocabulary.RDF; -import org.eclipse.rdf4j.model.vocabulary.RDFS; -import org.apache.any23.extractor.ExtractionException; -import org.apache.any23.extractor.ExtractionParameters; -import org.apache.any23.extractor.ExtractionResult; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.w3c.dom.Document; - -import edu.knowitall.openie.Argument; -import edu.knowitall.openie.Instance; -import edu.knowitall.openie.OpenIE; -import 
edu.knowitall.tool.parse.ClearParser; -import edu.knowitall.tool.postag.ClearPostagger; -import edu.knowitall.tool.srl.ClearSrl; -import edu.knowitall.tool.tokenize.ClearTokenizer; -import scala.collection.JavaConversions; -import scala.collection.Seq; - -/** - * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a> - * extractor able to generate <i>RDF</i> statements from - * sentences representing relations in the text. - */ -public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor { - - private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class); - - private IRI documentRoot; - - /** - * default constructor - */ - public OpenIEExtractor() { - // default constructor - } - - /** - * @see org.apache.any23.extractor.Extractor#getDescription() - */ - @Override - public ExtractorDescription getDescription() { - return OpenIEExtractorFactory.getDescriptionInstance(); - } - - @Override - public void run(ExtractionParameters extractionParameters, - ExtractionContext context, Document in, ExtractionResult out) - throws IOException, ExtractionException { - - IRI documentIRI = context.getDocumentIRI(); - documentRoot = RDFUtils.iri(documentIRI.toString() + "root"); - out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE); - out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE); - LOG.debug("Processing: {}", documentIRI.toString()); - - OpenIE openIE = new OpenIE( - new ClearParser( - new ClearPostagger( - new ClearTokenizer())), new ClearSrl(), false, false); - - Seq<Instance> extractions = null; - Tika tika = new Tika(); - try { - extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in))); - } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) { - LOG.error("Encountered error during OpenIE extraction.", e); - } catch (TikaException e) { - LOG.error("Encountered error whilst parsing InputStream with Tika.", e); - } - - List<Instance> listExtractions = 
JavaConversions.seqAsJavaList(extractions); - // for each extraction instance we can obtain a number of extraction elements - // instance.confidence() - a confidence value for the extraction itself - // instance.extr().context() - an optional representation of the context for this extraction - // instance.extr().arg1().text() - subject - // instance.extr().rel().text() - predicate - // instance.extr().arg2s().text() - object - final Configuration immutableConf = DefaultConfiguration.singleton(); - Double threshold = Double.parseDouble(immutableConf.getProperty("any23.extraction.openie.confidence.threshold", "0.5")); - for(Instance instance : listExtractions) { - if (instance.confidence() > threshold) { - List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s()); - for(Argument argument : listArg2s) { - Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI); - IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI); - Value object = RDFUtils.toValue(argument.text()); - out.writeTriple(subject, predicate, object); - } - } - } - } -} http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java ---------------------------------------------------------------------- diff --git a/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java b/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java deleted file mode 100644 index 31760d2..0000000 --- a/openie/src/main/java/org/apache/any23/extractor/openie/OpenIEExtractorFactory.java +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.any23.extractor.openie; - -import java.util.Arrays; - -import org.apache.any23.extractor.ExtractorDescription; -import org.apache.any23.extractor.ExtractorFactory; -import org.apache.any23.extractor.SimpleExtractorFactory; -import org.apache.any23.rdf.Prefixes; - -/** - * @author lewismc - * - */ -public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor> - implements ExtractorFactory<OpenIEExtractor> { - - public static final String NAME = "openie"; - - public static final Prefixes prefixes = null; - - private static final ExtractorDescription descriptionInstance = new OpenIEExtractorFactory(); - - public OpenIEExtractorFactory() { - super(NAME, prefixes, Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), "example-openie.html"); - } - - @Override - public OpenIEExtractor createExtractor() { - return new OpenIEExtractor(); - } - - public static ExtractorDescription getDescriptionInstance() { - return descriptionInstance; - } - -} http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory ---------------------------------------------------------------------- diff --git a/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory 
b/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory deleted file mode 100644 index 4faf7ce..0000000 --- a/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory +++ /dev/null @@ -1 +0,0 @@ -org.apache.any23.extractor.openie.OpenIEExtractorFactory \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java ---------------------------------------------------------------------- diff --git a/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java b/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java deleted file mode 100644 index 9455311..0000000 --- a/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.any23.openie; - -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; - -import org.apache.any23.extractor.ExtractionContext; -import org.apache.any23.extractor.ExtractionException; -import org.apache.any23.extractor.ExtractionParameters; -import org.apache.any23.extractor.ExtractionResult; -import org.apache.any23.extractor.ExtractionResultImpl; -import org.apache.any23.extractor.openie.OpenIEExtractor; -import org.apache.any23.rdf.RDFUtils; -import org.apache.any23.util.StreamUtils; -import org.apache.any23.writer.RDFXMLWriter; -import org.apache.any23.writer.TripleHandler; -import org.apache.any23.writer.TripleHandlerException; -import org.eclipse.rdf4j.model.IRI; -import org.junit.After; -import org.junit.Before; -import org.junit.Test; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -/** - * @author lewismc - * - */ -public class OpenIEExtractorTest { - - private static final Logger logger = LoggerFactory.getLogger(OpenIEExtractorTest.class); - - private OpenIEExtractor extractor; - - @Before - public void setUp() throws Exception { - extractor = new OpenIEExtractor(); - } - - @After - public void tearDown() throws Exception { - extractor = null; - } - - @Test - public void testExtractFromHTMLDocument() - throws IOException, ExtractionException, TripleHandlerException { - final IRI uri = RDFUtils.iri("http://podaac.jpl.nasa.gov/aquarius"); - extract(uri, "/org/apache/any23/extractor/openie/example-openie.html"); - } - - public void extract(IRI uri, String filePath) - throws IOException, ExtractionException, TripleHandlerException { - FileOutputStream fos = new FileOutputStream(File.createTempFile("OpenIEExtractorTest", "tmp")); - final TripleHandler tHandler = new RDFXMLWriter(fos); - final ExtractionContext extractionContext = new ExtractionContext("rdf-openie", uri); - final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler); - try { - extractor.run( 
- ExtractionParameters.newDefault(), - extractionContext, - StreamUtils.inputStreamToDocument(this.getClass().getResourceAsStream(filePath)), - result - ); - } finally { - logger.debug(fos.toString()); - tHandler.close(); - result.close(); - } - } - -} http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java ---------------------------------------------------------------------- diff --git a/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java b/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java index ab7d34a..94a3210 100644 --- a/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java +++ b/plugins/html-scraper/src/main/java/org/apache/any23/plugin/htmlscraper/HTMLScraperExtractor.java @@ -46,16 +46,16 @@ import java.util.List; */ public class HTMLScraperExtractor implements Extractor.ContentExtractor { - public final static IRI PAGE_CONTENT_DE_PROPERTY = + public static final IRI PAGE_CONTENT_DE_PROPERTY = SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/de"); - public final static IRI PAGE_CONTENT_AE_PROPERTY = + public static final IRI PAGE_CONTENT_AE_PROPERTY = SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ae"); - public final static IRI PAGE_CONTENT_LCE_PROPERTY = + public static final IRI PAGE_CONTENT_LCE_PROPERTY = SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/lce"); - public final static IRI PAGE_CONTENT_CE_PROPERTY = + public static final IRI PAGE_CONTENT_CE_PROPERTY = SimpleValueFactory.getInstance().createIRI(SINDICE.NS + "pagecontent/ce"); - private final List<ExtractionRule> extractionRules = new ArrayList<ExtractionRule>(); + private final List<ExtractionRule> extractionRules = new ArrayList<>(); public HTMLScraperExtractor() { loadDefaultRules(); @@ -66,7 
+66,7 @@ public class HTMLScraperExtractor implements Extractor.ContentExtractor { } public String[] getTextExtractors() { - final List<String> extractors = new ArrayList<String>(); + final List<String> extractors = new ArrayList<>(); for(ExtractionRule er : extractionRules) { extractors.add(er.name); } http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/integration-test/pom.xml ---------------------------------------------------------------------- diff --git a/plugins/integration-test/pom.xml b/plugins/integration-test/pom.xml index ab062cd..c99a7e0 100644 --- a/plugins/integration-test/pom.xml +++ b/plugins/integration-test/pom.xml @@ -57,6 +57,11 @@ <artifactId>apache-any23-basic-crawler</artifactId> <version>1.0.6-SNAPSHOT</version> </dependency> + <dependency> + <groupId>org.apache.any23.plugins</groupId> + <artifactId>apache-any23-openie</artifactId> + <version>${project.parent.version}</version> + </dependency> <!-- BEGIN: Test Dependencies --> <dependency> http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java ---------------------------------------------------------------------- diff --git a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java index 1b69463..e8e4505 100644 --- a/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java +++ b/plugins/integration-test/src/test/java/org/apache/any23/plugin/PluginIT.java @@ -56,6 +56,9 @@ public class PluginIT { private static final File CRAWLER_TARGET_DIR = new File(PLUGIN_DIR + "basic-crawler/target/classes"); private static final File CRAWLER_DEPENDENCY_DIR = new File(PLUGIN_DIR + "basic-crawler/target/dependency"); + private static final File OPENIE_TARGET_DIR = new File(PLUGIN_DIR + "openie/target/classes"); + private static final File OPENIE_DEPENDENCY_DIR = new File(PLUGIN_DIR + 
"openie/target/dependency"); + private Any23PluginManager manager; @Before @@ -79,13 +82,15 @@ public class PluginIT { public void testDetectExtractorPlugins() throws IOException, InstantiationException, IllegalAccessException { final ExtractorGroup extractorGroup = manager.getApplicableExtractors( new ExtractorRegistryImpl(), - HTML_SCRAPER_TARGET_DIR, // Required to satisfy class dependencies. + HTML_SCRAPER_TARGET_DIR, HTML_SCRAPER_DEPENDENCY_DIR, OFFICE_SCRAPER_TARGET_DIR, - OFFICE_SCRAPER_DEPENDENCY_DIR // Required to satisfy class dependencies. + OFFICE_SCRAPER_DEPENDENCY_DIR, + OPENIE_TARGET_DIR, + OPENIE_DEPENDENCY_DIR ); try { - Class.forName("org.apache.any23.extractor.openie.OpenIEExtractor", false, this.getClass().getClassLoader()); + Class.forName("org.apache.any23.plugin.extractor.openie.OpenIEExtractor", false, this.getClass().getClassLoader()); assertEquals("Did not find the number of expected extractors", NUM_OF_EXTRACTORS_INCL_OPENIE , extractorGroup.getNumOfExtractors() ); http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/pom.xml ---------------------------------------------------------------------- diff --git a/plugins/openie/pom.xml b/plugins/openie/pom.xml new file mode 100644 index 0000000..64c6806 --- /dev/null +++ b/plugins/openie/pom.xml @@ -0,0 +1,165 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.any23</groupId> + <artifactId>apache-any23</artifactId> + <version>2.2-SNAPSHOT</version> + <relativePath>../../pom.xml</relativePath> + </parent> + + <groupId>org.apache.any23.plugins</groupId> + <artifactId>apache-any23-openie</artifactId> + + <name>Apache Any23 :: Plugins :: OpenIE</name> + <description>Open Information Extraction module.</description> + + <repositories> + <repository> + <snapshots> + <enabled>false</enabled> + </snapshots> + <id>bintray-allenai-maven</id> + <name>bintray</name> + <url>http://allenai.bintray.com/maven</url> + </repository> + </repositories> + <pluginRepositories> + <pluginRepository> + <snapshots> + <enabled>false</enabled> + </snapshots> + <id>bintray-allenai-maven</id> + <name>bintray-plugins</name> + <url>http://allenai.bintray.com/maven</url> + </pluginRepository> + </pluginRepositories> + + <dependencies> + <dependency> + <groupId>org.apache.any23</groupId> + <artifactId>apache-any23-core</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.any23</groupId> + <artifactId>apache-any23-test-resources</artifactId> + <version>${project.version}</version> + <scope>test</scope> + <type>test-jar</type> + </dependency> + <dependency> + <groupId>org.allenai.openie</groupId> + 
<artifactId>openie_2.11</artifactId> + <version>${openie_2.11.version}</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>org.allenai.openie</groupId> + <artifactId>openie_2.11</artifactId> + <version>${openie_2.11.version}</version> + <scope>compile</scope> + <type>pom</type> + </dependency> + <dependency> + <groupId>edu.washington.cs.knowitall</groupId> + <artifactId>openregex</artifactId> + <version>${openregex.version}</version> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <configuration> + <skipTests>true</skipTests> + </configuration> + </plugin> + <!-- Generates the distribution package --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <configuration> + <appendAssemblyId>false</appendAssemblyId> + <descriptors> + <descriptor>${basedir}/src/main/assembly/bin.xml</descriptor> + </descriptors> + </configuration> + </plugin> + </plugins> + <pluginManagement> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-assembly-plugin</artifactId> + <version>${maven-assembly-plugin.version}</version> + <executions> + <execution> + <id>assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + </execution> + </executions> + <configuration> + <attach>true</attach> + <skipAssembly>true</skipAssembly> + <tarLongFileMode>gnu</tarLongFileMode> + </configuration> + </plugin> + </plugins> + </pluginManagement> + </build> + + <profiles> + <profile> + <id>release</id> + <build> + <resources> + <resource> + <directory>${basedir}/../</directory> + 
<targetPath>${project.build.directory}/apidocs/META-INF</targetPath> + <includes> + <include>LICENSE.txt</include> + <include>NOTICE.txt</include> + </includes> + </resource> + </resources> + </build> + </profile> + + </profiles> + +</project> http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java ---------------------------------------------------------------------- diff --git a/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java new file mode 100644 index 0000000..1b6a9cf --- /dev/null +++ b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractor.java @@ -0,0 +1,137 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.plugin.extractor.openie; + +import java.io.IOException; +import java.util.List; + +import javax.xml.transform.TransformerConfigurationException; +import javax.xml.transform.TransformerFactoryConfigurationError; + +import org.apache.any23.extractor.Extractor; +import org.apache.any23.configuration.Configuration; +import org.apache.any23.configuration.DefaultConfiguration; +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.plugin.Author; +import org.apache.any23.plugin.ExtractorPlugin; +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.util.StreamUtils; +import org.apache.tika.Tika; +import org.apache.tika.exception.TikaException; +import org.eclipse.rdf4j.model.IRI; +import org.eclipse.rdf4j.model.Resource; +import org.eclipse.rdf4j.model.Value; +import org.eclipse.rdf4j.model.vocabulary.RDF; +import org.eclipse.rdf4j.model.vocabulary.RDFS; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionResult; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; + +import edu.knowitall.openie.Argument; +import edu.knowitall.openie.Instance; +import edu.knowitall.openie.OpenIE; +import edu.knowitall.tool.parse.ClearParser; +import edu.knowitall.tool.postag.ClearPostagger; +import edu.knowitall.tool.srl.ClearSrl; +import edu.knowitall.tool.tokenize.ClearTokenizer; +import scala.collection.JavaConversions; +import scala.collection.Seq; + +/** + * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a> + * extractor able to generate <i>RDF</i> statements from + * sentences representing relations in the text. 
+ */ +@Author(name="Lewis John McGibbney ([email protected])") +public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor, ExtractorPlugin { + + private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class); + + /** + * default constructor + */ + public OpenIEExtractor() { + // default constructor + } + + /** + * @see org.apache.any23.extractor.Extractor#getDescription() + */ + @Override + public ExtractorDescription getDescription() { + return OpenIEExtractorFactory.getDescriptionInstance(); + } + + @Override + public void run(ExtractionParameters extractionParameters, + ExtractionContext context, Document in, ExtractionResult out) + throws IOException, ExtractionException { + + IRI documentIRI = context.getDocumentIRI(); + RDFUtils.iri(documentIRI.toString() + "root"); + out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE); + out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE); + LOG.debug("Processing: {}", documentIRI.toString()); + + OpenIE openIE = new OpenIE( + new ClearParser( + new ClearPostagger( + new ClearTokenizer())), new ClearSrl(), false, false); + + Seq<Instance> extractions = null; + Tika tika = new Tika(); + try { + extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in))); + } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) { + LOG.error("Encountered error during OpenIE extraction.", e); + } catch (TikaException e) { + LOG.error("Encountered error whilst parsing InputStream with Tika.", e); + } + + List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions); + // for each extraction instance we can obtain a number of extraction elements + // instance.confidence() - a confidence value for the extraction itself + // instance.extr().context() - an optional representation of the context for this extraction + // instance.extr().arg1().text() - subject + // instance.extr().rel().text() - predicate + // instance.extr().arg2s().text() - object + final 
Configuration immutableConf = DefaultConfiguration.singleton(); + Double threshold = Double.parseDouble(immutableConf.getProperty("any23.extraction.openie.confidence.threshold", "0.5")); + for(Instance instance : listExtractions) { + if (instance.confidence() > threshold) { + List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s()); + for(Argument argument : listArg2s) { + Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI); + IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI); + Value object = RDFUtils.toValue(argument.text()); + out.writeTriple(subject, predicate, object); + } + } + } + } + + @Override + public ExtractorFactory<?> getExtractorFactory() { + return (ExtractorFactory<?>) OpenIEExtractorFactory.getDescriptionInstance(); + } +} http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractorFactory.java ---------------------------------------------------------------------- diff --git a/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractorFactory.java b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractorFactory.java new file mode 100644 index 0000000..1c86c62 --- /dev/null +++ b/plugins/openie/src/main/java/org/apache/any23/plugin/extractor/openie/OpenIEExtractorFactory.java @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.any23.plugin.extractor.openie; + +import java.util.Arrays; + +import org.apache.any23.extractor.ExtractorDescription; +import org.apache.any23.extractor.ExtractorFactory; +import org.apache.any23.extractor.SimpleExtractorFactory; +import org.apache.any23.rdf.Prefixes; + +/** + * @author lewismc + * + */ +public class OpenIEExtractorFactory extends SimpleExtractorFactory<OpenIEExtractor> + implements ExtractorFactory<OpenIEExtractor> { + + public static final String NAME = "openie"; + + public static final Prefixes prefixes = null; + + private static final ExtractorDescription descriptionInstance = new OpenIEExtractorFactory(); + + public OpenIEExtractorFactory() { + super(NAME, prefixes, Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"), "example-openie.html"); + } + + @Override + public OpenIEExtractor createExtractor() { + return new OpenIEExtractor(); + } + + public static ExtractorDescription getDescriptionInstance() { + return descriptionInstance; + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory ---------------------------------------------------------------------- diff --git a/plugins/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory b/plugins/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory new file mode 100644 index 0000000..10ebf16 --- /dev/null +++ 
b/plugins/openie/src/main/resources/META-INF/services/org.apache.any23.extractor.ExtractorFactory @@ -0,0 +1 @@ +org.apache.any23.plugin.extractor.openie.OpenIEExtractorFactory \ No newline at end of file http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/plugins/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java ---------------------------------------------------------------------- diff --git a/plugins/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java b/plugins/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java new file mode 100644 index 0000000..dcc4e8f --- /dev/null +++ b/plugins/openie/src/test/java/org/apache/any23/openie/OpenIEExtractorTest.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.any23.openie; + +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; + +import org.apache.any23.extractor.ExtractionContext; +import org.apache.any23.extractor.ExtractionException; +import org.apache.any23.extractor.ExtractionParameters; +import org.apache.any23.extractor.ExtractionResult; +import org.apache.any23.extractor.ExtractionResultImpl; +import org.apache.any23.plugin.extractor.openie.OpenIEExtractor; +import org.apache.any23.rdf.RDFUtils; +import org.apache.any23.util.StreamUtils; +import org.apache.any23.writer.RDFXMLWriter; +import org.apache.any23.writer.TripleHandler; +import org.apache.any23.writer.TripleHandlerException; +import org.eclipse.rdf4j.model.IRI; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * @author lewismc + * + */ +public class OpenIEExtractorTest { + + private static final Logger logger = LoggerFactory.getLogger(OpenIEExtractorTest.class); + + private OpenIEExtractor extractor; + + @Before + public void setUp() throws Exception { + extractor = new OpenIEExtractor(); + } + + @After + public void tearDown() throws Exception { + extractor = null; + } + + @Test + public void testExtractFromHTMLDocument() + throws IOException, ExtractionException, TripleHandlerException { + final IRI uri = RDFUtils.iri("http://podaac.jpl.nasa.gov/aquarius"); + extract(uri, "/org/apache/any23/extractor/openie/example-openie.html"); + } + + public void extract(IRI uri, String filePath) + throws IOException, ExtractionException, TripleHandlerException { + FileOutputStream fos = new FileOutputStream(File.createTempFile("OpenIEExtractorTest", "tmp")); + final TripleHandler tHandler = new RDFXMLWriter(fos); + final ExtractionContext extractionContext = new ExtractionContext("rdf-openie", uri); + final ExtractionResult result = new ExtractionResultImpl(extractionContext, extractor, tHandler); + try { + 
extractor.run( + ExtractionParameters.newDefault(), + extractionContext, + StreamUtils.inputStreamToDocument(this.getClass().getResourceAsStream(filePath)), + result + ); + } finally { + logger.debug(fos.toString()); + tHandler.close(); + result.close(); + } + } + +} http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 9f69936..df1059e 100644 --- a/pom.xml +++ b/pom.xml @@ -204,10 +204,10 @@ <module>encoding</module> <module>core</module> <module>cli</module> - <module>openie</module> <module>plugins/basic-crawler</module> <module>plugins/html-scraper</module> <module>plugins/office-scraper</module> + <module>plugins/openie</module> <module>plugins/integration-test</module> <module>service</module> </modules> @@ -248,6 +248,8 @@ <semargl.version>0.7</semargl.version> <slf4j.logger.version>1.7.25</slf4j.logger.version> <tika.version>1.17</tika.version> + <openie_2.11.version>4.2.6</openie_2.11.version> + <openregex.version>1.1.1</openregex.version> <!-- Overridden in profiles to add JDK specific arguments to surefire --> <surefire-extra-args /> @@ -270,7 +272,7 @@ <buildnumber-maven-plugin.version>1.4</buildnumber-maven-plugin.version> <maven-compiler-plugin.version>3.6.1</maven-compiler-plugin.version> <maven-jar-plugin.version>3.0.2</maven-jar-plugin.version> - <maven-surefire-plugin.version>2.20</maven-surefire-plugin.version> + <maven-surefire-plugin.version>2.20.1</maven-surefire-plugin.version> <jacoco-maven-plugin.version>0.7.9</jacoco-maven-plugin.version> <maven-site-plugin.version>3.6</maven-site-plugin.version> <maven-changes-plugin.version>2.12.1</maven-changes-plugin.version> http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/README.md ---------------------------------------------------------------------- diff --git a/service/README.md b/service/README.md new file mode 100644 index 0000000..0de9b8a --- /dev/null +++ 
b/service/README.md @@ -0,0 +1,49 @@
+# Any23 Web Service
+
+This is the root dir of the Any23 Web-Service module.
+
+Apache Any23 provides a Web-Service that can be used to extract RDF from Web documents.
+
+## Generate Web-Service Packaging
+
+To generate the desired Web-service package, execute 'mvn package' from this directory.
+
+```
+$ cd $ANY23-HOME/service
+$ mvn package
+```
+From this directory it generates roughly the following...
+```
+.
+├── pom.xml
+├── README.md
+├── src
+│   ├── main
+│   │   ├── assembly
+│   │   ├── bin
+│   │   ├── java
+│   │   ├── resources
+│   │   └── webapp
+│   └── test
+│       ├── java
+│       └── resources
+└── target
+    ├── any23-service-${version}.war
+    ├── any23-service-${version}-without-deps.war
+    ├── apache-any23-service-${version}-bin-server-embedded.tar.gz <<<
+    ├── apache-any23-service-${version}-bin-server-embedded.zip <<<
+    ├── apache-any23-service-${version}-bin.tar.gz <<<
+    ├── apache-any23-service-${version}-bin-without-deps.tar.gz <<<
+    ├── apache-any23-service-${version}-bin-without-deps.zip <<<
+    ├── apache-any23-service-${version}-bin.zip <<<
+    ├── archive-tmp
+    ├── classes
+    ├── generated-sources
+    ├── maven-archiver
+    ├── maven-shared-archive-resources
+    ├── surefire
+    ├── surefire-reports
+    └── test-classes
+```
+
+Specific READMEs for each of the artifacts can be found in either ./target/*.tar.gz or ./target/*.zip (annotated above with '<<<'), which contain much more detailed information.
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/README.txt ---------------------------------------------------------------------- diff --git a/service/README.txt b/service/README.txt deleted file mode 100644 index a4d26d0..0000000 --- a/service/README.txt +++ /dev/null @@ -1,50 +0,0 @@ -============= -Any23 Web Service -============= - -This is the root dir of the Any23 Web-Service module. - -Apache Any23 provides a Web-Service that can be used to extract RDF from Web documents.
- -Generate Web-Service Packaging -=============================== - -To generate the desired Web-service package, execute 'mvn package' from this directory. - -$cd $ANY23-HOME/service -$ mvn package - -From this directory it generates: -. -âââ pom.xml -âââ README.txt -âââ src -â  âââ main -â  â  âââ assembly -â  â  âââ bin -â  â  âââ java -â  â  âââ resources -â  â  âââ webapp -â  âââ test -â  âââ java -â  âââ resources -âââ target - âââ any23-service-${version}.war - âââ any23-service-${version}-without-deps.war - âââ apache-any23-service-${version}-bin-server-embedded.tar.gz <<< - âââ apache-any23-service-${version}-bin-server-embedded.zip <<< - âââ apache-any23-service-${version}-bin.tar.gz <<< - âââ apache-any23-service-${version}-bin-without-deps.tar.gz <<< - âââ apache-any23-service-${version}-bin-without-deps.zip <<< - âââ apache-any23-service-${version}-bin.zip <<< - âââ archive-tmp - âââ classes - âââ generated-sources - âââ maven-archiver - âââ maven-shared-archive-resources - âââ surefire - âââ surefire-reports - âââ test-classes -... - -Specific README's can be found in either ./target/*.tar.gz || ./target/*.zip (annotated above with '<<<'), where much more detailed information sources can be located. http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/pom.xml ---------------------------------------------------------------------- diff --git a/service/pom.xml b/service/pom.xml index fe4911f..d5b275f 100644 --- a/service/pom.xml +++ b/service/pom.xml @@ -34,15 +34,41 @@ <properties> <!-- the following property is used in the bash script as well, don't remove it! 
--> <jetty.runner.version>8.1.16.v20140903</jetty.runner.version> + <output.directory>${project.build.directory}/${project.artifactId}-${project.version}/WEB-INF/lib/apache-any23-openie</output.directory> </properties> <dependencies> - <!-- Core Module --> + <!-- Any23 Modules --> <dependency> <groupId>org.apache.any23</groupId> <artifactId>apache-any23-core</artifactId> <version>${project.version}</version> </dependency> + <dependency> + <groupId>org.apache.any23.plugins</groupId> + <artifactId>apache-any23-openie</artifactId> + <version>${project.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.allenai.openie</groupId> + <artifactId>openie_2.11</artifactId> + <version>${openie_2.11.version}</version> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.allenai.openie</groupId> + <artifactId>openie_2.11</artifactId> + <version>${openie_2.11.version}</version> + <scope>provided</scope> + <type>pom</type> + </dependency> + <dependency> + <groupId>edu.washington.cs.knowitall</groupId> + <artifactId>openregex</artifactId> + <version>${openregex.version}</version> + <scope>provided</scope> + </dependency> <!-- Logging --> <dependency> @@ -181,6 +207,62 @@ </descriptors> </configuration> </plugin> + + <!-- Used to provide dynamic OpenIE toggling within service --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-dependency-plugin</artifactId> + <executions> + <execution> + <id>copy</id> + <phase>prepare-package</phase> + <goals> + <goal>copy</goal> + </goals> + <configuration> + <artifactItems> + <artifactItem> + <groupId>org.apache.any23.plugins</groupId> + <artifactId>apache-any23-openie</artifactId> + <version>${project.version}</version> + <outputDirectory>${output.directory}</outputDirectory> + </artifactItem> + <artifactItem> + <groupId>org.allenai.openie</groupId> + <artifactId>openie_2.11</artifactId> + <version>${openie_2.11.version}</version> + 
<outputDirectory>${output.directory}</outputDirectory> + </artifactItem> + <artifactItem> + <groupId>org.allenai.openie</groupId> + <artifactId>openie_2.11</artifactId> + <version>${openie_2.11.version}</version> + <type>pom</type> + <outputDirectory>${output.directory}</outputDirectory> + </artifactItem> + <artifactItem> + <groupId>edu.washington.cs.knowitall</groupId> + <artifactId>openregex</artifactId> + <version>${openregex.version}</version> + <outputDirectory>${output.directory}</outputDirectory> + </artifactItem> + </artifactItems> + <!-- other configurations here --> + </configuration> + </execution> + </executions> + </plugin> + + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <configuration> + <classpathDependencyExcludes> + <classpathDependencyExclude>org.apache.any23.plugins:apache-any23-openie</classpathDependencyExclude> + </classpathDependencyExcludes> + </configuration> + </plugin> + </plugins> </build> http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/src/main/java/org/apache/any23/servlet/Servlet.java ---------------------------------------------------------------------- diff --git a/service/src/main/java/org/apache/any23/servlet/Servlet.java b/service/src/main/java/org/apache/any23/servlet/Servlet.java index b63d052..1ab542c 100644 --- a/service/src/main/java/org/apache/any23/servlet/Servlet.java +++ b/service/src/main/java/org/apache/any23/servlet/Servlet.java @@ -20,6 +20,7 @@ package org.apache.any23.servlet; import org.apache.any23.configuration.DefaultConfiguration; import org.apache.any23.extractor.ExtractionParameters; import org.apache.any23.http.HTTPClient; +import org.apache.any23.plugin.Any23PluginManager; import org.apache.any23.servlet.conneg.Any23Negotiator; import org.apache.any23.servlet.conneg.MediaRangeSpec; import org.apache.any23.source.ByteArrayDocumentSource; @@ -35,6 +36,8 @@ import javax.servlet.ServletException; import 
javax.servlet.http.HttpServlet; import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletResponse; + +import java.io.File; import java.io.IOException; import java.net.URISyntaxException; import java.util.regex.Pattern; @@ -69,17 +72,43 @@ public class Servlet extends HttpServlet { final String format = getFormatFromRequestOrNegotiation(req); final boolean report = isReport(req); final boolean annotate = isAnnotated(req); + final boolean openie = isOpenIE(req); if (format == null) { - responder.sendError(406, "Client accept header does not include a supported output format", report); - return; + try { + responder.sendError(406, "Client accept header does not include a supported output format", report); + return; + } catch (IOException e) { + LOG.error("Unable to send error for null request format.", e); + } } final String uri = getInputIRIFromRequest(req); if (uri == null) { - responder.sendError(404, "Missing IRI in GET request. Try /format/http://example.com/myfile", report); - return; + try { + responder.sendError(404, "Missing IRI in GET request. Try /format/http://example.com/myfile", report); + return; + } catch (Exception e) { + LOG.error("Unable to send error for null request IRI.", e); + } + } + if (openie) { + Any23PluginManager pManager = Any23PluginManager.getInstance(); + //Dynamically adding Jar's to the Classpath via the following logic + //is absolutely dependant on the 'apache-any23-openie' directory being + //present within the webapp /lib directory. This is specified within + //the maven-dependency-plugin. 
+ File webappClasspath = new File(getClass().getClassLoader().getResource("").getPath()); + File openIEJarPath = new File(webappClasspath.getParentFile().getPath() + "/lib/apache-any23-openie"); + boolean loadedJars = pManager.loadJARDir(openIEJarPath); + if (loadedJars) { + LOG.info("Successful dynamic classloading of apache-any23-openie directory from webapp lib."); + } } final ExtractionParameters eps = getExtractionParameters(req); - responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate); + try { + responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate); + } catch (IOException e) { + LOG.error("Unable to run extraction on HTTPDocumentSource.", e); + } } @Override @@ -87,6 +116,7 @@ public class Servlet extends HttpServlet { final WebResponder responder = new WebResponder(this, resp); final boolean report = isReport(req); final boolean annotate = isAnnotated(req); + final boolean openie = isOpenIE(req); if (req.getContentType() == null) { responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report); return; @@ -97,6 +127,10 @@ public class Servlet extends HttpServlet { responder.sendError(406, "Client accept header does not include a supported output format", report); return; } + if (openie) { + Any23PluginManager pManager = Any23PluginManager.getInstance(); + pManager.loadJARDir(new File(getClass().getResource("apache-any23-openie").getPath())); + } final ExtractionParameters eps = getExtractionParameters(req); if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) { if (uri != null) { @@ -283,4 +317,8 @@ public class Servlet extends HttpServlet { return request.getParameter("annotate") != null; } + private boolean isOpenIE(HttpServletRequest request) { + return request.getParameter("openie") != null; + } + } \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/src/main/resources/form.html ---------------------------------------------------------------------- diff --git a/service/src/main/resources/form.html b/service/src/main/resources/form.html index 374d017..a5cf937 100644 --- a/service/src/main/resources/form.html +++ b/service/src/main/resources/form.html @@ -80,6 +80,15 @@ function showModal( id ) </div> </div> <div class="control-group"> + <label class="control-label" for="url-openie-get">OpenIE</label> + <div class="controls"> + <label class="checkbox"> + <input id="url-openie-get" type="checkbox" name="openie"> + <a href="javascript:showModal( '#sPopup-openie' );">[?]</a> + </label> + </div> + </div> + <div class="control-group"> <label class="control-label" for="url-validation-get">Validation</label> <div class="controls"> <select id="url-validation-get" name="validation-mode" onclick="if(document.getElementById('url-validation-get').value.indexOf('validate') == 0) { document.getElementById('url-report-get').checked = true; }"> @@ -103,7 +112,7 @@ function showModal( id ) <label class="control-label" for="url-annotate-get">Annotate</label> <div class="controls"> <label class="checkbox"> - <input id="url-annotate-get" type="checkbox" type="checkbox" name="annotate"> + <input id="url-annotate-get" type="checkbox" name="annotate"> <a href="javascript:showModal( '#sPopup-annotate' );">[?]</a> </label> </div> @@ -149,6 +158,15 @@ function showModal( id ) </div> </div> <div class="control-group"> + <label class="control-label" for="url-openie-post">OpenIE</label> + <div class="controls"> + <label class="checkbox"> + <input id="url-openie-post" type="checkbox" name="openie"> + <a href="javascript:showModal( '#sPopup-openie' );">[?]</a> + </label> + </div> + </div> + <div class="control-group"> <label class="control-label" for="url-validation-post">Validation</label> <div class="controls"> <select id="url-validation-post" name="validation-mode"
onclick="if(document.getElementById('url-validation-post').value.indexOf('validate') == 0) { document.getElementById('url-report-post').checked = true; }"> @@ -172,7 +190,7 @@ function showModal( id ) <label class="control-label" for="url-annotate-post">Annotate</label> <div class="controls"> <label class="checkbox"> - <input id="url-annotate-post" type="checkbox" type="checkbox" name="annotate"> + <input id="url-annotate-post" type="checkbox" name="annotate"> <a href="javascript:showModal( '#sPopup-annotate' );">[?]</a> </label> </div> @@ -224,8 +242,10 @@ function showModal( id ) </tr> <tr><th>annotate</th><td>If specified the output RDF will contain extractor specific scope comments.<br/>Possible values: <code>on</code>/<code>off</code></td></tr> <tr><th>report</th><td>If specified will produce a full XML report containing extraction and validation issues other than produced metadata.<br/>Possible values: <code>on</code>/<code>off</code></td></tr> + <tr><th>openie</th><td>If specified the <a href="https://github.com/allenai/openie-standalone" target="_blank"> + Open Information Extraction (Open IE) system</a> will be activated (default off).<br/>Possible values: <code>on</code>/<code>off</code></td></tr> </table> - Such URL will return an HTTP <i>302</i> redirect to <code><span class="app-base-uri">http://...</span>any23/<em>format</em></code>.<br/> + Formatting the URL according to the above will return an HTTP <i>302</i> redirect to <code><span class="app-base-uri">http://...</span>any23/<em>format</em></code>.<br/> <p>The response is the input document converted to the desired output format.</p> <h3>Direct POST API</h3> @@ -278,6 +298,8 @@ Content-Length: 174 </tr> <tr><th>annotate</th><td>If specified the output RDF will contain extractor specific scope comments.<br/>Possible values: <code>on</code>/<code>off</code></td></tr> <tr><th>report</th><td>If specified will produce a full XML report containing extraction and validation issues other than produced 
metadata.<br/>Possible values: <code>on</code>/<code>off</code></td></tr> + <tr><th>openie</th><td>If specified the <a href="https://github.com/allenai/openie-standalone" target="_blank"> + Open Information Extraction (Open IE) system</a> will be activated (default off).<br/>Possible values: <code>on</code>/<code>off</code></td></tr> </table> <h3>Output formats</h3> @@ -285,11 +307,11 @@ Content-Length: 174 <ul> <li><code>best</code> for content negotiation according to the client's <code>Accept</code> HTTP header</li> <li><code>turtle</code>, <code>ttl</code>, <code>n3</code> for - <a href="http://www.w3.org/TeamSubmission/turtle/" target="_blank">Turtle</a>/<a href="http://www.w3.org/DesignIssues/Notation3" target="_blank">N3</a></li> + <a href="https://www.w3.org/TR/turtle/" target="_blank">Turtle</a>/<a href="https://www.w3.org/TeamSubmission/n3/" target="_blank">N3</a></li> <li><code>ntriples</code>, <code>nt</code> for - <a href="http://www.w3.org/TR/rdf-testcases/#ntriples" target="_blank">N-Triples</a></li> + <a href="https://www.w3.org/TR/n-triples/" target="_blank">N-Triples</a></li> <li><code>nquads</code>, <code>nq</code> for - <a href="http://sw.deri.org/2008/07/n-quads/" target="_blank">N-Quads</a></li> + <a href="https://www.w3.org/TR/n-quads/" target="_blank">N-Quads</a></li> <li><code>trix</code> for <a href="http://www.w3.org/2004/03/trix/" target="_blank">TriX</a></li> <li><code>rdfxml</code>, <code>rdf</code>, <code>xml</code> for @@ -323,6 +345,27 @@ Content-Length: 174 <p><b>Apache Any23 v.${project.version} (${implementation.build.tstamp})</b></p> <p><a href="http://any23.apache.org/" target="_blank">Any23 project homepage</a> | Hosted at <a href="http://apache.org/" target="_blank">Apache Software Foundation</a></p> + <div id="sPopup-openie" class="modal hide fade"> + <div class="modal-header"> + <button type="button" class="close">×</button> + <h3>Open Information Extraction</h3> + </div> + <div class="modal-body"> + <p> + If the
<i>OpenIE</i> checkbox is selected, the <b>Any23</b> service will activate the + <a href="https://github.com/allenai/openie-standalone" target="_blank">Open Information Extraction (Open IE) system</a>, + enhancing extraction results.</p> + <p>The Open IE system runs over sentences and creates extractions that represent relations in text, in the case + of Any23, this results in triples. The confidence of relationships extracted from text are based on a + configurable threshold established in + <code>https://github.com/apache/any23/blob/master/api/src/main/resources/default-configuration.properties</code>. + </p> + </div> + <div class="modal-footer"> + <a href="#" class="btn">Close</a> + </div> + </div> + <div id="sPopup-fix" class="modal hide fade"> <div class="modal-header"> <button type="button" class="close" >×</button> @@ -330,8 +373,8 @@ Content-Length: 174 </div> <div class="modal-body"> <p> - The <b>Any23</b> service tries to fix some <a href="http://www.deri.ie/fileadmin/documents/DERI-TR-2009-07-28.pdf" target="_blank">common issues</a> - before performing a metadata extraction. The fixing is performed according a set of fully customizable rules. + The <b>Any23</b> service tries to fix some common issues before performing a metadata + extraction. The fixing is performed according a set of fully customizable rules. </p> <p> The following <i>Validation</i> options are available. http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/service/src/main/webapp/resources/js/bootstrap-modal.js ---------------------------------------------------------------------- diff --git a/service/src/main/webapp/resources/js/bootstrap-modal.js b/service/src/main/webapp/resources/js/bootstrap-modal.js index 38fd0c8..11b951e 100644 --- a/service/src/main/webapp/resources/js/bootstrap-modal.js +++ b/service/src/main/webapp/resources/js/bootstrap-modal.js @@ -17,11 +17,9 @@ * limitations under the License.
* ========================================================= */ - !function ($) { - "use strict"; // jshint ;_; - + "use strict"; /* MODAL CLASS DEFINITION * ====================== */ @@ -46,7 +44,8 @@ this.$element.trigger(e) - if (this.isShown || e.isDefaultPrevented()) return + if (this.isShown || e.isDefaultPrevented()) + return $('body').addClass('modal-open') @@ -85,7 +84,8 @@ this.$element.trigger(e) - if (!this.isShown || e.isDefaultPrevented()) return + if (!this.isShown || e.isDefaultPrevented()) + return this.isShown = false @@ -141,7 +141,8 @@ this.$backdrop.click($.proxy(this.hide, this)) } - if (doAnimate) this.$backdrop[0].offsetWidth // force reflow + if (doAnimate) + this.$backdrop[0].offsetWidth // force reflow this.$backdrop.addClass('in') @@ -186,9 +187,12 @@ var $this = $(this) , data = $this.data('modal') , options = $.extend({}, $.fn.modal.defaults, $this.data(), typeof option == 'object' && option) - if (!data) $this.data('modal', (data = new Modal(this, options))) - if (typeof option == 'string') data[option]() - else if (options.show) data.show() + if (!data) + $this.data('modal', (data = new Modal(this, options))) + if (typeof option == 'string') + data[option]() + else if (options.show) + data.show() }) } http://git-wip-us.apache.org/repos/asf/any23/blob/706e891c/src/site/apt/any23-plugins.apt ---------------------------------------------------------------------- diff --git a/src/site/apt/any23-plugins.apt b/src/site/apt/any23-plugins.apt index f429e2d..b79a27a 100644 --- a/src/site/apt/any23-plugins.apt +++ b/src/site/apt/any23-plugins.apt @@ -49,11 +49,10 @@ export CLASSPATH_PREFIX=../../../plugins/basic-crawler/target/any23-basic-crawle * adding its <JAR> to the <$HOME/.any23/plugins> directory. - A plugin can be added to the <Apache Any23 library API> by using the - {{{./apidocs/org/apache/any23/plugin/Any23PluginManager.html}Any23PluginManager}}#createInstance(Configuration configuration, File... pluginLocations) - method. 
- - TODO: plugin support in Apache Any23 Service + A plugin can be added to the <Apache Any23 library API> by first creating a static instance of + {{{./apidocs/org/apache/any23/plugin/Any23PluginManager.html}Any23PluginManager}}#getInstance(). + Once this is done there is a variety of options to configure and register a plugins, etc. An example + of dynamic plugin loading can be seen via the OpenIE toggle in the Any23 Service. Any implementation of <ExtractorPlugin> will automatically registered to the {{{./apidocs/org/apache/any23/extractor/ExtractorRegistry.html}ExtractorRegistry}}.
