Author: mattmann
Date: Mon Apr 27 01:19:05 2015
New Revision: 1676164
URL: http://svn.apache.org/r1676164
Log:
Fix for OODT-829 Implement an Apache Tika based Server Side Extractor (Radu
Manole via mattmann) this closes #19.
Added:
oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java
oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
Modified:
oodt/trunk/CHANGES.txt
oodt/trunk/filemgr/pom.xml
oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml
Modified: oodt/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/oodt/trunk/CHANGES.txt?rev=1676164&r1=1676163&r2=1676164&view=diff
==============================================================================
--- oodt/trunk/CHANGES.txt (original)
+++ oodt/trunk/CHANGES.txt Mon Apr 27 01:19:05 2015
@@ -2,6 +2,8 @@ Apache OODT Change Log
======================
Release 0.9 - Current Development
+* OODT-829 Implement an Apache Tika based Server Side Extractor (Radu Manole
via mattmann)
+
* OODT-826 Add the capability to check external preconditions before ingesting
a file.
* OODT-832 Move streaming items to seperate top-level component (starchmd)
Modified: oodt/trunk/filemgr/pom.xml
URL:
http://svn.apache.org/viewvc/oodt/trunk/filemgr/pom.xml?rev=1676164&r1=1676163&r2=1676164&view=diff
==============================================================================
--- oodt/trunk/filemgr/pom.xml (original)
+++ oodt/trunk/filemgr/pom.xml Mon Apr 27 01:19:05 2015
@@ -151,6 +151,11 @@
</repositories>
<dependencies>
<dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>1.7</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.oodt</groupId>
<artifactId>oodt-commons</artifactId>
<version>${project.parent.version}</version>
Added:
oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java
URL:
http://svn.apache.org/viewvc/oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java?rev=1676164&view=auto
==============================================================================
---
oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java
(added)
+++
oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java
Mon Apr 27 01:19:05 2015
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.filemgr.metadata.extractors.examples;
+
+//JDK imports
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+
+//OODT imports
+import
org.apache.oodt.cas.filemgr.metadata.extractors.AbstractFilemgrMetExtractor;
+import org.apache.oodt.cas.filemgr.structs.Product;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+
+//TIKA imports
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.*;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+
+/**
+ * Example file manager metadata extractor that runs Apache Tika's
+ * {@link AutoDetectParser} over a product's file and returns every metadata
+ * field reported by Tika as CAS {@link Metadata}.
+ */
+public class TikaAutoDetectExtractor extends AbstractFilemgrMetExtractor {
+
+  /** This extractor takes no configuration, so there is nothing to do. */
+  public void doConfigure() {
+  }
+
+  /**
+   * Builds a new {@link Metadata} object from the incoming metadata and the
+   * metadata Tika extracts from the product's file.
+   *
+   * @param product the product whose file is handed to Tika
+   * @param met metadata supplied by the caller; merged into the result first
+   * @return a new metadata object holding {@code met} merged with the Tika
+   *         fields
+   * @throws MetExtractionException if the file cannot be found, read or
+   *         parsed
+   */
+  public Metadata doExtract(Product product, Metadata met)
+      throws MetExtractionException {
+    Metadata outMetadata = new Metadata();
+
+    // merge(...) is inherited; NOTE(review): presumably copies the entries of
+    // its first argument into the second -- confirm against the superclass.
+    merge(met, outMetadata);
+    Metadata tikaMetadata = getMetadataFromTika(product);
+    merge(tikaMetadata, outMetadata);
+
+    return outMetadata;
+  }
+
+  /**
+   * Parses the product's file with Tika and converts the resulting Tika
+   * metadata into CAS metadata.
+   *
+   * @param product the product whose file is parsed
+   * @return the Tika metadata fields as CAS metadata
+   * @throws MetExtractionException wrapping the message of any file, I/O, SAX
+   *         or Tika failure
+   */
+  private Metadata getMetadataFromTika(Product product)
+      throws MetExtractionException {
+    try {
+      File file = getProductFile(product);
+      FileInputStream inputStream = new FileInputStream(file);
+      try {
+        org.apache.tika.metadata.Metadata tikaMetadata =
+            new org.apache.tika.metadata.Metadata();
+        Parser parser = new AutoDetectParser();
+        // DefaultHandler ignores all content events: only the metadata
+        // object filled in by the parser is of interest here.
+        parser.parse(inputStream, new DefaultHandler(), tikaMetadata,
+            new ParseContext());
+        return transform(tikaMetadata);
+      } finally {
+        // Tika does not close the stream it is given, so release the file
+        // handle here whether or not parsing succeeded.
+        try {
+          inputStream.close();
+        } catch (IOException ignored) {
+          // The stream was only read from; a failed close must not mask the
+          // parse result or the original exception.
+        }
+      }
+    } catch (FileNotFoundException e) {
+      throw new MetExtractionException(
+          "Unable to find file: Reason: " + e.getMessage());
+    } catch (TikaException e) {
+      throw new MetExtractionException(
+          "Unable to parse the document: Reason: " + e.getMessage());
+    } catch (SAXException e) {
+      throw new MetExtractionException(
+          " Unable to process the SAX events : Reason: " + e.getMessage());
+    } catch (IOException e) {
+      throw new MetExtractionException(
+          "Unable to read the document stream: Reason: " + e.getMessage());
+    }
+  }
+
+  /**
+   * Copies every named field of the Tika metadata, with all of its values,
+   * into a new CAS metadata object.
+   *
+   * @param tikaMetadata the metadata populated by the Tika parser
+   * @return a new CAS metadata object with the same names and values
+   */
+  private Metadata transform(org.apache.tika.metadata.Metadata tikaMetadata) {
+    Metadata metadata = new Metadata();
+
+    String[] names = tikaMetadata.names();
+    for (String name : names) {
+      metadata.addMetadata(name, Arrays.asList(tikaMetadata.getValues(name)));
+    }
+
+    return metadata;
+  }
+}
Modified: oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml
URL:
http://svn.apache.org/viewvc/oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml?rev=1676164&r1=1676163&r2=1676164&view=diff
==============================================================================
--- oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml
(original)
+++ oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml Mon
Apr 27 01:19:05 2015
@@ -89,6 +89,14 @@
</extractor>
-->
+
+ <!--
+ The below enables the TikaAutoDetectExtractor. It does not take any
configuration parameters
+ and simply runs Apache Tika on the reference for the Product and
assumes it's a single file
+ product (Product.STRUCTURE_FLAT).
+
+ <extractor
class="org.apache.oodt.cas.filemgr.metadata.extractors.examples.TikaAutoDetectExtractor"/>
+ -->
</metExtractors>
</type>
</cas:producttypes>
Added:
oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
URL:
http://svn.apache.org/viewvc/oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java?rev=1676164&view=auto
==============================================================================
---
oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
(added)
+++
oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
Mon Apr 27 01:19:05 2015
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.filemgr.metadata.extractors.examples;
+
+//JDK imports
+import java.net.URL;
+
+//Junit imports
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+// JUnit static imports
+import static junit.framework.Assert.*;
+
+//OODT imports
+import org.apache.oodt.cas.filemgr.structs.Product;
+import org.apache.oodt.cas.filemgr.structs.Reference;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+
+
+/**
+ * Runs {@link TikaAutoDetectExtractor} against the {@code /test.txt}
+ * classpath resource and checks that Tika produced metadata for it.
+ */
+@RunWith(JUnit4.class)
+public class TestTikaAutoDetectExtractor {
+
+  /**
+   * Extracts metadata for a flat, single-reference product and verifies that
+   * the result is non-empty and records a parser other than Tika's
+   * EmptyParser.
+   */
+  @Test
+  public void test() throws MetExtractionException {
+    TikaAutoDetectExtractor tikaExtractor = new TikaAutoDetectExtractor();
+
+    Metadata emptyMetadata = new Metadata();
+    Reference ref = new Reference();
+    URL file = this.getClass().getResource("/test.txt");
+    // Fail with a clear message instead of a NullPointerException when the
+    // fixture is missing from the test classpath.
+    assertNotNull("/test.txt was not found on the test classpath", file);
+    ref.setOrigReference(file.toString());
+    ref.setDataStoreReference(file.toString());
+
+    Product product = new Product();
+    product.getProductReferences().add(ref);
+    product.setProductStructure(Product.STRUCTURE_FLAT);
+
+    Metadata outputMetadata = tikaExtractor.doExtract(product, emptyMetadata);
+
+    assertNotNull(outputMetadata);
+    assertTrue(outputMetadata.getAllKeys().size() > 0);
+    assertTrue(outputMetadata.containsKey("X-Parsed-By"));
+    // Compare by value: '==' tests reference identity and would make this
+    // assertion pass regardless of the parser name.
+    assertFalse("org.apache.tika.parser.EmptyParser".equals(
+        outputMetadata.getMetadata("X-Parsed-By")));
+  }
+}