Author: mattmann
Date: Mon Apr 27 01:19:05 2015
New Revision: 1676164

URL: http://svn.apache.org/r1676164
Log:
Fix for OODT-829 Implement an Apache Tika based Server Side Extractor (Radu 
Manole via mattmann) this closes #19.

Added:
    
oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java
    
oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
Modified:
    oodt/trunk/CHANGES.txt
    oodt/trunk/filemgr/pom.xml
    oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml

Modified: oodt/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/oodt/trunk/CHANGES.txt?rev=1676164&r1=1676163&r2=1676164&view=diff
==============================================================================
--- oodt/trunk/CHANGES.txt (original)
+++ oodt/trunk/CHANGES.txt Mon Apr 27 01:19:05 2015
@@ -2,6 +2,8 @@ Apache OODT Change Log
 ======================
 Release 0.9 - Current Development
 
+* OODT-829 Implement an Apache Tika based Server Side Extractor (Radu Manole 
via mattmann)
+
 * OODT-826 Add the capability to check external preconditions before ingesting 
a file.
 
 * OODT-832 Move streaming items to seperate top-level component (starchmd)

Modified: oodt/trunk/filemgr/pom.xml
URL: 
http://svn.apache.org/viewvc/oodt/trunk/filemgr/pom.xml?rev=1676164&r1=1676163&r2=1676164&view=diff
==============================================================================
--- oodt/trunk/filemgr/pom.xml (original)
+++ oodt/trunk/filemgr/pom.xml Mon Apr 27 01:19:05 2015
@@ -151,6 +151,11 @@
   </repositories>
   <dependencies>
     <dependency>
+    <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+      <version>1.7</version>
+    </dependency>
+    <dependency>
       <groupId>org.apache.oodt</groupId>
       <artifactId>oodt-commons</artifactId>
       <version>${project.parent.version}</version>    

Added: 
oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java
URL: 
http://svn.apache.org/viewvc/oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java?rev=1676164&view=auto
==============================================================================
--- 
oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java
 (added)
+++ 
oodt/trunk/filemgr/src/main/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TikaAutoDetectExtractor.java
 Mon Apr 27 01:19:05 2015
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.filemgr.metadata.extractors.examples;
+
+//JDK imports
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.Arrays;
+
+//OODT imports
+import 
org.apache.oodt.cas.filemgr.metadata.extractors.AbstractFilemgrMetExtractor;
+import org.apache.oodt.cas.filemgr.structs.Product;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+
+//TIKA imports
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.*;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+
+/**
+ * Example file manager metadata extractor that runs Apache Tika's
+ * {@link AutoDetectParser} over a product's file and adds every metadata
+ * field reported by Tika to the extracted metadata.
+ *
+ * <p>The extractor takes no configuration; {@link #doConfigure()} is a
+ * no-op.</p>
+ */
+public class TikaAutoDetectExtractor extends AbstractFilemgrMetExtractor  {
+
+    /**
+     * Nothing to configure for this extractor.
+     */
+    public void doConfigure() {
+    }
+
+    /**
+     * Builds a new {@link Metadata} by merging, in order, the incoming
+     * metadata and the metadata Tika extracts from the product's file.
+     *
+     * @param product the product whose file is handed to Tika
+     * @param met     metadata supplied by the caller; merged into the result
+     *                before the Tika metadata
+     * @return a new metadata object holding the merged result
+     * @throws MetExtractionException if the file cannot be found, read or
+     *                                parsed
+     */
+    public Metadata doExtract(Product product, Metadata met) throws MetExtractionException {
+        Metadata outMetadata = new Metadata();
+
+        merge(met, outMetadata);
+        Metadata tikaMetadata = getMetadataFromTika(product);
+        merge(tikaMetadata, outMetadata);
+
+        return outMetadata;
+    }
+
+    /**
+     * Parses the product's file with Tika and converts the resulting Tika
+     * metadata into a CAS {@link Metadata} object.
+     *
+     * @param product the product whose file is parsed
+     * @return the metadata reported by Tika for the file
+     * @throws MetExtractionException if the file is missing, unreadable, or
+     *                                Tika fails to parse it
+     */
+    private Metadata getMetadataFromTika(Product product) throws MetExtractionException {
+        FileInputStream inputStream = null;
+        try {
+            File file = getProductFile(product);
+            inputStream = new FileInputStream(file);
+            org.apache.tika.metadata.Metadata tikaMetadata = new org.apache.tika.metadata.Metadata();
+            Parser parser = new AutoDetectParser();
+            // DefaultHandler ignores all SAX events: the document body is
+            // discarded and only the metadata object is populated.
+            parser.parse(inputStream, new DefaultHandler(), tikaMetadata, new ParseContext());
+            return transform(tikaMetadata);
+
+        } catch (FileNotFoundException e) {
+            throw new MetExtractionException(
+                    "Unable to find file: Reason: " + e.getMessage());
+        } catch (TikaException e) {
+            throw new MetExtractionException(
+                    "Unable to parse the document: Reason: " + e.getMessage());
+        } catch (SAXException e) {
+            throw new MetExtractionException(
+                    " Unable to process the SAX events : Reason: " + e.getMessage());
+        } catch (IOException e) {
+            throw new MetExtractionException(
+                    "Unable to read the document stream: Reason: " + e.getMessage());
+        } finally {
+            // Tika consumes the stream but leaves closing it to the caller;
+            // without this every extraction leaks a file handle.
+            closeQuietly(inputStream);
+        }
+    }
+
+    /**
+     * Closes the given stream if it was opened, ignoring any failure on
+     * close so it cannot mask the outcome of the parse.
+     *
+     * @param stream the stream to close; may be {@code null}
+     */
+    private static void closeQuietly(FileInputStream stream) {
+        if (stream == null) {
+            return;
+        }
+        try {
+            stream.close();
+        } catch (IOException ignored) {
+            // Nothing useful can be done if closing a read-only stream fails.
+        }
+    }
+
+    /**
+     * Copies every name and all of its values from a Tika metadata object
+     * into a new CAS {@link Metadata} object.
+     *
+     * @param tikaMetadata the metadata populated by Tika
+     * @return a CAS metadata object with the same names and values
+     */
+    private Metadata transform(org.apache.tika.metadata.Metadata tikaMetadata){
+        Metadata metadata = new Metadata();
+
+        String[] names = tikaMetadata.names();
+        for (String name : names){
+            metadata.addMetadata(name, Arrays.asList(tikaMetadata.getValues(name)));
+        }
+
+        return metadata;
+    }
+}

Modified: oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml
URL: 
http://svn.apache.org/viewvc/oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml?rev=1676164&r1=1676163&r2=1676164&view=diff
==============================================================================
--- oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml 
(original)
+++ oodt/trunk/filemgr/src/main/resources/examples/core/product-types.xml Mon 
Apr 27 01:19:05 2015
@@ -89,6 +89,14 @@
         </extractor>
       
        -->
+
+     <!--
+       The below enables the TikaAutoDetectExtractor. It does not take any 
configuration parameters
+       and simply runs Apache Tika on the reference for the Product and 
assumes it's a single file 
+       product (Product.STRUCTURE_FLAT).
+
+       <extractor 
class="org.apache.oodt.cas.filemgr.metadata.extractors.examples.TikaAutoDetectExtractor"/>
+      -->     
     </metExtractors>
   </type>
 </cas:producttypes>

Added: 
oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
URL: 
http://svn.apache.org/viewvc/oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java?rev=1676164&view=auto
==============================================================================
--- 
oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
 (added)
+++ 
oodt/trunk/filemgr/src/test/java/org/apache/oodt/cas/filemgr/metadata/extractors/examples/TestTikaAutoDetectExtractor.java
 Mon Apr 27 01:19:05 2015
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oodt.cas.filemgr.metadata.extractors.examples;
+
+//JDK imports
+import java.net.URL;
+
+//Junit imports
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+// JUnit static imports
+import static junit.framework.Assert.*;
+
+//OODT imports
+import org.apache.oodt.cas.filemgr.structs.Product;
+import org.apache.oodt.cas.filemgr.structs.Reference;
+import org.apache.oodt.cas.metadata.Metadata;
+import org.apache.oodt.cas.metadata.exceptions.MetExtractionException;
+
+
+/**
+ * Checks that {@link TikaAutoDetectExtractor} returns Tika-derived metadata
+ * for the {@code /test.txt} classpath resource.
+ */
+@RunWith(JUnit4.class)
+public class TestTikaAutoDetectExtractor {
+
+    /**
+     * Runs the extractor on a flat, single-reference product and verifies
+     * that Tika reported a parser other than its empty fallback parser.
+     *
+     * @throws MetExtractionException if extraction fails
+     */
+    @Test
+    public void test() throws MetExtractionException {
+        TikaAutoDetectExtractor tikaExtractor = new TikaAutoDetectExtractor();
+
+        Metadata emptyMetadata = new Metadata();
+        Reference ref = new Reference();
+        URL file = this.getClass().getResource("/test.txt");
+        // Fail with a clear message instead of a NullPointerException when
+        // the fixture is missing from the test classpath.
+        assertNotNull("test resource /test.txt not found on the classpath", file);
+        ref.setOrigReference(file.toString());
+        ref.setDataStoreReference(file.toString());
+
+        Product product = new Product();
+        product.getProductReferences().add(ref);
+        product.setProductStructure(Product.STRUCTURE_FLAT);
+
+        Metadata outputMetadata = tikaExtractor.doExtract(product, emptyMetadata);
+
+        assertNotNull(outputMetadata);
+        assertTrue(outputMetadata.getAllKeys().size() > 0);
+        assertTrue(outputMetadata.containsKey("X-Parsed-By"));
+        // Compare by value: '==' on strings compares references, which made
+        // this assertion pass regardless of which parser handled the file.
+        assertFalse("org.apache.tika.parser.EmptyParser".equals(
+                outputMetadata.getMetadata("X-Parsed-By")));
+    }
+}


Reply via email to