Author: jukka
Date: Mon Jan 13 17:59:44 2014
New Revision: 1557795

URL: http://svn.apache.org/r1557795
Log:
TIKA-1217: Integrate with Java-7 FileTypeDetector API

Patch by Peter Ansell

Added:
    tika/trunk/tika-java7/
    tika/trunk/tika-java7/pom.xml
    tika/trunk/tika-java7/src/
    tika/trunk/tika-java7/src/main/
    tika/trunk/tika-java7/src/main/java/
    tika/trunk/tika-java7/src/main/java/org/
    tika/trunk/tika-java7/src/main/java/org/apache/
    tika/trunk/tika-java7/src/main/java/org/apache/tika/
    tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/
    
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java
    
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java
    tika/trunk/tika-java7/src/main/resources/
    tika/trunk/tika-java7/src/main/resources/META-INF/
    tika/trunk/tika-java7/src/main/resources/META-INF/services/
    
tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector
    tika/trunk/tika-java7/src/test/
    tika/trunk/tika-java7/src/test/java/
    tika/trunk/tika-java7/src/test/java/org/
    tika/trunk/tika-java7/src/test/java/org/apache/
    tika/trunk/tika-java7/src/test/java/org/apache/tika/
    tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/
    
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java
    tika/trunk/tika-java7/src/test/resources/
    tika/trunk/tika-java7/src/test/resources/test-documents/
    tika/trunk/tika-java7/src/test/resources/test-documents/test.html
Modified:
    tika/trunk/pom.xml

Modified: tika/trunk/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/pom.xml?rev=1557795&r1=1557794&r2=1557795&view=diff
==============================================================================
--- tika/trunk/pom.xml (original)
+++ tika/trunk/pom.xml Mon Jan 13 17:59:44 2014
@@ -202,6 +202,15 @@ A release vote template has been generat
         </plugins>
       </build>
     </profile>
+    <profile>
+      <id>java7</id>
+      <activation>
+        <jdk>[1.7,]</jdk>
+      </activation>
+      <modules>
+        <module>tika-java7</module>
+      </modules>
+    </profile>
   </profiles>
 
   <description>The Apache Tika™ toolkit detects and extracts metadata and 
structured text content from various documents using existing parser libraries. 
</description>

Added: tika/trunk/tika-java7/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-java7/pom.xml?rev=1557795&view=auto
==============================================================================
--- tika/trunk/tika-java7/pom.xml (added)
+++ tika/trunk/tika-java7/pom.xml Mon Jan 13 17:59:44 2014
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <groupId>org.apache.tika</groupId>
+    <artifactId>tika-parent</artifactId>
+    <version>1.5-SNAPSHOT</version>
+    <relativePath>../tika-parent/pom.xml</relativePath>
+  </parent>
+
+  <artifactId>tika-java7</artifactId>
+  <packaging>bundle</packaging>
+
+  <name>Apache Tika Java-7 Components</name>
+  <description>Java-7 reliant components, including FileTypeDetector 
implementations</description>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-scr-plugin</artifactId>
+        <version>1.7.4</version>
+      </plugin>
+      <plugin>
+        <!-- builds the bundle -->
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Export-Package>
+              org.apache.tika.filetypedetector
+            </Export-Package>
+            <Private-Package />
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-compiler-plugin</artifactId>
+        <version>3.1</version>
+        <configuration>
+          <source>1.7</source>
+          <target>1.7</target>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-core</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-parsers</artifactId>
+      <version>${project.version}</version>
+    </dependency>
+    <dependency>
+      <groupId>biz.aQute</groupId>
+      <artifactId>bndlib</artifactId>
+      <scope>provided</scope>
+    </dependency>
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+      <version>4.11</version>
+    </dependency>
+  </dependencies>
+
+  <url>http://tika.apache.org/</url>
+  <organization>
+       <name>The Apache Software Foundation</name>
+       <url>http://www.apache.org</url>
+  </organization>
+  <scm>
+       <url>http://svn.apache.org/viewvc/tika/trunk/tika-java7</url>
+       
<connection>scm:svn:http://svn.apache.org/repos/asf/tika/trunk/tika-java7</connection>
+       
<developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/trunk/tika-java7</developerConnection>
+  </scm>
+  <issueManagement>
+       <system>JIRA</system>
+       <url>https://issues.apache.org/jira/browse/TIKA</url>
+  </issueManagement>
+  <ciManagement>
+       <system>Jenkins</system>
+       <url>https://builds.apache.org/job/Tika-trunk/</url>
+  </ciManagement>
+</project>

Added: 
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java?rev=1557795&view=auto
==============================================================================
--- 
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java
 (added)
+++ 
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java
 Mon Jan 13 17:59:44 2014
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.filetypedetector;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.spi.FileTypeDetector;
+
+import org.apache.tika.Tika;
+import org.apache.tika.mime.MimeTypes;
+
+/**
+ * {@link FileTypeDetector} implementation that delegates to the {@link Tika}
+ * facade.
+ * <p>
+ * Detection is attempted on the path name first; the file content is only
+ * opened when the name alone yields {@link MimeTypes#OCTET_STREAM}.
+ */
+public class TikaFileTypeDetector extends FileTypeDetector {
+    private final Tika tika = new Tika();
+    
+    public TikaFileTypeDetector() {
+        super();
+    }
+    
+    /**
+     * Probes the given file to guess its content type.
+     *
+     * @param path the path of the file to probe
+     * @return the detected media type, or {@code null} when neither the
+     *         name nor the content gives anything other than
+     *         {@link MimeTypes#OCTET_STREAM}
+     * @throws IOException if the file content has to be read and cannot be
+     */
+    @Override
+    public String probeContentType(Path path) throws IOException {
+        // Try to detect based on the file name only for efficiency
+        String fileNameDetect = tika.detect(path.toString());
+        if (!MimeTypes.OCTET_STREAM.equals(fileNameDetect)) {
+            return fileNameDetect;
+        }
+        
+        // Then check the file content if necessary. The stream is opened
+        // through the Path's own file system provider rather than via
+        // Path.toFile(), which throws UnsupportedOperationException for
+        // paths that do not belong to the default provider.
+        String fileContentDetect;
+        try (InputStream stream = new BufferedInputStream(Files.newInputStream(path))) {
+            fileContentDetect = tika.detect(stream);
+        }
+        if (!MimeTypes.OCTET_STREAM.equals(fileContentDetect)) {
+            return fileContentDetect;
+        }
+        
+        // Specification says to return null if we could not 
+        // conclusively determine the file type
+        return null;
+    }
+    
+}

Added: 
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java?rev=1557795&view=auto
==============================================================================
--- 
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java
 (added)
+++ 
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java
 Mon Jan 13 17:59:44 2014
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tika Java-7 FileTypeDetector implementations.
+ */
+@aQute.bnd.annotation.Version("1.0.0")
+package org.apache.tika.filetypedetector;
\ No newline at end of file

Added: 
tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector?rev=1557795&view=auto
==============================================================================
--- 
tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector
 (added)
+++ 
tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector
 Mon Jan 13 17:59:44 2014
@@ -0,0 +1 @@
+org.apache.tika.filetypedetector.TikaFileTypeDetector
\ No newline at end of file

Added: 
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java?rev=1557795&view=auto
==============================================================================
--- 
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java
 (added)
+++ 
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java
 Mon Jan 13 17:59:44 2014
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.filetypedetector;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.spi.FileTypeDetector;
+import java.util.Iterator;
+import java.util.ServiceLoader;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+public class TikaFileTypeDetectorTest {
+    
+    @Rule
+    public TemporaryFolder tempDir = new TemporaryFolder();
+    
+    private Path testDirectory = null;
+    
+    private static final String TEST_CLASSPATH = "/test-documents/test.html";
+    private static final String TEST_HTML = "test.html";
+    private static final String TEST_UNRECOGNISED_EXTENSION = 
"test.unrecognisedextension";
+    
+    @Before
+    public void setUp() throws Exception {
+        testDirectory = tempDir.newFolder().toPath();
+        Files.copy(this.getClass().getResourceAsStream(TEST_CLASSPATH), 
+                testDirectory.resolve(TEST_HTML));
+        Files.copy(this.getClass().getResourceAsStream(TEST_CLASSPATH), 
+                testDirectory.resolve(TEST_UNRECOGNISED_EXTENSION));
+    }
+    
+    @After
+    public void tearDown() throws Exception {
+    }
+    
+    @Test
+    public final void testDirectAccess() throws Exception {
+        String contentType = new 
TikaFileTypeDetector().probeContentType(testDirectory.resolve(TEST_HTML));
+        assertNotNull(contentType);
+        assertEquals("text/html", contentType);
+    }
+    
+    @Test
+    public final void testFilesProbeContentTypePathExtension() throws 
Exception {
+        String contentType = 
Files.probeContentType(testDirectory.resolve(TEST_HTML));
+        assertNotNull(contentType);
+        assertEquals("text/html", contentType);
+    }
+    
+    @Test
+    public final void testFilesProbeContentTypePathUnrecognised() throws 
Exception {
+        String contentType = 
Files.probeContentType(testDirectory.resolve(TEST_UNRECOGNISED_EXTENSION));
+        assertNotNull(contentType);
+        assertEquals("text/html", contentType);
+    }
+    
+    @Test
+    public final void testMetaInfServicesLoad() throws Exception {
+        ServiceLoader<FileTypeDetector> serviceLoader = 
ServiceLoader.load(FileTypeDetector.class);
+        
+        Iterator<FileTypeDetector> iterator = serviceLoader.iterator();
+        assertTrue(iterator.hasNext());
+        
+        while(iterator.hasNext()) {
+            FileTypeDetector fileTypeDetector = iterator.next();
+            assertNotNull(fileTypeDetector);
+            assertTrue(fileTypeDetector instanceof TikaFileTypeDetector);
+        }
+    }
+}

Added: tika/trunk/tika-java7/src/test/resources/test-documents/test.html
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/test/resources/test-documents/test.html?rev=1557795&view=auto
==============================================================================
--- tika/trunk/tika-java7/src/test/resources/test-documents/test.html (added)
+++ tika/trunk/tika-java7/src/test/resources/test-documents/test.html Mon Jan 
13 17:59:44 2014
@@ -0,0 +1,10 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
+<title>Hello World</title>
+</head>
+<body>
+  <p>Hello World!<p/>
+</body>
+</html>
\ No newline at end of file


Reply via email to