Author: jukka
Date: Mon Jan 13 17:59:44 2014
New Revision: 1557795
URL: http://svn.apache.org/r1557795
Log:
TIKA-1217: Integrate with Java-7 FileTypeDetector API
Patch by Peter Ansell
Added:
tika/trunk/tika-java7/
tika/trunk/tika-java7/pom.xml
tika/trunk/tika-java7/src/
tika/trunk/tika-java7/src/main/
tika/trunk/tika-java7/src/main/java/
tika/trunk/tika-java7/src/main/java/org/
tika/trunk/tika-java7/src/main/java/org/apache/
tika/trunk/tika-java7/src/main/java/org/apache/tika/
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java
tika/trunk/tika-java7/src/main/resources/
tika/trunk/tika-java7/src/main/resources/META-INF/
tika/trunk/tika-java7/src/main/resources/META-INF/services/
tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector
tika/trunk/tika-java7/src/test/
tika/trunk/tika-java7/src/test/java/
tika/trunk/tika-java7/src/test/java/org/
tika/trunk/tika-java7/src/test/java/org/apache/
tika/trunk/tika-java7/src/test/java/org/apache/tika/
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java
tika/trunk/tika-java7/src/test/resources/
tika/trunk/tika-java7/src/test/resources/test-documents/
tika/trunk/tika-java7/src/test/resources/test-documents/test.html
Modified:
tika/trunk/pom.xml
Modified: tika/trunk/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/pom.xml?rev=1557795&r1=1557794&r2=1557795&view=diff
==============================================================================
--- tika/trunk/pom.xml (original)
+++ tika/trunk/pom.xml Mon Jan 13 17:59:44 2014
@@ -202,6 +202,15 @@ A release vote template has been generat
</plugins>
</build>
</profile>
+ <profile>
+ <id>java7</id>
+ <activation>
+ <jdk>[1.7,]</jdk>
+ </activation>
+ <modules>
+ <module>tika-java7</module>
+ </modules>
+ </profile>
</profiles>
<description>The Apache Tika™ toolkit detects and extracts metadata and
structured text content from various documents using existing parser libraries.
</description>
Added: tika/trunk/tika-java7/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-java7/pom.xml?rev=1557795&view=auto
==============================================================================
--- tika/trunk/tika-java7/pom.xml (added)
+++ tika/trunk/tika-java7/pom.xml Mon Jan 13 17:59:44 2014
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parent</artifactId>
+ <version>1.5-SNAPSHOT</version>
+ <relativePath>../tika-parent/pom.xml</relativePath>
+ </parent>
+
+ <artifactId>tika-java7</artifactId>
+ <packaging>bundle</packaging>
+
+ <name>Apache Tika Java-7 Components</name>
+ <description>Java-7 reliant components, including FileTypeDetector implementations</description>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ <version>1.7.4</version>
+ </plugin>
+ <plugin>
+ <!-- builds the bundle -->
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Export-Package>
+ org.apache.tika.filetypedetector
+ </Export-Package>
+ <Private-Package />
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-compiler-plugin</artifactId>
+ <version>3.1</version>
+ <configuration>
+ <source>1.7</source>
+ <target>1.7</target>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>${project.groupId}</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>${project.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>biz.aQute</groupId>
+ <artifactId>bndlib</artifactId>
+ <scope>provided</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ <version>4.11</version>
+ </dependency>
+ </dependencies>
+
+ <url>http://tika.apache.org/</url>
+ <organization>
+ <name>The Apache Software Foundation</name>
+ <url>http://www.apache.org</url>
+ </organization>
+ <scm>
+ <url>http://svn.apache.org/viewvc/tika/trunk/tika-java7</url>
+ <connection>scm:svn:http://svn.apache.org/repos/asf/tika/trunk/tika-java7</connection>
+ <developerConnection>scm:svn:https://svn.apache.org/repos/asf/tika/trunk/tika-java7</developerConnection>
+ </scm>
+ <issueManagement>
+ <system>JIRA</system>
+ <url>https://issues.apache.org/jira/browse/TIKA</url>
+ </issueManagement>
+ <ciManagement>
+ <system>Jenkins</system>
+ <url>https://builds.apache.org/job/Tika-trunk/</url>
+ </ciManagement>
+</project>
Added:
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java?rev=1557795&view=auto
==============================================================================
---
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java
(added)
+++
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/TikaFileTypeDetector.java
Mon Jan 13 17:59:44 2014
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.filetypedetector;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.FileSystems;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.spi.FileTypeDetector;
+
+import org.apache.tika.Tika;
+import org.apache.tika.mime.MimeTypes;
+
+/**
+ * {@link FileTypeDetector} implementation that delegates to the Apache Tika
+ * facade, so that {@code Files.probeContentType(Path)} can report types
+ * detected by Tika.
+ */
+public class TikaFileTypeDetector extends FileTypeDetector {
+
+    private final Tika tika = new Tika();
+
+    public TikaFileTypeDetector() {
+        super();
+    }
+
+    /**
+     * Probes the given path, first by name and then by content.
+     *
+     * @param path the file to probe
+     * @return the detected media type, or {@code null} when neither the name
+     *         nor the content yields anything other than
+     *         {@code application/octet-stream}
+     * @throws IOException if the file content cannot be read
+     */
+    @Override
+    public String probeContentType(Path path) throws IOException {
+        // Try to detect based on the file name only for efficiency
+        String fileNameDetect = tika.detect(path.toString());
+        if (!MimeTypes.OCTET_STREAM.equals(fileNameDetect)) {
+            return fileNameDetect;
+        }
+
+        // Then check the file content if necessary
+        String fileContentDetect = detectFromContent(path);
+        if (!MimeTypes.OCTET_STREAM.equals(fileContentDetect)) {
+            return fileContentDetect;
+        }
+
+        // Specification says to return null if we could not
+        // conclusively determine the file type
+        return null;
+    }
+
+    /**
+     * Runs content-based detection on the given path.
+     */
+    private String detectFromContent(Path path) throws IOException {
+        // Path.toFile() throws UnsupportedOperationException unless the path
+        // belongs to the default provider, so only use the File-based call
+        // there; this keeps detection on the default file system unchanged.
+        if (FileSystems.getDefault().provider().equals(path.getFileSystem().provider())) {
+            return tika.detect(path.toFile());
+        }
+
+        // Paths from other providers (for example a zip file system) are
+        // read through the NIO API instead.
+        try (InputStream stream = Files.newInputStream(path)) {
+            return tika.detect(stream);
+        }
+    }
+
+}
Added:
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java?rev=1557795&view=auto
==============================================================================
---
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java
(added)
+++
tika/trunk/tika-java7/src/main/java/org/apache/tika/filetypedetector/package-info.java
Mon Jan 13 17:59:44 2014
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Tika Java-7 FileTypeDetector implementations.
+ */
[email protected]("1.0.0")
+package org.apache.tika.filetypedetector;
\ No newline at end of file
Added:
tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector?rev=1557795&view=auto
==============================================================================
---
tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector
(added)
+++
tika/trunk/tika-java7/src/main/resources/META-INF/services/java.nio.file.spi.FileTypeDetector
Mon Jan 13 17:59:44 2014
@@ -0,0 +1 @@
+org.apache.tika.filetypedetector.TikaFileTypeDetector
\ No newline at end of file
Added:
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java?rev=1557795&view=auto
==============================================================================
---
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java
(added)
+++
tika/trunk/tika-java7/src/test/java/org/apache/tika/filetypedetector/TikaFileTypeDetectorTest.java
Mon Jan 13 17:59:44 2014
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.filetypedetector;
+
+import static org.junit.Assert.*;
+
+import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.spi.FileTypeDetector;
+import java.util.Iterator;
+import java.util.ServiceLoader;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Rule;
+import org.junit.Test;
+import org.junit.rules.TemporaryFolder;
+
+/**
+ * Tests for {@link TikaFileTypeDetector}: called directly, through
+ * {@link Files#probeContentType(Path)}, and through {@link ServiceLoader}
+ * discovery.
+ */
+public class TikaFileTypeDetectorTest {
+
+    @Rule
+    public TemporaryFolder tempDir = new TemporaryFolder();
+
+    private Path testDirectory = null;
+
+    private static final String TEST_CLASSPATH = "/test-documents/test.html";
+    private static final String TEST_HTML = "test.html";
+    private static final String TEST_UNRECOGNISED_EXTENSION = "test.unrecognisedextension";
+
+    /**
+     * Copies the HTML fixture into a fresh temporary directory twice: once
+     * with an {@code .html} name and once with an unknown extension, so that
+     * both name-based and content-based detection are exercised.
+     */
+    @Before
+    public void setUp() throws Exception {
+        testDirectory = tempDir.newFolder().toPath();
+        copyTestDocument(testDirectory.resolve(TEST_HTML));
+        copyTestDocument(testDirectory.resolve(TEST_UNRECOGNISED_EXTENSION));
+    }
+
+    @After
+    public void tearDown() throws Exception {
+        // Nothing to do: the TemporaryFolder rule removes the copied files.
+    }
+
+    @Test
+    public final void testDirectAccess() throws Exception {
+        String contentType = new TikaFileTypeDetector().probeContentType(testDirectory.resolve(TEST_HTML));
+        assertEquals("text/html", contentType);
+    }
+
+    @Test
+    public final void testFilesProbeContentTypePathExtension() throws Exception {
+        String contentType = Files.probeContentType(testDirectory.resolve(TEST_HTML));
+        assertEquals("text/html", contentType);
+    }
+
+    @Test
+    public final void testFilesProbeContentTypePathUnrecognised() throws Exception {
+        String contentType = Files.probeContentType(testDirectory.resolve(TEST_UNRECOGNISED_EXTENSION));
+        assertEquals("text/html", contentType);
+    }
+
+    @Test
+    public final void testMetaInfServicesLoad() throws Exception {
+        ServiceLoader<FileTypeDetector> serviceLoader = ServiceLoader.load(FileTypeDetector.class);
+
+        // Only require that the Tika provider is registered; other providers
+        // that happen to be on the classpath must not fail this test.
+        boolean tikaDetectorFound = false;
+        for (FileTypeDetector fileTypeDetector : serviceLoader) {
+            assertNotNull(fileTypeDetector);
+            if (fileTypeDetector instanceof TikaFileTypeDetector) {
+                tikaDetectorFound = true;
+            }
+        }
+        assertTrue("TikaFileTypeDetector is not registered as a FileTypeDetector service",
+                tikaDetectorFound);
+    }
+
+    /**
+     * Copies the classpath fixture to the given target and closes the
+     * resource stream afterwards; fails with a clear message if the fixture
+     * is missing from the classpath.
+     */
+    private void copyTestDocument(Path target) throws IOException {
+        try (java.io.InputStream in = getClass().getResourceAsStream(TEST_CLASSPATH)) {
+            assertNotNull("Missing test resource " + TEST_CLASSPATH, in);
+            Files.copy(in, target);
+        }
+    }
+}
Added: tika/trunk/tika-java7/src/test/resources/test-documents/test.html
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-java7/src/test/resources/test-documents/test.html?rev=1557795&view=auto
==============================================================================
--- tika/trunk/tika-java7/src/test/resources/test-documents/test.html (added)
+++ tika/trunk/tika-java7/src/test/resources/test-documents/test.html Mon Jan
13 17:59:44 2014
@@ -0,0 +1,10 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
+<html>
+<head>
+<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
+<title>Hello World</title>
+</head>
+<body>
+ <p>Hello World!<p/>
+</body>
+</html>
\ No newline at end of file