Author: jukka
Date: Tue Jan 26 22:54:57 2010
New Revision: 903470
URL: http://svn.apache.org/viewvc?rev=903470&view=rev
Log:
TIKA-141: Mime Content Type detection of a web document from its URL.
Add MetadataHelper class with centralized utility methods that pull out as much
input metadata as possible from a given File or URL when accessing its content.
Use those methods wherever appropriate.
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=903470&r1=903469&r2=903470&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
(original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
Tue Jan 26 22:54:57 2010
@@ -17,7 +17,6 @@
package org.apache.tika.cli;
import java.io.File;
-import java.io.FileInputStream;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
@@ -40,6 +39,7 @@
import org.apache.log4j.WriterAppender;
import org.apache.tika.gui.TikaGUI;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.MetadataHelper;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -159,17 +159,10 @@
InputStream input;
File file = new File(arg);
if (file.isFile()) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
- input = new FileInputStream(file);
+ input = MetadataHelper.getInputStream(file, metadata);
} else {
- URL url = new URL(arg);
- String path = url.getPath();
- int slash = path.lastIndexOf('/');
- String name = path.substring(slash + 1);
- if (name.length() > 0) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, name);
- }
- input = url.openStream();
+ input =
+ MetadataHelper.getInputStream(new URL(arg), metadata);
}
try {
parser.parse(
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java?rev=903470&r1=903469&r2=903470&view=diff
==============================================================================
---
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
(original)
+++
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
Tue Jan 26 22:54:57 2010
@@ -21,7 +21,7 @@
import java.awt.datatransfer.Transferable;
import java.awt.event.InputEvent;
import java.io.File;
-import java.io.FileInputStream;
+import java.io.InputStream;
import java.util.List;
import java.util.StringTokenizer;
import java.util.ArrayList;
@@ -32,11 +32,19 @@
import javax.swing.JComponent;
import javax.swing.TransferHandler;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.MetadataHelper;
+
/**
* Utility class that turns drag-and-drop events into Tika parse requests.
*/
class ParsingTransferHandler extends TransferHandler {
+ /**
+ * Serial version UID.
+ */
+ private static final long serialVersionUID = -557932290014044494L;
+
private final TransferHandler delegate;
private final TikaGUI tika;
@@ -65,20 +73,29 @@
return false;
}
+ @SuppressWarnings("unchecked")
public boolean importData(
JComponent component, Transferable transferable) {
try {
List<File> files = null;
if
(transferable.isDataFlavorSupported(DataFlavor.javaFileListFlavor)) {
- files = (List<File>)
transferable.getTransferData(DataFlavor.javaFileListFlavor);
+ files = (List<File>) transferable.getTransferData(
+ DataFlavor.javaFileListFlavor);
} else if (transferable.isDataFlavorSupported(urlListFlavor)) {
- tika.importStream(new URL((String)
transferable.getTransferData(urlListFlavor)).openStream());
+ Object data = transferable.getTransferData(urlListFlavor);
+ Metadata metadata = new Metadata();
+ InputStream stream = MetadataHelper.getInputStream(
+ new URL(data.toString()), metadata);
+ tika.importStream(stream, metadata);
} else if (transferable.isDataFlavorSupported(uriListFlavor)) {
files = uriToFileList((String)
transferable.getTransferData(uriListFlavor));
}
for (File file : files) {
- tika.importStream(new FileInputStream(file));
+ Metadata metadata = new Metadata();
+ InputStream stream =
+ MetadataHelper.getInputStream(file, metadata);
+ tika.importStream(stream, metadata);
}
return true;
} catch (Exception e) {
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=903470&r1=903469&r2=903470&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
(original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
Tue Jan 26 22:54:57 2010
@@ -57,6 +57,11 @@
public class TikaGUI extends JFrame {
/**
+ * Serial version UID.
+ */
+ private static final long serialVersionUID = 5883906936187059495L;
+
+ /**
* Main method. Sets the Swing look and feel to the operating system
* settings, and starts the Tika GUI with an {@link AutoDetectParser}
* instance as the default parser.
@@ -134,7 +139,8 @@
this.context.set(Parser.class, parser);
}
- public void importStream(InputStream input) throws IOException {
+ public void importStream(InputStream input, Metadata md)
+ throws IOException {
try {
StringWriter htmlBuffer = new StringWriter();
StringWriter textBuffer = new StringWriter();
@@ -145,7 +151,6 @@
getHtmlHandler(htmlBuffer),
getTextContentHandler(textBuffer),
getXmlContentHandler(xmlBuffer));
- Metadata md = new Metadata();
input = new ProgressMonitorInputStream(
this, "Parsing stream", input);
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=903470&r1=903469&r2=903470&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
(original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Tue Jan
26 22:54:57 2010
@@ -18,7 +18,6 @@
import java.io.BufferedInputStream;
import java.io.File;
-import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
@@ -29,6 +28,7 @@
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.MetadataHelper;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
@@ -139,9 +139,10 @@
* @throws IOException if the file can not be read
*/
public String detect(File file) throws FileNotFoundException, IOException {
- InputStream stream = new FileInputStream(file);
+ Metadata metadata = new Metadata();
+ InputStream stream = MetadataHelper.getInputStream(file, metadata);
try {
- return detect(stream, getFileMetadata(file));
+ return detect(stream, metadata);
} finally {
stream.close();
}
@@ -160,9 +161,10 @@
* @throws IOException if the resource can not be read
*/
public String detect(URL url) throws IOException {
- InputStream stream = url.openStream();
+ Metadata metadata = new Metadata();
+ InputStream stream = MetadataHelper.getInputStream(url, metadata);
try {
- return detect(stream, getUrlMetadata(url));
+ return detect(stream, metadata);
} finally {
stream.close();
}
@@ -225,7 +227,9 @@
* @throws IOException if the file can not be read or parsed
*/
public Reader parse(File file) throws FileNotFoundException, IOException {
- return parse(new FileInputStream(file), getFileMetadata(file));
+ Metadata metadata = new Metadata();
+ InputStream stream = MetadataHelper.getInputStream(file, metadata);
+ return parse(stream, metadata);
}
/**
@@ -237,7 +241,9 @@
* @throws IOException if the resource can not be read or parsed
*/
public Reader parse(URL url) throws IOException {
- return parse(url.openStream(), getUrlMetadata(url));
+ Metadata metadata = new Metadata();
+ InputStream stream = MetadataHelper.getInputStream(url, metadata);
+ return parse(stream, metadata);
}
/**
@@ -291,7 +297,9 @@
*/
public String parseToString(File file)
throws FileNotFoundException, IOException, TikaException {
- return parseToString(new FileInputStream(file), getFileMetadata(file));
+ Metadata metadata = new Metadata();
+ InputStream stream = MetadataHelper.getInputStream(file, metadata);
+ return parseToString(stream, metadata);
}
/**
@@ -304,23 +312,9 @@
* @throws TikaException if the resource can not be parsed
*/
public String parseToString(URL url) throws IOException, TikaException {
- return parseToString(url.openStream(), getUrlMetadata(url));
- }
-
- private static Metadata getFileMetadata(File file) {
Metadata metadata = new Metadata();
- metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
- return metadata;
- }
-
- private static Metadata getUrlMetadata(URL url) {
- Metadata metadata = new Metadata();
- String path = url.getPath();
- int slash = path.lastIndexOf('/');
- if (slash + 1 < path.length()) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash +
1));
- }
- return metadata;
+ InputStream stream = MetadataHelper.getInputStream(url, metadata);
+ return parseToString(stream, metadata);
}
}
Added:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java?rev=903470&view=auto
==============================================================================
---
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
(added)
+++
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
Tue Jan 26 22:54:57 2010
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+
+/**
+ * Collection of static helper methods for handling metadata.
+ *
+ * @since Apache Tika 0.7
+ */
+public class MetadataHelper {
+
+    /**
+     * Private constructor to prevent instantiation.
+     */
+    private MetadataHelper() {
+    }
+
+    /**
+     * Returns the contents of the given file, and records the file name and
+     * (non-zero) file length as metadata entries.
+     *
+     * @param file the file to be read
+     * @param metadata where the file metadata is stored
+     * @return file content
+     * @throws FileNotFoundException if the file does not exist
+     */
+    public static InputStream getInputStream(File file, Metadata metadata)
+            throws FileNotFoundException {
+        String name = file.getName();
+        if (name.length() > 0) {
+            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+        }
+
+        long length = file.length();
+        if (length > 0) {
+            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
+        }
+
+        return new FileInputStream(file);
+    }
+
+    /**
+     * Returns the content at the given URL, and records the resource name,
+     * content type, content encoding and content length when available.
+     *
+     * @param url the URL of the resource to be read
+     * @param metadata where the resource metadata is stored
+     * @return resource content
+     * @throws IOException if the URL can not be accessed
+     */
+    public static InputStream getInputStream(URL url, Metadata metadata)
+            throws IOException {
+        URLConnection connection = url.openConnection();
+
+        String path = url.getPath();
+        int slash = path.lastIndexOf('/');
+        if (slash + 1 < path.length()) { // works even with -1!
+            metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
+        }
+
+        String type = connection.getContentType();
+        if (type != null) {
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        }
+
+        String encoding = connection.getContentEncoding();
+        if (encoding != null) {
+            metadata.set(Metadata.CONTENT_ENCODING, encoding);
+        }
+
+        int length = connection.getContentLength();
+        if (length >= 0) {
+            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
+        }
+
+        return connection.getInputStream();
+    }
+
+}
Added:
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java?rev=903470&view=auto
==============================================================================
---
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
(added)
+++
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
Tue Jan 26 22:54:57 2010
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.io.File;
+import java.net.URL;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for {@link MetadataHelper}.
+ */
+public class TestMetadataHelper extends TestCase {
+
+    /** Checks that URL and File access record the same name and length. */
+    public void testGetInputStream() throws Exception {
+        URL resource = TestMetadataHelper.class.getResource("test.txt");
+        File resourceFile = new File(resource.toURI());
+
+        // Access through the URL
+        Metadata fromUrl = new Metadata();
+        MetadataHelper.getInputStream(resource, fromUrl).close();
+        assertEquals("test.txt", fromUrl.get(Metadata.RESOURCE_NAME_KEY));
+        String urlLength = fromUrl.get(Metadata.CONTENT_LENGTH);
+        assertEquals(String.valueOf(resourceFile.length()), urlLength);
+
+        // Access through the File
+        Metadata fromFile = new Metadata();
+        MetadataHelper.getInputStream(resourceFile, fromFile).close();
+        assertEquals("test.txt", fromFile.get(Metadata.RESOURCE_NAME_KEY));
+        String fileLength = fromFile.get(Metadata.CONTENT_LENGTH);
+        assertEquals(String.valueOf(resourceFile.length()), fileLength);
+    }
+}
Added:
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt?rev=903470&view=auto
==============================================================================
---
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
(added)
+++
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
Tue Jan 26 22:54:57 2010
@@ -0,0 +1 @@
+Hello, World!
\ No newline at end of file