Author: jukka
Date: Tue Jan 26 23:12:34 2010
New Revision: 903481
URL: http://svn.apache.org/viewvc?rev=903481&view=rev
Log:
TIKA-141: Mime Content Type detection of a web document from its URL.
Java provides better input metadata for file:// URLs than it does for normal
Files, so let's just always use URLs!
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Jan 26 23:12:34 2010
@@ -156,14 +156,15 @@
System.in, type.getContentHandler(),
metadata, context);
} else {
- InputStream input;
+ URL url;
File file = new File(arg);
if (file.isFile()) {
- input = MetadataHelper.getInputStream(file, metadata);
+ url = file.toURI().toURL();
} else {
- input =
- MetadataHelper.getInputStream(new URL(arg), metadata);
+ url = new URL(arg);
}
+ InputStream input =
+ MetadataHelper.getInputStream(url, metadata);
try {
parser.parse(
input, type.getContentHandler(),
Modified:
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java (original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java Tue Jan 26 23:12:34 2010
@@ -88,13 +88,14 @@
new URL(data.toString()), metadata);
tika.importStream(stream, metadata);
} else if (transferable.isDataFlavorSupported(uriListFlavor)) {
- files = uriToFileList((String) transferable.getTransferData(uriListFlavor));
+ files = uriToFileList(
+ transferable.getTransferData(uriListFlavor).toString());
}
for (File file : files) {
Metadata metadata = new Metadata();
- InputStream stream =
- MetadataHelper.getInputStream(file, metadata);
+ InputStream stream = MetadataHelper.getInputStream(
+ file.toURI().toURL(), metadata);
tika.importStream(stream, metadata);
}
return true;
Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Tue Jan 26 23:12:34 2010
@@ -18,7 +18,6 @@
import java.io.BufferedInputStream;
import java.io.File;
-import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.Reader;
@@ -135,17 +134,10 @@
*
* @param file the file
* @return detected media type
- * @throws FileNotFoundException if the file does not exist
* @throws IOException if the file can not be read
*/
- public String detect(File file) throws FileNotFoundException, IOException {
- Metadata metadata = new Metadata();
- InputStream stream = MetadataHelper.getInputStream(file, metadata);
- try {
- return detect(stream, metadata);
- } finally {
- stream.close();
- }
+ public String detect(File file) throws IOException {
+ return detect(file.toURI().toURL());
}
/**
@@ -223,13 +215,10 @@
*
* @param file the file to be parsed
* @return extracted text content
- * @throws FileNotFoundException if the given file does not exist
* @throws IOException if the file can not be read or parsed
*/
- public Reader parse(File file) throws FileNotFoundException, IOException {
- Metadata metadata = new Metadata();
- InputStream stream = MetadataHelper.getInputStream(file, metadata);
- return parse(stream, metadata);
+ public Reader parse(File file) throws IOException {
+ return parse(file.toURI().toURL());
}
/**
@@ -291,15 +280,11 @@
*
* @param file the file to be parsed
* @return extracted text content
- * @throws FileNotFoundException if the file does not exist
* @throws IOException if the file can not be read
* @throws TikaException if the file can not be parsed
*/
- public String parseToString(File file)
- throws FileNotFoundException, IOException, TikaException {
- Metadata metadata = new Metadata();
- InputStream stream = MetadataHelper.getInputStream(file, metadata);
- return parseToString(stream, metadata);
+ public String parseToString(File file) throws IOException, TikaException {
+ return parseToString(file.toURI().toURL());
}
/**
Modified:
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java Tue Jan 26 23:12:34 2010
@@ -16,8 +16,6 @@
*/
package org.apache.tika.metadata;
-import java.io.File;
-import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
@@ -38,30 +36,6 @@
}
/**
- * Returns the contents of the given file, and sets any related metadata
- * entries.
- *
- * @param file the file to be read
- * @param metadata where the file metadata is stored
- * @return file content
- * @throws FileNotFoundException if the file does not exist
- */
- public static InputStream getInputStream(File file, Metadata metadata)
- throws FileNotFoundException {
- String name = file.getName();
- if (name.length() > 0) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, name);
- }
-
- long length = file.length();
- if (length > 0) {
- metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
- }
-
- return new FileInputStream(file);
- }
-
- /**
* Returns the content at the given URL, and sets any related
* metadata entries.
*
Modified:
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
URL:
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java Tue Jan 26 23:12:34 2010
@@ -28,21 +28,12 @@
public void testGetInputStream() throws Exception {
URL url = TestMetadataHelper.class.getResource("test.txt");
- File file = new File(url.toURI());
-
- Metadata urlMetadata = new Metadata();
- MetadataHelper.getInputStream(url, urlMetadata).close();
- assertEquals("test.txt", urlMetadata.get(Metadata.RESOURCE_NAME_KEY));
- assertEquals(
- Long.toString(file.length()),
- urlMetadata.get(Metadata.CONTENT_LENGTH));
-
- Metadata fileMetadata = new Metadata();
- MetadataHelper.getInputStream(file, fileMetadata).close();
- assertEquals("test.txt", fileMetadata.get(Metadata.RESOURCE_NAME_KEY));
+ Metadata metadata = new Metadata();
+ MetadataHelper.getInputStream(url, metadata).close();
+ assertEquals("test.txt", metadata.get(Metadata.RESOURCE_NAME_KEY));
assertEquals(
- Long.toString(file.length()),
- fileMetadata.get(Metadata.CONTENT_LENGTH));
+ Long.toString(new File(url.toURI()).length()),
+ metadata.get(Metadata.CONTENT_LENGTH));
}
}