Author: jukka
Date: Tue Jan 26 23:12:34 2010
New Revision: 903481

URL: http://svn.apache.org/viewvc?rev=903481&view=rev
Log:
TIKA-141: Mime Content Type detection of a web document from its URL.

Java provides better input metadata for file:// URLs than it does for normal 
Files, so let's just always use URLs!

Modified:
    lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
    lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java

Modified: lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java (original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java Tue Jan 26 23:12:34 2010
@@ -156,14 +156,15 @@
                         System.in, type.getContentHandler(),
                         metadata, context);
             } else {
-                InputStream input;
+                URL url;
                 File file = new File(arg);
                 if (file.isFile()) {
-                    input = MetadataHelper.getInputStream(file, metadata);
+                    url = file.toURI().toURL();
                 } else {
-                    input =
-                        MetadataHelper.getInputStream(new URL(arg), metadata);
+                    url = new URL(arg);
                 }
+                InputStream input =
+                    MetadataHelper.getInputStream(url, metadata);
                 try {
                     parser.parse(
                             input, type.getContentHandler(),

Modified: lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java (original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java Tue Jan 26 23:12:34 2010
@@ -88,13 +88,14 @@
                         new URL(data.toString()), metadata);
                 tika.importStream(stream, metadata);
             } else if (transferable.isDataFlavorSupported(uriListFlavor)) {
-                files = uriToFileList((String) transferable.getTransferData(uriListFlavor));
+                files = uriToFileList(
+                        transferable.getTransferData(uriListFlavor).toString());
             }
 
             for (File file : files) {
                 Metadata metadata = new Metadata();
-                InputStream stream =
-                    MetadataHelper.getInputStream(file, metadata);
+                InputStream stream = MetadataHelper.getInputStream(
+                        file.toURI().toURL(), metadata);
                 tika.importStream(stream, metadata);
             }
             return true;

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Tue Jan 26 23:12:34 2010
@@ -18,7 +18,6 @@
 
 import java.io.BufferedInputStream;
 import java.io.File;
-import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.Reader;
@@ -135,17 +134,10 @@
      *
      * @param file the file
      * @return detected media type
-     * @throws FileNotFoundException if the file does not exist
      * @throws IOException if the file can not be read
      */
-    public String detect(File file) throws FileNotFoundException, IOException {
-        Metadata metadata = new Metadata();
-        InputStream stream = MetadataHelper.getInputStream(file, metadata);
-        try {
-            return detect(stream, metadata);
-        } finally {
-            stream.close();
-        }
+    public String detect(File file) throws IOException {
+        return detect(file.toURI().toURL());
     }
 
     /**
@@ -223,13 +215,10 @@
      *
      * @param file the file to be parsed
      * @return extracted text content
-     * @throws FileNotFoundException if the given file does not exist
      * @throws IOException if the file can not be read or parsed
      */
-    public Reader parse(File file) throws FileNotFoundException, IOException {
-        Metadata metadata = new Metadata();
-        InputStream stream = MetadataHelper.getInputStream(file, metadata);
-        return parse(stream, metadata);
+    public Reader parse(File file) throws IOException {
+        return parse(file.toURI().toURL());
     }
 
     /**
@@ -291,15 +280,11 @@
      *
      * @param file the file to be parsed
      * @return extracted text content
-     * @throws FileNotFoundException if the file does not exist
      * @throws IOException if the file can not be read
      * @throws TikaException if the file can not be parsed
      */
-    public String parseToString(File file)
-            throws FileNotFoundException, IOException, TikaException {
-        Metadata metadata = new Metadata();
-        InputStream stream = MetadataHelper.getInputStream(file, metadata);
-        return parseToString(stream, metadata);
+    public String parseToString(File file) throws IOException, TikaException {
+        return parseToString(file.toURI().toURL());
     }
 
     /**

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java (original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java Tue Jan 26 23:12:34 2010
@@ -16,8 +16,6 @@
  */
 package org.apache.tika.metadata;
 
-import java.io.File;
-import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
@@ -38,30 +36,6 @@
     }
 
     /**
-     * Returns the contents of the given file, and sets any related metadata
-     * entries.
-     *
-     * @param file the file to be read
-     * @param metadata where the file metadata is stored
-     * @return file content
-     * @throws FileNotFoundException if the file does not exist
-     */
-    public static InputStream getInputStream(File file, Metadata metadata)
-            throws FileNotFoundException {
-        String name = file.getName();
-        if (name.length() > 0) {
-            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
-        }
-
-        long length = file.length();
-        if (length > 0) {
-            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
-        }
-
-        return new FileInputStream(file);
-    }
-
-    /**
      * Returns the content at the given URL, and sets any related
      * metadata entries.
      *

Modified: lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
URL: http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java?rev=903481&r1=903480&r2=903481&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java (original)
+++ lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java Tue Jan 26 23:12:34 2010
@@ -28,21 +28,12 @@
 
     public void testGetInputStream() throws Exception {
         URL url = TestMetadataHelper.class.getResource("test.txt");
-        File file = new File(url.toURI());
-
-        Metadata urlMetadata = new Metadata();
-        MetadataHelper.getInputStream(url, urlMetadata).close();
-        assertEquals("test.txt", urlMetadata.get(Metadata.RESOURCE_NAME_KEY));
-        assertEquals(
-                Long.toString(file.length()),
-                urlMetadata.get(Metadata.CONTENT_LENGTH));
-
-        Metadata fileMetadata = new Metadata();
-        MetadataHelper.getInputStream(file, fileMetadata).close();
-        assertEquals("test.txt", fileMetadata.get(Metadata.RESOURCE_NAME_KEY));
+        Metadata metadata = new Metadata();
+        MetadataHelper.getInputStream(url, metadata).close();
+        assertEquals("test.txt", metadata.get(Metadata.RESOURCE_NAME_KEY));
         assertEquals(
-                Long.toString(file.length()),
-                fileMetadata.get(Metadata.CONTENT_LENGTH));
+                Long.toString(new File(url.toURI()).length()),
+                metadata.get(Metadata.CONTENT_LENGTH));
     }
 
 }


Reply via email to