Author: jukka
Date: Tue Jan 26 22:54:57 2010
New Revision: 903470

URL: http://svn.apache.org/viewvc?rev=903470&view=rev
Log:
TIKA-141: Mime Content Type detection of a web document from its URL.

Add MetadataHelper class with centralized utility methods that pull out as much 
input metadata as possible from a given File or URL when accessing its content. 
Use those methods wherever appropriate.

Added:
    
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
    
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
    lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/
    
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
Modified:
    lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
    
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
    lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
    lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java

Modified: 
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java?rev=903470&r1=903469&r2=903470&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
(original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
Tue Jan 26 22:54:57 2010
@@ -17,7 +17,6 @@
 package org.apache.tika.cli;
 
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintStream;
@@ -40,6 +39,7 @@
 import org.apache.log4j.WriterAppender;
 import org.apache.tika.gui.TikaGUI;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.MetadataHelper;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -159,17 +159,10 @@
                 InputStream input;
                 File file = new File(arg);
                 if (file.isFile()) {
-                    metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
-                    input = new FileInputStream(file);
+                    input = MetadataHelper.getInputStream(file, metadata);
                 } else {
-                    URL url = new URL(arg);
-                    String path = url.getPath();
-                    int slash = path.lastIndexOf('/');
-                    String name = path.substring(slash + 1);
-                    if (name.length() > 0) {
-                        metadata.set(Metadata.RESOURCE_NAME_KEY, name);
-                    }
-                    input = url.openStream();
+                    input =
+                        MetadataHelper.getInputStream(new URL(arg), metadata);
                 }
                 try {
                     parser.parse(

Modified: 
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java?rev=903470&r1=903469&r2=903470&view=diff
==============================================================================
--- 
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
 (original)
+++ 
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/ParsingTransferHandler.java
 Tue Jan 26 22:54:57 2010
@@ -21,7 +21,7 @@
 import java.awt.datatransfer.Transferable;
 import java.awt.event.InputEvent;
 import java.io.File;
-import java.io.FileInputStream;
+import java.io.InputStream;
 import java.util.List;
 import java.util.StringTokenizer;
 import java.util.ArrayList;
@@ -32,11 +32,19 @@
 import javax.swing.JComponent;
 import javax.swing.TransferHandler;
 
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.MetadataHelper;
+
 /**
  * Utility class that turns drag-and-drop events into Tika parse requests.
  */
 class ParsingTransferHandler extends TransferHandler {
 
+    /**
+     * Serial version UID.
+     */
+    private static final long serialVersionUID = -557932290014044494L;
+
     private final TransferHandler delegate;
 
     private final TikaGUI tika;
@@ -65,20 +73,29 @@
         return false;
     }
 
+    @SuppressWarnings("unchecked")
     public boolean importData(
             JComponent component, Transferable transferable) {
         try {
             List<File> files = null;
             if 
(transferable.isDataFlavorSupported(DataFlavor.javaFileListFlavor)) {
-                files = (List<File>) 
transferable.getTransferData(DataFlavor.javaFileListFlavor);
+                files = (List<File>) transferable.getTransferData(
+                        DataFlavor.javaFileListFlavor);
             } else if (transferable.isDataFlavorSupported(urlListFlavor)) {
-                tika.importStream(new URL((String) 
transferable.getTransferData(urlListFlavor)).openStream());
+                Object data = transferable.getTransferData(urlListFlavor);
+                Metadata metadata = new Metadata();
+                InputStream stream = MetadataHelper.getInputStream(
+                        new URL(data.toString()), metadata);
+                tika.importStream(stream, metadata);
             } else if (transferable.isDataFlavorSupported(uriListFlavor)) {
                 files = uriToFileList((String) 
transferable.getTransferData(uriListFlavor));
             }
 
             for (File file : files) {
-                tika.importStream(new FileInputStream(file));
+                Metadata metadata = new Metadata();
+                InputStream stream =
+                    MetadataHelper.getInputStream(file, metadata);
+                tika.importStream(stream, metadata);
             }
             return true;
         } catch (Exception e) {

Modified: 
lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java?rev=903470&r1=903469&r2=903470&view=diff
==============================================================================
--- lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java 
(original)
+++ lucene/tika/trunk/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java 
Tue Jan 26 22:54:57 2010
@@ -57,6 +57,11 @@
 public class TikaGUI extends JFrame {
 
     /**
+     * Serial version UID.
+     */
+    private static final long serialVersionUID = 5883906936187059495L;
+
+    /**
      * Main method. Sets the Swing look and feel to the operating system
      * settings, and starts the Tika GUI with an {@link AutoDetectParser}
      * instance as the default parser.
@@ -134,7 +139,8 @@
         this.context.set(Parser.class, parser);
     }
 
-   public void importStream(InputStream input) throws IOException {
+   public void importStream(InputStream input, Metadata md)
+           throws IOException {
         try {
             StringWriter htmlBuffer = new StringWriter();
             StringWriter textBuffer = new StringWriter();
@@ -145,7 +151,6 @@
                     getHtmlHandler(htmlBuffer),
                     getTextContentHandler(textBuffer),
                     getXmlContentHandler(xmlBuffer));
-            Metadata md = new Metadata();
 
             input = new ProgressMonitorInputStream(
                     this, "Parsing stream", input);

Modified: lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java?rev=903470&r1=903469&r2=903470&view=diff
==============================================================================
--- lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java 
(original)
+++ lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/Tika.java Tue Jan 
26 22:54:57 2010
@@ -18,7 +18,6 @@
 
 import java.io.BufferedInputStream;
 import java.io.File;
-import java.io.FileInputStream;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
@@ -29,6 +28,7 @@
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.MetadataHelper;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
@@ -139,9 +139,10 @@
      * @throws IOException if the file can not be read
      */
     public String detect(File file) throws FileNotFoundException, IOException {
-        InputStream stream = new FileInputStream(file);
+        Metadata metadata = new Metadata();
+        InputStream stream = MetadataHelper.getInputStream(file, metadata);
         try {
-            return detect(stream, getFileMetadata(file));
+            return detect(stream, metadata);
         } finally {
             stream.close();
         }
@@ -160,9 +161,10 @@
      * @throws IOException if the resource can not be read
      */
     public String detect(URL url) throws IOException {
-        InputStream stream = url.openStream();
+        Metadata metadata = new Metadata();
+        InputStream stream = MetadataHelper.getInputStream(url, metadata);
         try {
-            return detect(stream, getUrlMetadata(url));
+            return detect(stream, metadata);
         } finally {
             stream.close();
         }
@@ -225,7 +227,9 @@
      * @throws IOException if the file can not be read or parsed
      */
     public Reader parse(File file) throws FileNotFoundException, IOException {
-        return parse(new FileInputStream(file), getFileMetadata(file));
+        Metadata metadata = new Metadata();
+        InputStream stream = MetadataHelper.getInputStream(file, metadata);
+        return parse(stream, metadata);
     }
 
     /**
@@ -237,7 +241,9 @@
      * @throws IOException if the resource can not be read or parsed
      */
     public Reader parse(URL url) throws IOException {
-        return parse(url.openStream(), getUrlMetadata(url));
+        Metadata metadata = new Metadata();
+        InputStream stream = MetadataHelper.getInputStream(url, metadata);
+        return parse(stream, metadata);
     }
 
     /**
@@ -291,7 +297,9 @@
      */
     public String parseToString(File file)
             throws FileNotFoundException, IOException, TikaException {
-        return parseToString(new FileInputStream(file), getFileMetadata(file));
+        Metadata metadata = new Metadata();
+        InputStream stream = MetadataHelper.getInputStream(file, metadata);
+        return parseToString(stream, metadata);
     }
 
     /**
@@ -304,23 +312,9 @@
      * @throws TikaException if the resource can not be parsed
      */
     public String parseToString(URL url) throws IOException, TikaException {
-        return parseToString(url.openStream(), getUrlMetadata(url));
-    }
-
-    private static Metadata getFileMetadata(File file) {
         Metadata metadata = new Metadata();
-        metadata.set(Metadata.RESOURCE_NAME_KEY, file.getName());
-        return metadata;
-    }
-
-    private static Metadata getUrlMetadata(URL url) {
-        Metadata metadata = new Metadata();
-        String path = url.getPath();
-        int slash = path.lastIndexOf('/');
-        if (slash + 1 < path.length()) {
-            metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 
1));
-        }
-        return metadata;
+        InputStream stream = MetadataHelper.getInputStream(url, metadata);
+        return parseToString(stream, metadata);
     }
 
 }

Added: 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java?rev=903470&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
 (added)
+++ 
lucene/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/MetadataHelper.java
 Tue Jan 26 22:54:57 2010
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.net.URLConnection;
+
+/**
+ * Collection of static helper methods for handling metadata.
+ *
+ * @since Apache Tika 0.7
+ */
+public class MetadataHelper {
+
+    /**
+     * Private constructor to prevent instantiation.
+     */
+    private MetadataHelper() {
+    }
+
+    /**
+     * Returns the contents of the given file, and sets any related metadata
+     * entries.
+     *
+     * @param file the file to be read
+     * @param metadata where the file metadata is stored
+     * @return file content
+     * @throws FileNotFoundException if the file does not exist
+     */
+    public static InputStream getInputStream(File file, Metadata metadata)
+            throws FileNotFoundException {
+        String name = file.getName();
+        if (name.length() > 0) {
+            metadata.set(Metadata.RESOURCE_NAME_KEY, name);
+        }
+
+        long length = file.length();
+        if (length > 0) {
+            metadata.set(Metadata.CONTENT_LENGTH, Long.toString(length));
+        }
+
+        return new FileInputStream(file);
+    }
+
+    /**
+     * Returns the content at the given URL, and sets the resource name,
+     * content type, content encoding and content length when available.
+     *
+     * @param url the URL of the resource to be read
+     * @param metadata where the resource metadata is stored
+     * @return resource content
+     * @throws IOException if the URL can not be accessed
+     */
+    public static InputStream getInputStream(URL url, Metadata metadata)
+            throws IOException {
+        URLConnection connection = url.openConnection();
+
+        String path = url.getPath();
+        int slash = path.lastIndexOf('/');
+        if (slash + 1 < path.length()) { // works even with -1!
+            metadata.set(Metadata.RESOURCE_NAME_KEY, path.substring(slash + 1));
+        }
+
+        String type = connection.getContentType();
+        if (type != null) {
+            metadata.set(Metadata.CONTENT_TYPE, type);
+        }
+
+        String encoding = connection.getContentEncoding();
+        if (encoding != null) {
+            metadata.set(Metadata.CONTENT_ENCODING, encoding);
+        }
+
+        int length = connection.getContentLength();
+        if (length >= 0) {
+            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
+        }
+
+        return connection.getInputStream();
+    }
+
+}

Added: 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java?rev=903470&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
 (added)
+++ 
lucene/tika/trunk/tika-core/src/test/java/org/apache/tika/metadata/TestMetadataHelper.java
 Tue Jan 26 22:54:57 2010
@@ -0,0 +1,48 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.metadata;
+
+import java.io.File;
+import java.net.URL;
+
+import junit.framework.TestCase;
+
+/**
+ * Unit tests for {@link MetadataHelper}.
+ */
+public class TestMetadataHelper extends TestCase {
+
+    public void testGetInputStream() throws Exception {
+        URL url = TestMetadataHelper.class.getResource("test.txt");
+        File file = new File(url.toURI());
+
+        Metadata urlMetadata = new Metadata();
+        MetadataHelper.getInputStream(url, urlMetadata).close();
+        assertEquals("test.txt", urlMetadata.get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals(
+                Long.toString(file.length()),
+                urlMetadata.get(Metadata.CONTENT_LENGTH));
+
+        Metadata fileMetadata = new Metadata();
+        MetadataHelper.getInputStream(file, fileMetadata).close();
+        assertEquals("test.txt", fileMetadata.get(Metadata.RESOURCE_NAME_KEY));
+        assertEquals(
+                Long.toString(file.length()),
+                fileMetadata.get(Metadata.CONTENT_LENGTH));
+    }
+
+}

Added: 
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
URL: 
http://svn.apache.org/viewvc/lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt?rev=903470&view=auto
==============================================================================
--- 
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
 (added)
+++ 
lucene/tika/trunk/tika-core/src/test/resources/org/apache/tika/metadata/test.txt
 Tue Jan 26 22:54:57 2010
@@ -0,0 +1 @@
+Hello, World!
\ No newline at end of file


Reply via email to