Author: ridabenjelloun
Date: Thu Oct  4 08:14:27 2007
New Revision: 581944

URL: http://svn.apache.org/viewvc?rev=581944&view=rev
Log:
ZIP extraction. Three methods have been added to the ParseUtils class. 
The getParsersFromZip() methods return a list of parsers. Consult the unit test class 
to see how they work.

Added:
    incubator/tika/trunk/src/test/resources/test-documents/test-documents.zip   
(with props)
Modified:
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
    incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
    incubator/tika/trunk/src/main/resources/tika-config.xml
    incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java

Modified: 
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=581944&r1=581943&r2=581944&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java 
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java 
Thu Oct  4 08:14:27 2007
@@ -20,11 +20,13 @@
 import java.io.BufferedInputStream;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
 
-// TIKA imports
 import org.apache.tika.config.ParserConfig;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
@@ -38,163 +40,221 @@
  */
 public class ParseUtils implements TikaMimeKeys {
 
-    /**
-     * Returns a parser that can handle the specified MIME type, and is set to
-     * receive input from a stream opened from the specified URL. NB: Close the
-     * input stream when it is no longer needed!
-     * 
-     * @param config
-     * @param mimeType
-     *            the document's MIME type
-     * @return a parser appropriate to this MIME type
-     * @throws TikaException
-     */
-    public static Parser getParser(TikaConfig config, String mimeType)
-            throws TikaException {
-        ParserConfig pc  = config.getParserConfig(mimeType);
-        Parser parser = ParserFactory.getParser(pc);
-        parser.setMimeType(mimeType);
-        return parser;
-    }
-
-    /**
-     * Returns a parser that can handle the specified MIME type, and is set to
-     * receive input from a stream opened from the specified URL. The MIME type
-     * is determined automatically. NB: Close the input stream when it is no
-     * longer needed!
-     * 
-     * @param documentUrl
-     *            URL pointing to the document to parse
-     * @param config
-     * @return a parser appropriate to this MIME type and ready to read input
-     *         from the specified document
-     * @throws TikaException
-     */
-    public static Parser getParser(URL documentUrl, TikaConfig config)
-            throws TikaException {
-        String mimetype =
-            config.getMimeRepository().getMimeType(documentUrl).getName();
-        return getParser(config, mimetype);
-    }
-
-    /**
-     * Returns a parser that can handle the specified MIME type, and is set to
-     * receive input from a stream opened from the specified URL. NB: Close the
-     * input stream when it is no longer needed!
-     * 
-     * @param documentFile
-     *            File object pointing to the document to parse
-     * @param config
-     * @return a parser appropriate to this MIME type and ready to read input
-     *         from the specified document
-     * @throws TikaException
-     */
-    public static Parser getParser(File documentFile, TikaConfig config)
-            throws TikaException {
-        String mimetype =
-            config.getMimeRepository().getMimeType(documentFile).getName();
-        return getParser(config, mimetype);
-    }
-
-    /**
-     * Gets the string content of a document read from an input stream.
-     * 
-     * @param inputStream
-     *            the stream from which to read document data
-     * @param config
-     * @param mimeType
-     *            MIME type of the data
-     * @return the string content parsed from the document
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static String getStringContent(
-            InputStream inputStream, TikaConfig config, String mimeType)
-            throws TikaException, IOException {
-        ParserConfig pc  = config.getParserConfig(mimeType);
-        Parser parser = ParserFactory.getParser(pc);
-        parser.setMimeType(mimeType);
-        return parser.getContents(inputStream, pc.getContents());
-    }
-
-    /**
-     * Gets the string content of a document read from an input stream.
-     * 
-     * @param documentUrl
-     *            URL pointing to the document to parse
-     * @param config
-     * @return the string content parsed from the document
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static String getStringContent(URL documentUrl, TikaConfig config)
-            throws TikaException, IOException {
-        String mime =
-                config.getMimeRepository().getMimeType(documentUrl).getName();
-        return getStringContent(documentUrl, config, mime);
-    }
-
-    /**
-     * Gets the string content of a document read from an input stream.
-     * 
-     * @param documentUrl
-     *            URL pointing to the document to parse
-     * @param config
-     * @param mimeType
-     *            MIME type of the data
-     * @return the string content parsed from the document
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static String getStringContent(
-            URL documentUrl, TikaConfig config, String mimeType)
-            throws TikaException, IOException {
-        InputStream stream = documentUrl.openStream();
-        try {
-            return getStringContent(stream, config, mimeType);
-        } finally {
-            stream.close();
-        }
-    }
-
-    /**
-     * Gets the string content of a document read from an input stream.
-     * 
-     * @param documentFile
-     *            File object pointing to the document to parse
-     * @param config
-     * @param mimeType
-     *            MIME type of the data
-     * @return the string content parsed from the document
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static String getStringContent(
-            File documentFile, TikaConfig config, String mimeType)
-            throws TikaException, IOException {
-        InputStream stream = new BufferedInputStream(new 
FileInputStream(documentFile));
-        try {
-            return getStringContent(stream, config, mimeType);
-        } finally {
-            stream.close();
-        }
-    }
-
-    /**
-     * Gets the string content of a document read from an input stream.
-     * 
-     * @param documentFile
-     *            File object pointing to the document to parse
-     * @param config
-     * @return the string content parsed from the document
-     * @throws TikaException
-     * @throws IOException
-     */
-    public static String getStringContent(File documentFile, TikaConfig config)
-            throws TikaException, IOException {
-        String mime =
-            config.getMimeRepository().getMimeType(documentFile).getName();
-        return getStringContent(documentFile, config, mime);
-    }
+       /**
+        * Returns a parser that can handle the specified MIME type, and is set 
to
+        * receive input from a stream opened from the specified URL. NB: Close 
the
+        * input stream when it is no longer needed!
+        * 
+        * @param config
+        * @param mimeType
+        *            the document's MIME type
+        * @return a parser appropriate to this MIME type
+        * @throws TikaException
+        */
+       public static Parser getParser(TikaConfig config, String mimeType)
+                       throws TikaException {
+               ParserConfig pc = config.getParserConfig(mimeType);
+               Parser parser = ParserFactory.getParser(pc);
+               parser.setMimeType(mimeType);
+               return parser;
+       }
+
+       /**
+        * Returns a parser that can handle the specified MIME type, and is set 
to
+        * receive input from a stream opened from the specified URL. The MIME 
type
+        * is determined automatically. NB: Close the input stream when it is no
+        * longer needed!
+        * 
+        * @param documentUrl
+        *            URL pointing to the document to parse
+        * @param config
+        * @return a parser appropriate to this MIME type and ready to read 
input
+        *         from the specified document
+        * @throws TikaException
+        */
+       public static Parser getParser(URL documentUrl, TikaConfig config)
+                       throws TikaException {
+               String mimetype = 
config.getMimeRepository().getMimeType(documentUrl)
+                               .getName();
+               return getParser(config, mimetype);
+       }
+
+       /**
+        * Returns a parser that can handle the specified MIME type, and is set 
to
+        * receive input from a stream opened from the specified URL. NB: Close 
the
+        * input stream when it is no longer needed!
+        * 
+        * @param documentFile
+        *            File object pointing to the document to parse
+        * @param config
+        * @return a parser appropriate to this MIME type and ready to read 
input
+        *         from the specified document
+        * @throws TikaException
+        */
+       public static Parser getParser(File documentFile, TikaConfig config)
+                       throws TikaException {
+               String mimetype = 
config.getMimeRepository().getMimeType(documentFile)
+                               .getName();
+               return getParser(config, mimetype);
+       }
+
+       /**
+        * Returns a list of parsers from zip InputStream
+        * 
+        * @param zip
+        *            InputStream
+        * @param config
+        * @return a list of parsers from zip file
+        * @throws TikaException
+        */
+       private static List<Parser> getParsersFromZip(InputStream zipIs,
+                       TikaConfig config) throws TikaException {
+               List<Parser> parsers = new ArrayList<Parser>();
+               List<File> zipFiles = Utils.unzip(zipIs);
+               for (int i = 0; i < zipFiles.size(); i++) {
+                       File zipEntry = zipFiles.get(i);
+                       parsers.add(getParser(zipEntry, config));
+               }
+               return parsers;
+       }
+
+       /**
+        * Returns a list of parsers from zip File
+        * 
+        * @param zip
+        *            File
+        * @param config
+        * @return a list of parsers from zip file
+        * @throws TikaException
+        * @throws FileNotFoundException
+        */
+       public static List<Parser> getParsersFromZip(File zip, TikaConfig 
config)
+                       throws TikaException, FileNotFoundException {
+               String zipMimeType = config.getMimeRepository().getMimeType(zip)
+                               .getName();
+               if (!zipMimeType.equalsIgnoreCase("application/zip")) {
+                       throw new TikaException("The file you are using is note 
a zip file");
+               }
+               return getParsersFromZip(new FileInputStream(zip), config);
+       }
+
+       /**
+        * Returns a list of parsers from URL
+        * 
+        * @param URL
+        * @param config
+        * @return a list of parsers from zip file
+        * @throws TikaException
+        * @throws IOException
+        */
+       public static List<Parser> getParsersFromZip(URL zip, TikaConfig config)
+                       throws TikaException, IOException {
+               String zipMimeType = config.getMimeRepository().getMimeType(zip)
+                               .getName();
+               if (!zipMimeType.equalsIgnoreCase("application/zip")) {
+                       throw new TikaException("The file you are using is note 
a zip file");
+               }
+               return getParsersFromZip(zip.openStream(), config);
+       }
+
+       /**
+        * Gets the string content of a document read from an input stream.
+        * 
+        * @param inputStream
+        *            the stream from which to read document data
+        * @param config
+        * @param mimeType
+        *            MIME type of the data
+        * @return the string content parsed from the document
+        * @throws TikaException
+        * @throws IOException
+        */
+       public static String getStringContent(InputStream inputStream,
+                       TikaConfig config, String mimeType) throws 
TikaException,
+                       IOException {
+               ParserConfig pc = config.getParserConfig(mimeType);
+               Parser parser = ParserFactory.getParser(pc);
+               parser.setMimeType(mimeType);
+               return parser.getContents(inputStream, pc.getContents());
+       }
+
+       /**
+        * Gets the string content of a document read from an input stream.
+        * 
+        * @param documentUrl
+        *            URL pointing to the document to parse
+        * @param config
+        * @return the string content parsed from the document
+        * @throws TikaException
+        * @throws IOException
+        */
+       public static String getStringContent(URL documentUrl, TikaConfig 
config)
+                       throws TikaException, IOException {
+               String mime = 
config.getMimeRepository().getMimeType(documentUrl)
+                               .getName();
+               return getStringContent(documentUrl, config, mime);
+       }
+
+       /**
+        * Gets the string content of a document read from an input stream.
+        * 
+        * @param documentUrl
+        *            URL pointing to the document to parse
+        * @param config
+        * @param mimeType
+        *            MIME type of the data
+        * @return the string content parsed from the document
+        * @throws TikaException
+        * @throws IOException
+        */
+       public static String getStringContent(URL documentUrl, TikaConfig 
config,
+                       String mimeType) throws TikaException, IOException {
+               InputStream stream = documentUrl.openStream();
+               try {
+                       return getStringContent(stream, config, mimeType);
+               } finally {
+                       stream.close();
+               }
+       }
+
+       /**
+        * Gets the string content of a document read from an input stream.
+        * 
+        * @param documentFile
+        *            File object pointing to the document to parse
+        * @param config
+        * @param mimeType
+        *            MIME type of the data
+        * @return the string content parsed from the document
+        * @throws TikaException
+        * @throws IOException
+        */
+       public static String getStringContent(File documentFile, TikaConfig 
config,
+                       String mimeType) throws TikaException, IOException {
+               InputStream stream = new BufferedInputStream(new 
FileInputStream(
+                               documentFile));
+               try {
+                       return getStringContent(stream, config, mimeType);
+               } finally {
+                       stream.close();
+               }
+       }
+
+       /**
+        * Gets the string content of a document read from an input stream.
+        * 
+        * @param documentFile
+        *            File object pointing to the document to parse
+        * @param config
+        * @return the string content parsed from the document
+        * @throws TikaException
+        * @throws IOException
+        */
+       public static String getStringContent(File documentFile, TikaConfig 
config)
+                       throws TikaException, IOException {
+               String mime = 
config.getMimeRepository().getMimeType(documentFile)
+                               .getName();
+               return getStringContent(documentFile, config, mime);
+       }
 
 }

Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=581944&r1=581943&r2=581944&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java 
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Thu Oct 
 4 08:14:27 2007
@@ -29,8 +29,6 @@
 import java.io.StringWriter;
 import java.io.Writer;
 import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Iterator;
 import java.util.List;
 import java.util.Map;
 import java.util.zip.ZipEntry;
@@ -120,6 +118,7 @@
                                saveInputStreamInFile(isEntry, new 
BufferedOutputStream(
                                                new FileOutputStream(file)));
                                res.add(file);
+                               isEntry.close();
                        }
                        in.close();
                } catch (IOException e) {

Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=581944&r1=581943&r2=581944&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Oct  4 08:14:27 
2007
@@ -7,6 +7,7 @@
                 <namespace>http://purl.org/dc/elements/1.1/</namespace>
                 <mime>application/xml</mime>
                 <extract>
+                       <content name="fullText" xpathSelect="//*"/>
                     <content name="title" xpathSelect="//dc:title"/>
                     <content name="subject" xpathSelect="//dc:subject"/>
                     <content name="creator" xpathSelect="//dc:creator"/>

Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=581944&r1=581943&r2=581944&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java 
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Thu Oct 
 4 08:14:27 2007
@@ -20,10 +20,12 @@
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.List;
 import java.util.Map;
 
 import junit.framework.TestCase;
 
+import org.apache.log4j.Logger;
 import org.apache.tika.config.Content;
 import org.apache.tika.config.ParserConfig;
 import org.apache.tika.config.TikaConfig;
@@ -40,6 +42,7 @@
 public class TestParsers extends TestCase {
 
        private TikaConfig tc;
+
        private File testFilesBaseDir;
 
        public void setUp() throws JDOMException, IOException {
@@ -61,16 +64,13 @@
                 */
 
                // FIXME for now, fix filenames according to Maven testing 
layout
-        
-  // The file below should be the default configuration for the test of
-  // getDefaultConfig() to be legitimate.
-  final String tikaConfigFilename
-                = "target/classes/org/apache/tika/tika-config.xml";
+               // The file below should be the default configuration for the 
test of
+               // getDefaultConfig() to be legitimate.
+               final String tikaConfigFilename = 
"target/classes/org/apache/tika/tika-config.xml";
 
-  final String log4jPropertiesFilename
-                = "target/classes/log4j/log4j.properties";
+               final String log4jPropertiesFilename = 
"target/classes/log4j/log4j.properties";
 
-        testFilesBaseDir = new File("src/test/resources/test-documents");
+               testFilesBaseDir = new 
File("src/test/resources/test-documents");
 
                tc = new TikaConfig(tikaConfigFilename);
 
@@ -81,10 +81,10 @@
                File file = getTestFile("testPDF.pdf");
                String s1 = ParseUtils.getStringContent(file, tc);
                String s2 = ParseUtils.getStringContent(file, tc, 
"application/pdf");
-        String s3 = ParseUtils.getStringContent(file,
-                TikaConfig.getDefaultConfig());
-        assertEquals(s1, s2);
-        assertEquals(s1, s3);
+               String s3 = ParseUtils.getStringContent(file, TikaConfig
+                               .getDefaultConfig());
+               assertEquals(s1, s2);
+               assertEquals(s1, s3);
        }
 
        public void testTXTExtraction() throws Exception {
@@ -149,10 +149,10 @@
 
        public void testEXCELExtraction() throws Exception {
                final String expected = "Numbers and their Squares Number 
Square 1.0 "
-                + "1.0 2.0 4.0 3.0 9.0 4.0 16.0 5.0 25.0 6.0 36.0 7.0 49.0 8.0 
"
-                + "64.0 9.0 81.0 10.0 100.0 11.0 121.0 12.0 144.0 13.0 169.0 "
-                + "14.0 196.0 15.0 225.0 Written and saved in Microsoft Excel "
-                + "X for Mac Service Release 1.";
+                               + "1.0 2.0 4.0 3.0 9.0 4.0 16.0 5.0 25.0 6.0 
36.0 7.0 49.0 8.0 "
+                               + "64.0 9.0 81.0 10.0 100.0 11.0 121.0 12.0 
144.0 13.0 169.0 "
+                               + "14.0 196.0 15.0 225.0 Written and saved in 
Microsoft Excel "
+                               + "X for Mac Service Release 1.";
                File file = getTestFile("testEXCEL.xls");
                String s1 = ParseUtils.getStringContent(file, tc);
                String s2 = ParseUtils.getStringContent(file, tc,
@@ -210,6 +210,35 @@
                final String text = Utils.toString(contents);
                final String expected = "Test Indexation Html";
                assertTrue("text contains '" + expected + "'", 
text.contains(expected));
+       }
+
+       public void testZipExtraction() throws Exception {
+               File zip = getTestFile("test-documents.zip");
+               List<Parser> parsers = ParseUtils.getParsersFromZip(zip, tc);
+               List<File> zipFiles = Utils.unzip(new FileInputStream(zip));
+               for (int i = 0; i < parsers.size(); i++) {
+                       Parser zipEntryParser = parsers.get(i);
+                       assertNotNull(zipEntryParser);
+                       assertNotNull(zipEntryParser.getMimeType());
+                       for (int j = 0; j < zipFiles.size(); j++) {
+                               if 
(zipEntryParser.getMimeType().equalsIgnoreCase(
+                                               
tc.getMimeRepository().getMimeType(zipFiles.get(j))
+                                                               .getName())) {
+                                       ParserConfig config = 
tc.getParserConfig(zipEntryParser
+                                                       .getMimeType());
+                                       Map<String, Content> contents = 
config.getContents();
+                                       assertNotNull(contents);
+                                       InputStream stream = new 
FileInputStream(zipFiles.get(j));
+                                       try {
+                                               
zipEntryParser.getContents(stream, contents);
+                                               
assertNotNull(contents.get("fullText"));
+                                       } finally {
+                                               stream.close();
+                                       }
+                               }
+                       }
+
+               }
        }
 
        private File getTestFile(String filename) {

Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.zip
URL: 
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.zip?rev=581944&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
incubator/tika/trunk/src/test/resources/test-documents/test-documents.zip
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to