Author: ridabenjelloun
Date: Thu Oct 4 08:14:27 2007
New Revision: 581944
URL: http://svn.apache.org/viewvc?rev=581944&view=rev
Log:
ZIP extraction. Three methods have been added to the ParseUtils class.
The getParsersFromZip() methods return a list of parsers. Consult the unit test
class to see how it works.
Added:
incubator/tika/trunk/src/test/resources/test-documents/test-documents.zip
(with props)
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
incubator/tika/trunk/src/main/resources/tika-config.xml
incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
Modified:
incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java?rev=581944&r1=581943&r2=581944&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/ParseUtils.java
Thu Oct 4 08:14:27 2007
@@ -20,11 +20,13 @@
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileInputStream;
+import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
+import java.util.ArrayList;
+import java.util.List;
-// TIKA imports
import org.apache.tika.config.ParserConfig;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
@@ -38,163 +40,221 @@
*/
public class ParseUtils implements TikaMimeKeys {
- /**
- * Returns a parser that can handle the specified MIME type, and is set to
- * receive input from a stream opened from the specified URL. NB: Close the
- * input stream when it is no longer needed!
- *
- * @param config
- * @param mimeType
- * the document's MIME type
- * @return a parser appropriate to this MIME type
- * @throws TikaException
- */
- public static Parser getParser(TikaConfig config, String mimeType)
- throws TikaException {
- ParserConfig pc = config.getParserConfig(mimeType);
- Parser parser = ParserFactory.getParser(pc);
- parser.setMimeType(mimeType);
- return parser;
- }
-
- /**
- * Returns a parser that can handle the specified MIME type, and is set to
- * receive input from a stream opened from the specified URL. The MIME type
- * is determined automatically. NB: Close the input stream when it is no
- * longer needed!
- *
- * @param documentUrl
- * URL pointing to the document to parse
- * @param config
- * @return a parser appropriate to this MIME type and ready to read input
- * from the specified document
- * @throws TikaException
- */
- public static Parser getParser(URL documentUrl, TikaConfig config)
- throws TikaException {
- String mimetype =
- config.getMimeRepository().getMimeType(documentUrl).getName();
- return getParser(config, mimetype);
- }
-
- /**
- * Returns a parser that can handle the specified MIME type, and is set to
- * receive input from a stream opened from the specified URL. NB: Close the
- * input stream when it is no longer needed!
- *
- * @param documentFile
- * File object pointing to the document to parse
- * @param config
- * @return a parser appropriate to this MIME type and ready to read input
- * from the specified document
- * @throws TikaException
- */
- public static Parser getParser(File documentFile, TikaConfig config)
- throws TikaException {
- String mimetype =
- config.getMimeRepository().getMimeType(documentFile).getName();
- return getParser(config, mimetype);
- }
-
- /**
- * Gets the string content of a document read from an input stream.
- *
- * @param inputStream
- * the stream from which to read document data
- * @param config
- * @param mimeType
- * MIME type of the data
- * @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
- */
- public static String getStringContent(
- InputStream inputStream, TikaConfig config, String mimeType)
- throws TikaException, IOException {
- ParserConfig pc = config.getParserConfig(mimeType);
- Parser parser = ParserFactory.getParser(pc);
- parser.setMimeType(mimeType);
- return parser.getContents(inputStream, pc.getContents());
- }
-
- /**
- * Gets the string content of a document read from an input stream.
- *
- * @param documentUrl
- * URL pointing to the document to parse
- * @param config
- * @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
- */
- public static String getStringContent(URL documentUrl, TikaConfig config)
- throws TikaException, IOException {
- String mime =
- config.getMimeRepository().getMimeType(documentUrl).getName();
- return getStringContent(documentUrl, config, mime);
- }
-
- /**
- * Gets the string content of a document read from an input stream.
- *
- * @param documentUrl
- * URL pointing to the document to parse
- * @param config
- * @param mimeType
- * MIME type of the data
- * @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
- */
- public static String getStringContent(
- URL documentUrl, TikaConfig config, String mimeType)
- throws TikaException, IOException {
- InputStream stream = documentUrl.openStream();
- try {
- return getStringContent(stream, config, mimeType);
- } finally {
- stream.close();
- }
- }
-
- /**
- * Gets the string content of a document read from an input stream.
- *
- * @param documentFile
- * File object pointing to the document to parse
- * @param config
- * @param mimeType
- * MIME type of the data
- * @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
- */
- public static String getStringContent(
- File documentFile, TikaConfig config, String mimeType)
- throws TikaException, IOException {
- InputStream stream = new BufferedInputStream(new
FileInputStream(documentFile));
- try {
- return getStringContent(stream, config, mimeType);
- } finally {
- stream.close();
- }
- }
-
- /**
- * Gets the string content of a document read from an input stream.
- *
- * @param documentFile
- * File object pointing to the document to parse
- * @param config
- * @return the string content parsed from the document
- * @throws TikaException
- * @throws IOException
- */
- public static String getStringContent(File documentFile, TikaConfig config)
- throws TikaException, IOException {
- String mime =
- config.getMimeRepository().getMimeType(documentFile).getName();
- return getStringContent(documentFile, config, mime);
- }
+ /**
+ * Returns a parser that can handle the specified MIME type, and is set
to
+ * receive input from a stream opened from the specified URL. NB: Close
the
+ * input stream when it is no longer needed!
+ *
+ * @param config
+ * @param mimeType
+ * the document's MIME type
+ * @return a parser appropriate to this MIME type
+ * @throws TikaException
+ */
+ public static Parser getParser(TikaConfig config, String mimeType)
+ throws TikaException {
+ ParserConfig pc = config.getParserConfig(mimeType);
+ Parser parser = ParserFactory.getParser(pc);
+ parser.setMimeType(mimeType);
+ return parser;
+ }
+
+ /**
+ * Returns a parser that can handle the specified MIME type, and is set
to
+ * receive input from a stream opened from the specified URL. The MIME
type
+ * is determined automatically. NB: Close the input stream when it is no
+ * longer needed!
+ *
+ * @param documentUrl
+ * URL pointing to the document to parse
+ * @param config
+ * @return a parser appropriate to this MIME type and ready to read
input
+ * from the specified document
+ * @throws TikaException
+ */
+ public static Parser getParser(URL documentUrl, TikaConfig config)
+ throws TikaException {
+ String mimetype =
config.getMimeRepository().getMimeType(documentUrl)
+ .getName();
+ return getParser(config, mimetype);
+ }
+
+ /**
+ * Returns a parser that can handle the specified MIME type, and is set
to
+ * receive input from a stream opened from the specified URL. NB: Close
the
+ * input stream when it is no longer needed!
+ *
+ * @param documentFile
+ * File object pointing to the document to parse
+ * @param config
+ * @return a parser appropriate to this MIME type and ready to read
input
+ * from the specified document
+ * @throws TikaException
+ */
+ public static Parser getParser(File documentFile, TikaConfig config)
+ throws TikaException {
+ String mimetype =
config.getMimeRepository().getMimeType(documentFile)
+ .getName();
+ return getParser(config, mimetype);
+ }
+
+ /**
+ * Returns a list of parsers from zip InputStream
+ *
+ * @param zip
+ * InputStream
+ * @param config
+ * @return a list of parsers from zip file
+ * @throws TikaException
+ */
+ private static List<Parser> getParsersFromZip(InputStream zipIs,
+ TikaConfig config) throws TikaException {
+ List<Parser> parsers = new ArrayList<Parser>();
+ List<File> zipFiles = Utils.unzip(zipIs);
+ for (int i = 0; i < zipFiles.size(); i++) {
+ File zipEntry = zipFiles.get(i);
+ parsers.add(getParser(zipEntry, config));
+ }
+ return parsers;
+ }
+
+ /**
+ * Returns a list of parsers from zip File
+ *
+ * @param zip
+ * File
+ * @param config
+ * @return a list of parsers from zip file
+ * @throws TikaException
+ * @throws FileNotFoundException
+ */
+ public static List<Parser> getParsersFromZip(File zip, TikaConfig
config)
+ throws TikaException, FileNotFoundException {
+ String zipMimeType = config.getMimeRepository().getMimeType(zip)
+ .getName();
+ if (!zipMimeType.equalsIgnoreCase("application/zip")) {
+ throw new TikaException("The file you are using is note
a zip file");
+ }
+ return getParsersFromZip(new FileInputStream(zip), config);
+ }
+
+ /**
+ * Returns a list of parsers from URL
+ *
+ * @param URL
+ * @param config
+ * @return a list of parsers from zip file
+ * @throws TikaException
+ * @throws IOException
+ */
+ public static List<Parser> getParsersFromZip(URL zip, TikaConfig config)
+ throws TikaException, IOException {
+ String zipMimeType = config.getMimeRepository().getMimeType(zip)
+ .getName();
+ if (!zipMimeType.equalsIgnoreCase("application/zip")) {
+ throw new TikaException("The file you are using is note
a zip file");
+ }
+ return getParsersFromZip(zip.openStream(), config);
+ }
+
+ /**
+ * Gets the string content of a document read from an input stream.
+ *
+ * @param inputStream
+ * the stream from which to read document data
+ * @param config
+ * @param mimeType
+ * MIME type of the data
+ * @return the string content parsed from the document
+ * @throws TikaException
+ * @throws IOException
+ */
+ public static String getStringContent(InputStream inputStream,
+ TikaConfig config, String mimeType) throws
TikaException,
+ IOException {
+ ParserConfig pc = config.getParserConfig(mimeType);
+ Parser parser = ParserFactory.getParser(pc);
+ parser.setMimeType(mimeType);
+ return parser.getContents(inputStream, pc.getContents());
+ }
+
+ /**
+ * Gets the string content of a document read from an input stream.
+ *
+ * @param documentUrl
+ * URL pointing to the document to parse
+ * @param config
+ * @return the string content parsed from the document
+ * @throws TikaException
+ * @throws IOException
+ */
+ public static String getStringContent(URL documentUrl, TikaConfig
config)
+ throws TikaException, IOException {
+ String mime =
config.getMimeRepository().getMimeType(documentUrl)
+ .getName();
+ return getStringContent(documentUrl, config, mime);
+ }
+
+ /**
+ * Gets the string content of a document read from an input stream.
+ *
+ * @param documentUrl
+ * URL pointing to the document to parse
+ * @param config
+ * @param mimeType
+ * MIME type of the data
+ * @return the string content parsed from the document
+ * @throws TikaException
+ * @throws IOException
+ */
+ public static String getStringContent(URL documentUrl, TikaConfig
config,
+ String mimeType) throws TikaException, IOException {
+ InputStream stream = documentUrl.openStream();
+ try {
+ return getStringContent(stream, config, mimeType);
+ } finally {
+ stream.close();
+ }
+ }
+
+ /**
+ * Gets the string content of a document read from an input stream.
+ *
+ * @param documentFile
+ * File object pointing to the document to parse
+ * @param config
+ * @param mimeType
+ * MIME type of the data
+ * @return the string content parsed from the document
+ * @throws TikaException
+ * @throws IOException
+ */
+ public static String getStringContent(File documentFile, TikaConfig
config,
+ String mimeType) throws TikaException, IOException {
+ InputStream stream = new BufferedInputStream(new
FileInputStream(
+ documentFile));
+ try {
+ return getStringContent(stream, config, mimeType);
+ } finally {
+ stream.close();
+ }
+ }
+
+ /**
+ * Gets the string content of a document read from an input stream.
+ *
+ * @param documentFile
+ * File object pointing to the document to parse
+ * @param config
+ * @return the string content parsed from the document
+ * @throws TikaException
+ * @throws IOException
+ */
+ public static String getStringContent(File documentFile, TikaConfig
config)
+ throws TikaException, IOException {
+ String mime =
config.getMimeRepository().getMimeType(documentFile)
+ .getName();
+ return getStringContent(documentFile, config, mime);
+ }
}
Modified: incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java?rev=581944&r1=581943&r2=581944&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java
(original)
+++ incubator/tika/trunk/src/main/java/org/apache/tika/utils/Utils.java Thu Oct
4 08:14:27 2007
@@ -29,8 +29,6 @@
import java.io.StringWriter;
import java.io.Writer;
import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipEntry;
@@ -120,6 +118,7 @@
saveInputStreamInFile(isEntry, new
BufferedOutputStream(
new FileOutputStream(file)));
res.add(file);
+ isEntry.close();
}
in.close();
} catch (IOException e) {
Modified: incubator/tika/trunk/src/main/resources/tika-config.xml
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/main/resources/tika-config.xml?rev=581944&r1=581943&r2=581944&view=diff
==============================================================================
--- incubator/tika/trunk/src/main/resources/tika-config.xml (original)
+++ incubator/tika/trunk/src/main/resources/tika-config.xml Thu Oct 4 08:14:27
2007
@@ -7,6 +7,7 @@
<namespace>http://purl.org/dc/elements/1.1/</namespace>
<mime>application/xml</mime>
<extract>
+ <content name="fullText" xpathSelect="//*"/>
<content name="title" xpathSelect="//dc:title"/>
<content name="subject" xpathSelect="//dc:subject"/>
<content name="creator" xpathSelect="//dc:creator"/>
Modified: incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java?rev=581944&r1=581943&r2=581944&view=diff
==============================================================================
--- incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java
(original)
+++ incubator/tika/trunk/src/test/java/org/apache/tika/TestParsers.java Thu Oct
4 08:14:27 2007
@@ -20,10 +20,12 @@
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import java.util.List;
import java.util.Map;
import junit.framework.TestCase;
+import org.apache.log4j.Logger;
import org.apache.tika.config.Content;
import org.apache.tika.config.ParserConfig;
import org.apache.tika.config.TikaConfig;
@@ -40,6 +42,7 @@
public class TestParsers extends TestCase {
private TikaConfig tc;
+
private File testFilesBaseDir;
public void setUp() throws JDOMException, IOException {
@@ -61,16 +64,13 @@
*/
// FIXME for now, fix filenames according to Maven testing
layout
-
- // The file below should be the default configuration for the test of
- // getDefaultConfig() to be legitimate.
- final String tikaConfigFilename
- = "target/classes/org/apache/tika/tika-config.xml";
+ // The file below should be the default configuration for the
test of
+ // getDefaultConfig() to be legitimate.
+ final String tikaConfigFilename =
"target/classes/org/apache/tika/tika-config.xml";
- final String log4jPropertiesFilename
- = "target/classes/log4j/log4j.properties";
+ final String log4jPropertiesFilename =
"target/classes/log4j/log4j.properties";
- testFilesBaseDir = new File("src/test/resources/test-documents");
+ testFilesBaseDir = new
File("src/test/resources/test-documents");
tc = new TikaConfig(tikaConfigFilename);
@@ -81,10 +81,10 @@
File file = getTestFile("testPDF.pdf");
String s1 = ParseUtils.getStringContent(file, tc);
String s2 = ParseUtils.getStringContent(file, tc,
"application/pdf");
- String s3 = ParseUtils.getStringContent(file,
- TikaConfig.getDefaultConfig());
- assertEquals(s1, s2);
- assertEquals(s1, s3);
+ String s3 = ParseUtils.getStringContent(file, TikaConfig
+ .getDefaultConfig());
+ assertEquals(s1, s2);
+ assertEquals(s1, s3);
}
public void testTXTExtraction() throws Exception {
@@ -149,10 +149,10 @@
public void testEXCELExtraction() throws Exception {
final String expected = "Numbers and their Squares Number
Square 1.0 "
- + "1.0 2.0 4.0 3.0 9.0 4.0 16.0 5.0 25.0 6.0 36.0 7.0 49.0 8.0
"
- + "64.0 9.0 81.0 10.0 100.0 11.0 121.0 12.0 144.0 13.0 169.0 "
- + "14.0 196.0 15.0 225.0 Written and saved in Microsoft Excel "
- + "X for Mac Service Release 1.";
+ + "1.0 2.0 4.0 3.0 9.0 4.0 16.0 5.0 25.0 6.0
36.0 7.0 49.0 8.0 "
+ + "64.0 9.0 81.0 10.0 100.0 11.0 121.0 12.0
144.0 13.0 169.0 "
+ + "14.0 196.0 15.0 225.0 Written and saved in
Microsoft Excel "
+ + "X for Mac Service Release 1.";
File file = getTestFile("testEXCEL.xls");
String s1 = ParseUtils.getStringContent(file, tc);
String s2 = ParseUtils.getStringContent(file, tc,
@@ -210,6 +210,35 @@
final String text = Utils.toString(contents);
final String expected = "Test Indexation Html";
assertTrue("text contains '" + expected + "'",
text.contains(expected));
+ }
+
+ public void testZipExtraction() throws Exception {
+ File zip = getTestFile("test-documents.zip");
+ List<Parser> parsers = ParseUtils.getParsersFromZip(zip, tc);
+ List<File> zipFiles = Utils.unzip(new FileInputStream(zip));
+ for (int i = 0; i < parsers.size(); i++) {
+ Parser zipEntryParser = parsers.get(i);
+ assertNotNull(zipEntryParser);
+ assertNotNull(zipEntryParser.getMimeType());
+ for (int j = 0; j < zipFiles.size(); j++) {
+ if
(zipEntryParser.getMimeType().equalsIgnoreCase(
+
tc.getMimeRepository().getMimeType(zipFiles.get(j))
+ .getName())) {
+ ParserConfig config =
tc.getParserConfig(zipEntryParser
+ .getMimeType());
+ Map<String, Content> contents =
config.getContents();
+ assertNotNull(contents);
+ InputStream stream = new
FileInputStream(zipFiles.get(j));
+ try {
+
zipEntryParser.getContents(stream, contents);
+
assertNotNull(contents.get("fullText"));
+ } finally {
+ stream.close();
+ }
+ }
+ }
+
+ }
}
private File getTestFile(String filename) {
Added: incubator/tika/trunk/src/test/resources/test-documents/test-documents.zip
URL:
http://svn.apache.org/viewvc/incubator/tika/trunk/src/test/resources/test-documents/test-documents.zip?rev=581944&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
incubator/tika/trunk/src/test/resources/test-documents/test-documents.zip
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream