Author: nick
Date: Mon Dec 22 05:57:07 2014
New Revision: 1647243
URL: http://svn.apache.org/r1647243
Log:
TIKA-1490 Use the Old Excel parser for older OLE2 based formats too, like Excel
5 and 95
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1647243&r1=1647242&r2=1647243&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Mon Dec 22 05:57:07 2014
@@ -63,6 +63,9 @@ Release 1.7 - Current Development
* Created a new Tesseract OCR Parser to extract text from images.
Requires installation of Tesseract before use (TIKA-93).
+ * Basic parser for older Excel formats, such as Excel 4, 5 and 95,
+ which can get simple text, and metadata for Excel 5+95 (TIKA-1490)
+
Release 1.6 - 08/31/2014
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1647243&r1=1647242&r2=1647243&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Mon Dec 22 05:57:07 2014
@@ -34,6 +34,7 @@ import org.apache.poi.hssf.eventusermode
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
+import org.apache.poi.hssf.extractor.OldExcelExtractor;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
@@ -150,7 +151,8 @@ public class ExcelExtractor extends Abst
// Excel 5 / Excel 95 file
// Records are in a different structure so needs a
// different parser to process them
- // TODO Call one, see TIKA-1490
+ OldExcelExtractor extractor = new OldExcelExtractor(root);
+ OldExcelParser.parse(extractor, xhtml);
return;
} else {
// Corrupt file / very old file, just skip text extraction
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java?rev=1647243&r1=1647242&r2=1647243&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OldExcelParser.java Mon Dec 22 05:57:07 2014
@@ -72,17 +72,16 @@ public class OldExcelParser extends Abst
// TODO Get the version and type, to set as the Content Type
// Have the text extracted and given to our Content Handler
- parse(extractor, handler, metadata);
+ XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
+ parse(extractor, xhtml);
}
- protected static void parse(OldExcelExtractor extractor, ContentHandler handler,
- Metadata metadata) throws TikaException, IOException, SAXException {
+ protected static void parse(OldExcelExtractor extractor,
+ XHTMLContentHandler xhtml) throws TikaException, IOException, SAXException {
// Get the whole text, as a single string
String text = extractor.getText();
// Split and output
- XHTMLContentHandler xhtml =
- new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
String line;
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1647243&r1=1647242&r2=1647243&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Mon Dec 22 05:57:07 2014
@@ -16,11 +16,11 @@
*/
package org.apache.tika.parser.microsoft;
+import static org.apache.tika.TikaTest.assertContains;
+import static org.apache.tika.TikaTest.assertNotContained;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import static org.junit.Assert.fail;
-import static org.apache.tika.TikaTest.assertContains;
-import static org.apache.tika.TikaTest.assertNotContained;
import java.io.InputStream;
import java.util.Locale;
@@ -29,6 +29,7 @@ import org.apache.tika.detect.DefaultDet
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLExtended;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
@@ -41,8 +42,8 @@ import org.junit.Test;
import org.xml.sax.ContentHandler;
public class ExcelParserTest {
-
@Test
+ @SuppressWarnings("deprecation") // Checks legacy Tika-1.0 style metadata keys
public void testExcelParser() throws Exception {
InputStream input = ExcelParserTest.class.getResourceAsStream(
"/test-documents/testEXCEL.xls");
@@ -330,48 +331,97 @@ public class ExcelParserTest {
}
/**
- * We don't currently support the old Excel 95 .xls file format,
- * but we shouldn't break on these files either (TIKA-976)
+ * Excel 5 and 95 are older formats, and only get basic support
*/
@Test
public void testExcel95() throws Exception {
Detector detector = new DefaultDetector();
AutoDetectParser parser = new AutoDetectParser();
+ InputStream input;
+ MediaType type;
+ Metadata m;
- InputStream input = ExcelParserTest.class.getResourceAsStream(
- "/test-documents/testEXCEL_95.xls");
- Metadata m = new Metadata();
- m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
-
- // Should be detected correctly
- MediaType type = null;
+ // First try detection of Excel 5
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_5.xls");
+ input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls");
try {
- type = detector.detect(input, m);
- assertEquals("application/vnd.ms-excel", type.toString());
- } finally {
- input.close();
- }
-
- // OfficeParser will claim to handle it
- assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
-
- // OOXMLParser won't handle it
- assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
+ } finally {
+ input.close();
+ }
- // AutoDetectParser doesn't break on it
+ // Now Excel 95
+ m = new Metadata();
+ m.add(Metadata.RESOURCE_NAME_KEY, "excel_95.xls");
input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");
-
try {
- ContentHandler handler = new BodyContentHandler(-1);
- ParseContext context = new ParseContext();
- context.set(Locale.class, Locale.US);
- parser.parse(input, handler, m, context);
+ type = detector.detect(input, m);
+ assertEquals("application/vnd.ms-excel", type.toString());
+ } finally {
+ input.close();
+ }
- String content = handler.toString();
- assertEquals("", content);
- } finally {
- input.close();
- }
+ // OfficeParser can handle it
+ assertEquals(true, (new OfficeParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+ // OOXMLParser won't handle it
+ assertEquals(false, (new OOXMLParser()).getSupportedTypes(new ParseContext()).contains(type));
+
+
+ // Parse the Excel 5 file
+ m = new Metadata();
+ input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_5.xls");
+ try {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+
+ // Sheet names
+ assertContains("Feuil1", content);
+ assertContains("Feuil3", content);
+
+ // Text
+ assertContains("Sample Excel", content);
+ assertContains("Number", content);
+
+ // Numbers
+ assertContains("15", content);
+ assertContains("225", content);
+
+ // Metadata was also fetched
+ assertEquals("Simple Excel document", m.get(TikaCoreProperties.TITLE));
+ assertEquals("Keith Bennett", m.get(TikaCoreProperties.CREATOR));
+ } finally {
+ input.close();
+ }
+
+ // Parse the Excel 95 file
+ m = new Metadata();
+ input = ExcelParserTest.class.getResourceAsStream("/test-documents/testEXCEL_95.xls");
+ try {
+ ContentHandler handler = new BodyContentHandler(-1);
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ parser.parse(input, handler, m, context);
+
+ String content = handler.toString();
+
+ // Sheet name
+ assertContains("Foglio1", content);
+
+ // Very boring file, no actual text or numbers!
+
+ // Metadata was also fetched
+ assertEquals(null, m.get(TikaCoreProperties.TITLE));
+ assertEquals("Marco Quaranta", m.get(Office.LAST_AUTHOR));
+ } finally {
+ input.close();
+ }
}
/**