Author: nick
Date: Sat Jul 31 17:00:57 2010
New Revision: 981076
URL: http://svn.apache.org/viewvc?rev=981076&view=rev
Log:
Excel parsing improvements for files with charts (TIKA-214)
Support chart-based sheets: output chart labels, avoid overwriting sheet
entries with chart ones, and output extra sheet text inside the sheet but
outside the table.
Also adds a unit test based on the file from TIKA-214, along with a few
toString() methods to aid debugging.
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL-charts.xls
(with props)
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=981076&r1=981075&r2=981076&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
Sat Jul 31 17:00:57 2010
@@ -48,6 +48,7 @@ import org.apache.poi.hssf.record.Number
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.hssf.record.chart.SeriesTextRecord;
import org.apache.poi.hssf.record.common.UnicodeString;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -142,6 +143,8 @@ public class ExcelExtractor {
private SAXException exception = null;
private SSTRecord sstRecord;
+
+ private short previousSid;
/**
* Internal <code>FormatTrackingHSSFListener</code> to handle cell
@@ -165,6 +168,12 @@ public class ExcelExtractor {
* worksheet is currently active.
*/
private SortedMap<Point, Cell> currentSheet = null;
+
+ /**
+ * Extra text or cells that crops up, typically as part of a
+ * worksheet but not always.
+ */
+ private List<Cell> extraTextCells = new ArrayList<Cell>();
/**
* Format for rendering numbers in the worksheet. Currently we just
@@ -216,6 +225,7 @@ public class ExcelExtractor {
hssfRequest.addListener(formatListener, RKRecord.sid);
hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
hssfRequest.addListener(formatListener, TextObjectRecord.sid);
+ hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
hssfRequest.addListener(formatListener, FormatRecord.sid);
hssfRequest.addListener(formatListener,
ExtendedFormatRecord.sid);
}
@@ -224,6 +234,9 @@ public class ExcelExtractor {
DocumentInputStream documentInputStream =
filesystem.createDocumentInputStream("Workbook");
HSSFEventFactory eventFactory = new HSSFEventFactory();
eventFactory.processEvents(hssfRequest, documentInputStream);
+
+ // Output any extra text that came after all the sheets
+ processExtraText();
}
/**
@@ -253,10 +266,21 @@ public class ExcelExtractor {
BOFRecord bof = (BOFRecord) record;
if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
currentSheetIndex = -1;
+ } else if (bof.getType() == BOFRecord.TYPE_CHART) {
+ if(previousSid == EOFRecord.sid) {
+ // This is a sheet which contains only a chart
+ newSheet();
+ } else {
+ // This is a chart within a normal sheet
+ // Handling of this is a bit hacky...
+ if (currentSheet != null) {
+ processSheet();
+ currentSheetIndex--;
+ newSheet();
+ }
+ }
} else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
- currentSheetIndex++;
- currentSheet =
- new TreeMap<Point, Cell>(new PointComparator());
+ newSheet();
}
break;
@@ -313,13 +337,34 @@ public class ExcelExtractor {
}
}
break;
+
case TextObjectRecord.sid:
TextObjectRecord tor = (TextObjectRecord) record;
addTextCell(record, tor.getStr().getString());
break;
+
+ case SeriesTextRecord.sid: // Chart label or title
+ SeriesTextRecord str = (SeriesTextRecord) record;
+ addTextCell(record, str.getText());
+ break;
}
+
+ previousSid = record.getSid();
}
-
+
+ private void processExtraText() throws SAXException {
+ if(extraTextCells.size() > 0) {
+ for(Cell cell : extraTextCells) {
+ handler.startElement("div", "class", "outside");
+ cell.render(handler);
+ handler.endElement("div");
+ }
+
+ // Reset
+ extraTextCells.clear();
+ }
+ }
+
/**
* Adds the given cell (unless <code>null</code>) to the current
* worksheet (if any) at the position (if any) of the given record.
@@ -339,9 +384,7 @@ public class ExcelExtractor {
currentSheet.put(point, cell);
} else {
// Cell outside the worksheets
- handler.startElement("div", "class", "outside");
- cell.render(handler);
- handler.endElement("div");
+ extraTextCells.add(cell);
}
}
@@ -362,6 +405,11 @@ public class ExcelExtractor {
}
}
+ private void newSheet() {
+ currentSheetIndex++;
+ currentSheet = new TreeMap<Point, Cell>(new PointComparator());
+ }
+
/**
* Process an excel sheet.
*
@@ -405,6 +453,9 @@ public class ExcelExtractor {
// Sheet End
handler.endElement("tbody");
handler.endElement("table");
+
+ // Finish up
+ processExtraText();
handler.endElement("div");
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=981076&r1=981075&r2=981076&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
Sat Jul 31 17:00:57 2010
@@ -39,4 +39,7 @@ public class NumberCell implements Cell
handler.characters(format.format(number));
}
+ public String toString() {
+ return "Numeric Cell: " + format.format(number);
+ }
}
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=981076&r1=981075&r2=981076&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
(original)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
Sat Jul 31 17:00:57 2010
@@ -34,4 +34,7 @@ public class TextCell implements Cell {
handler.characters(text);
}
+ public String toString() {
+ return "Text Cell: \"" + text + "\"";
+ }
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=981076&r1=981075&r2=981076&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
(original)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
Sat Jul 31 17:00:57 2010
@@ -125,4 +125,43 @@ public class ExcelParserTest extends Tes
}
}
+ /**
+ * TIKA-214 - Ensure we extract labels etc from Charts
+ */
+ public void testExcelParserCharts() throws Exception {
+ InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL-charts.xls");
+ try {
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.US);
+ ContentHandler handler = new BodyContentHandler();
+ new OfficeParser().parse(input, handler, metadata, context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ metadata.get(Metadata.CONTENT_TYPE));
+
+ String content = handler.toString();
+
+ // The first sheet has a pie chart
+ assertTrue(content.contains("charttabyodawg"));
+ assertTrue(content.contains("WhamPuff"));
+
+ // The second sheet has a bar chart and some text
+ assertTrue(content.contains("Sheet1"));
+ assertTrue(content.contains("Test Excel Spreasheet"));
+ assertTrue(content.contains("foo"));
+ assertTrue(content.contains("bar"));
+ assertTrue(content.contains("fizzlepuff"));
+ assertTrue(content.contains("whyaxis"));
+ assertTrue(content.contains("eksaxis"));
+
+ // The third sheet has some text
+ assertTrue(content.contains("Sheet2"));
+ assertTrue(content.contains("dingdong"));
+ } finally {
+ input.close();
+ }
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL-charts.xls
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL-charts.xls?rev=981076&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL-charts.xls
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream