Author: nick
Date: Sat Jul 31 17:00:57 2010
New Revision: 981076

URL: http://svn.apache.org/viewvc?rev=981076&view=rev
Log:
Excel parsing improvements for files with charts (TIKA-214)
Support chart-based sheets, outputting chart labels, not overwriting sheet 
entries with chart ones, and outputting extra sheet text inside the sheet but 
outside the table.
Also adds a unit test based on the file from TIKA-214, along with a few 
toString() methods to aid debugging.

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL-charts.xls  
 (with props)
Modified:
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=981076&r1=981075&r2=981076&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 Sat Jul 31 17:00:57 2010
@@ -48,6 +48,7 @@ import org.apache.poi.hssf.record.Number
 import org.apache.poi.hssf.record.RKRecord;
 import org.apache.poi.hssf.record.Record;
 import org.apache.poi.hssf.record.SSTRecord;
+import org.apache.poi.hssf.record.chart.SeriesTextRecord;
 import org.apache.poi.hssf.record.common.UnicodeString;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@@ -142,6 +143,8 @@ public class ExcelExtractor {
         private SAXException exception = null;
 
         private SSTRecord sstRecord;
+        
+        private short previousSid;
 
         /**
          * Internal <code>FormatTrackingHSSFListener</code> to handle cell
@@ -165,6 +168,12 @@ public class ExcelExtractor {
          * worksheet is currently active.
          */
         private SortedMap<Point, Cell> currentSheet = null;
+        
+        /**
+         * Extra text or cells that crop up, typically as part of a
+         *  worksheet but not always.
+         */
+        private List<Cell> extraTextCells = new ArrayList<Cell>();
 
         /**
          * Format for rendering numbers in the worksheet. Currently we just
@@ -216,6 +225,7 @@ public class ExcelExtractor {
                 hssfRequest.addListener(formatListener, RKRecord.sid);
                 hssfRequest.addListener(formatListener, HyperlinkRecord.sid);
                 hssfRequest.addListener(formatListener, TextObjectRecord.sid);
+                hssfRequest.addListener(formatListener, SeriesTextRecord.sid);
                 hssfRequest.addListener(formatListener, FormatRecord.sid);
                 hssfRequest.addListener(formatListener, 
ExtendedFormatRecord.sid);
             }
@@ -224,6 +234,9 @@ public class ExcelExtractor {
             DocumentInputStream documentInputStream = 
filesystem.createDocumentInputStream("Workbook");
             HSSFEventFactory eventFactory = new HSSFEventFactory();
             eventFactory.processEvents(hssfRequest, documentInputStream);
+            
+            // Output any extra text that came after all the sheets
+            processExtraText(); 
        }
 
         /**
@@ -253,10 +266,21 @@ public class ExcelExtractor {
                 BOFRecord bof = (BOFRecord) record;
                 if (bof.getType() == BOFRecord.TYPE_WORKBOOK) {
                     currentSheetIndex = -1;
+                } else if (bof.getType() == BOFRecord.TYPE_CHART) {
+                    if(previousSid == EOFRecord.sid) {
+                        // This is a sheet which contains only a chart
+                        newSheet();
+                    } else {
+                        // This is a chart within a normal sheet
+                        // Handling of this is a bit hacky...
+                        if (currentSheet != null) {
+                            processSheet();
+                            currentSheetIndex--;
+                            newSheet();
+                        }
+                    }
                 } else if (bof.getType() == BOFRecord.TYPE_WORKSHEET) {
-                    currentSheetIndex++;
-                    currentSheet =
-                        new TreeMap<Point, Cell>(new PointComparator());
+                    newSheet();
                 }
                 break;
 
@@ -313,13 +337,34 @@ public class ExcelExtractor {
                     }
                 }
                 break;
+                
             case TextObjectRecord.sid:
                 TextObjectRecord tor = (TextObjectRecord) record;
                 addTextCell(record, tor.getStr().getString());
                 break;
+                
+            case SeriesTextRecord.sid: // Chart label or title
+                SeriesTextRecord str = (SeriesTextRecord) record;
+                addTextCell(record, str.getText());
+                break;
             }
+            
+            previousSid = record.getSid();
         }
-
+        
+        private void processExtraText() throws SAXException {
+            if(extraTextCells.size() > 0) {
+                for(Cell cell : extraTextCells) {
+                    handler.startElement("div", "class", "outside");
+                    cell.render(handler);
+                    handler.endElement("div");
+                }
+                
+                // Reset
+                extraTextCells.clear();
+            }
+        }
+        
         /**
          * Adds the given cell (unless <code>null</code>) to the current
          * worksheet (if any) at the position (if any) of the given record.
@@ -339,9 +384,7 @@ public class ExcelExtractor {
                 currentSheet.put(point, cell);
             } else {
                 // Cell outside the worksheets
-                handler.startElement("div", "class", "outside");
-                cell.render(handler);
-                handler.endElement("div");
+                extraTextCells.add(cell);
             }
         }
 
@@ -362,6 +405,11 @@ public class ExcelExtractor {
             }
         }
 
+        private void newSheet() {
+            currentSheetIndex++;
+            currentSheet = new TreeMap<Point, Cell>(new PointComparator());
+        }
+
         /**
          * Process an excel sheet.
          *
@@ -405,6 +453,9 @@ public class ExcelExtractor {
             // Sheet End
             handler.endElement("tbody");
             handler.endElement("table");
+            
+            // Finish up
+            processExtraText();
             handler.endElement("div");
         }
 

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java?rev=981076&r1=981075&r2=981076&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/NumberCell.java
 Sat Jul 31 17:00:57 2010
@@ -39,4 +39,7 @@ public class NumberCell implements Cell 
         handler.characters(format.format(number));
     }
 
+    public String toString() {
+        return "Numeric Cell: " + format.format(number);
+    }
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java?rev=981076&r1=981075&r2=981076&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/TextCell.java
 Sat Jul 31 17:00:57 2010
@@ -34,4 +34,7 @@ public class TextCell implements Cell {
         handler.characters(text);
     }
 
+    public String toString() {
+        return "Text Cell: \"" + text + "\"";
+    }
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=981076&r1=981075&r2=981076&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 Sat Jul 31 17:00:57 2010
@@ -125,4 +125,43 @@ public class ExcelParserTest extends Tes
         }
     }
 
+    /**
+     * TIKA-214 - Ensure we extract labels etc from Charts
+     */
+    public void testExcelParserCharts() throws Exception {
+        InputStream input = ExcelParserTest.class.getResourceAsStream(
+                  "/test-documents/testEXCEL-charts.xls");
+        try {
+            Metadata metadata = new Metadata();
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.US);
+            ContentHandler handler = new BodyContentHandler();
+            new OfficeParser().parse(input, handler, metadata, context);
+        
+            assertEquals(
+                    "application/vnd.ms-excel",
+                    metadata.get(Metadata.CONTENT_TYPE));
+        
+            String content = handler.toString();
+            
+            // The first sheet has a pie chart
+            assertTrue(content.contains("charttabyodawg"));
+            assertTrue(content.contains("WhamPuff"));
+            
+            // The second sheet has a bar chart and some text
+            assertTrue(content.contains("Sheet1"));
+            assertTrue(content.contains("Test Excel Spreasheet"));
+            assertTrue(content.contains("foo"));
+            assertTrue(content.contains("bar"));
+            assertTrue(content.contains("fizzlepuff"));
+            assertTrue(content.contains("whyaxis"));
+            assertTrue(content.contains("eksaxis"));
+            
+            // The third sheet has some text
+            assertTrue(content.contains("Sheet2"));
+            assertTrue(content.contains("dingdong"));
+        } finally {
+            input.close();
+        }
+    }
 }

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL-charts.xls
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL-charts.xls?rev=981076&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL-charts.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to