Author: tallison
Date: Thu Jul  2 14:19:15 2015
New Revision: 1688834

URL: http://svn.apache.org/r1688834
Log:
TIKA-1400

Added:
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xls
   (with props)
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xlsx
   (with props)
Modified:
    tika/trunk/CHANGES.txt
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1688834&r1=1688833&r2=1688834&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Jul  2 14:19:15 2015
@@ -1,4 +1,7 @@
 Release 1.10 - Current Development
+  * Add header/footer extraction to xls (via Aeham Abushwashi)
+    (TIKA-1400).
+
   * Drop the source file name from the embedded file path in
     RecursiveParserWrapper's "X-TIKA:embedded_resource_path" 
     (TIKA-1673).

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1688834&r1=1688833&r2=1688834&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
 Thu Jul  2 14:19:15 2015
@@ -43,8 +43,10 @@ import org.apache.poi.hssf.record.DateWi
 import org.apache.poi.hssf.record.DrawingGroupRecord;
 import org.apache.poi.hssf.record.EOFRecord;
 import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FooterRecord;
 import org.apache.poi.hssf.record.FormatRecord;
 import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.HeaderRecord;
 import org.apache.poi.hssf.record.HyperlinkRecord;
 import org.apache.poi.hssf.record.LabelRecord;
 import org.apache.poi.hssf.record.LabelSSTRecord;
@@ -296,6 +298,8 @@ public class ExcelExtractor extends Abst
                 hssfRequest.addListener(formatListener, FormatRecord.sid);
                 hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
                 hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
+                hssfRequest.addListener(formatListener, HeaderRecord.sid);
+                hssfRequest.addListener(formatListener, FooterRecord.sid);
             }
 
             // Create event factory and process Workbook (fire events)
@@ -462,6 +466,16 @@ public class ExcelExtractor extends Abst
                     //  the continue records are in
                     drawingGroups.add((DrawingGroupRecord) record);
                     break;
+                    
+                case HeaderRecord.sid:
+                       HeaderRecord headerRecord = (HeaderRecord) record;
+                       addTextCell(record, headerRecord.getText());
+                       break;
+                       
+                case FooterRecord.sid:
+                       FooterRecord footerRecord = (FooterRecord) record;
+                       addTextCell(record, footerRecord.getText());
+                       break;
 
             }
 

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1688834&r1=1688833&r2=1688834&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
 Thu Jul  2 14:19:15 2015
@@ -454,4 +454,38 @@ public class ExcelParserTest {
         assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
         assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
     }
+
+       @Test
+    public void testHeaderAndFooterExtraction() throws Exception {
+        InputStream input = ExcelParserTest.class.getResourceAsStream(
+                "/test-documents/testEXCEL_headers_footers.xls");
+        
+        try {
+            Metadata metadata = new Metadata();
+            ContentHandler handler = new BodyContentHandler();
+            ParseContext context = new ParseContext();
+            context.set(Locale.class, Locale.UK);
+            new OfficeParser().parse(input, handler, metadata, context);
+
+            assertEquals(
+                    "application/vnd.ms-excel",
+                    metadata.get(Metadata.CONTENT_TYPE));
+            assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+            assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
+            assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
+
+            String content = handler.toString();
+            assertContains("John Smith1", content);
+            assertContains("John Smith50", content);
+            assertContains("1 Corporate HQ", content);
+            assertContains("Header - Corporate Spreadsheet", content);
+            assertContains("Header - For Internal Use Only", content);
+            assertContains("Header - Author: John Smith", content);
+            assertContains("Footer - Corporate Spreadsheet", content);
+            assertContains("Footer - For Internal Use Only", content);
+            assertContains("Footer - Author: John Smith", content);
+        } finally {
+            input.close();
+        }
+    }
 }

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1688834&r1=1688833&r2=1688834&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 Thu Jul  2 14:19:15 2015
@@ -1249,6 +1249,29 @@ public class OOXMLParserTest extends Tik
         assertContains(">01..1 01..1", xml);
         assertContains(">02 02", xml);
     }
+
+    @Test
+    public void testExcelHeaderAndFooterExtraction() throws Exception {
+        XMLResult xml = getXML("testEXCEL_headers_footers.xlsx");
+
+        assertEquals(
+                "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+                xml.metadata.get(Metadata.CONTENT_TYPE));
+        assertEquals("Internal spreadsheet", xml.metadata.get(TikaCoreProperties.TITLE));
+        assertEquals("Aeham Abushwashi", xml.metadata.get(TikaCoreProperties.CREATOR));
+        assertEquals("Aeham Abushwashi", xml.metadata.get(Metadata.AUTHOR));
+
+        String content = xml.xml;
+        assertContains("John Smith1", content);
+        assertContains("John Smith50", content);
+        assertContains("1 Corporate HQ", content);
+        assertContains("Header - Corporate Spreadsheet", content);
+        assertContains("Header - For Internal Use Only", content);
+        assertContains("Header - Author: John Smith", content);
+        assertContains("Footer - Corporate Spreadsheet", content);
+        assertContains("Footer - For Internal Use Only", content);
+        assertContains("Footer - Author: John Smith", content);
+    }
 }
 
 

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xls
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xls?rev=1688834&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xls
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xlsx
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xlsx?rev=1688834&view=auto
==============================================================================
Binary file - no diff available.

Propchange: 
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xlsx
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream


Reply via email to