Author: tallison
Date: Thu Jul 2 14:19:15 2015
New Revision: 1688834
URL: http://svn.apache.org/r1688834
Log:
TIKA-1400
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xls
(with props)
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xlsx
(with props)
Modified:
tika/trunk/CHANGES.txt
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
Modified: tika/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1688834&r1=1688833&r2=1688834&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Thu Jul 2 14:19:15 2015
@@ -1,4 +1,7 @@
Release 1.10 - Current Development
+ * Add header/footer extraction to xls (via Aeham Abushwashi)
+ (TIKA-1400).
+
* Drop the source file name from the embedded file path in
RecursiveParserWrapper's "X-TIKA:embedded_resource_path"
(TIKA-1673).
Modified:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java?rev=1688834&r1=1688833&r2=1688834&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java (original)
+++ tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java Thu Jul 2 14:19:15 2015
@@ -43,8 +43,10 @@ import org.apache.poi.hssf.record.DateWi
import org.apache.poi.hssf.record.DrawingGroupRecord;
import org.apache.poi.hssf.record.EOFRecord;
import org.apache.poi.hssf.record.ExtendedFormatRecord;
+import org.apache.poi.hssf.record.FooterRecord;
import org.apache.poi.hssf.record.FormatRecord;
import org.apache.poi.hssf.record.FormulaRecord;
+import org.apache.poi.hssf.record.HeaderRecord;
import org.apache.poi.hssf.record.HyperlinkRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
@@ -296,6 +298,8 @@ public class ExcelExtractor extends Abst
hssfRequest.addListener(formatListener, FormatRecord.sid);
hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
+ hssfRequest.addListener(formatListener, HeaderRecord.sid);
+ hssfRequest.addListener(formatListener, FooterRecord.sid);
}
// Create event factory and process Workbook (fire events)
@@ -462,6 +466,16 @@ public class ExcelExtractor extends Abst
// the continue records are in
drawingGroups.add((DrawingGroupRecord) record);
break;
+
+ case HeaderRecord.sid:
+ HeaderRecord headerRecord = (HeaderRecord) record;
+ addTextCell(record, headerRecord.getText());
+ break;
+
+ case FooterRecord.sid:
+ FooterRecord footerRecord = (FooterRecord) record;
+ addTextCell(record, footerRecord.getText());
+ break;
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java?rev=1688834&r1=1688833&r2=1688834&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java Thu Jul 2 14:19:15 2015
@@ -454,4 +454,38 @@ public class ExcelParserTest {
assertEquals("2010-12-30T22:00:00Z", metadata.get("custom:MyCustomDate"));
assertEquals("2010-12-29T22:00:00Z", metadata.get("custom:myCustomSecondDate"));
}
+
+ @Test
+ public void testHeaderAndFooterExtraction() throws Exception {
+ InputStream input = ExcelParserTest.class.getResourceAsStream(
+ "/test-documents/testEXCEL_headers_footers.xls");
+
+ try {
+ Metadata metadata = new Metadata();
+ ContentHandler handler = new BodyContentHandler();
+ ParseContext context = new ParseContext();
+ context.set(Locale.class, Locale.UK);
+ new OfficeParser().parse(input, handler, metadata, context);
+
+ assertEquals(
+ "application/vnd.ms-excel",
+ metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Internal spreadsheet", metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Aeham Abushwashi", metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Aeham Abushwashi", metadata.get(Metadata.AUTHOR));
+
+ String content = handler.toString();
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertContains("Header - Corporate Spreadsheet", content);
+ assertContains("Header - For Internal Use Only", content);
+ assertContains("Header - Author: John Smith", content);
+ assertContains("Footer - Corporate Spreadsheet", content);
+ assertContains("Footer - For Internal Use Only", content);
+ assertContains("Footer - Author: John Smith", content);
+ } finally {
+ input.close();
+ }
+ }
}
Modified:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java?rev=1688834&r1=1688833&r2=1688834&view=diff
==============================================================================
--- tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java (original)
+++ tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java Thu Jul 2 14:19:15 2015
@@ -1249,6 +1249,29 @@ public class OOXMLParserTest extends Tik
assertContains(">01..1 01..1", xml);
assertContains(">02 02", xml);
}
+
+ @Test
+ public void testExcelHeaderAndFooterExtraction() throws Exception {
+ XMLResult xml = getXML("testEXCEL_headers_footers.xlsx");
+
+ assertEquals(
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ xml.metadata.get(Metadata.CONTENT_TYPE));
+ assertEquals("Internal spreadsheet", xml.metadata.get(TikaCoreProperties.TITLE));
+ assertEquals("Aeham Abushwashi", xml.metadata.get(TikaCoreProperties.CREATOR));
+ assertEquals("Aeham Abushwashi", xml.metadata.get(Metadata.AUTHOR));
+
+ String content = xml.xml;
+ assertContains("John Smith1", content);
+ assertContains("John Smith50", content);
+ assertContains("1 Corporate HQ", content);
+ assertContains("Header - Corporate Spreadsheet", content);
+ assertContains("Header - For Internal Use Only", content);
+ assertContains("Header - Author: John Smith", content);
+ assertContains("Footer - Corporate Spreadsheet", content);
+ assertContains("Footer - For Internal Use Only", content);
+ assertContains("Footer - Author: John Smith", content);
+ }
}
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xls
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xls?rev=1688834&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xls
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xlsx
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xlsx?rev=1688834&view=auto
==============================================================================
Binary file - no diff available.
Propchange:
tika/trunk/tika-parsers/src/test/resources/test-documents/testEXCEL_headers_footers.xlsx
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream