This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 320d728 TIKA-3030 -- allow for more flexibility in the naming of the
workbook entry in .xls files
320d728 is described below
commit 320d7281f28051e11b6df02e5a97df61bcc8716f
Author: tballison <[email protected]>
AuthorDate: Wed Jan 29 09:32:42 2020 -0500
TIKA-3030 -- allow for more flexibility in the naming of the workbook entry
in .xls files
---
.../tika/parser/microsoft/ExcelExtractor.java | 38 +++++++++++++++++----
.../parser/microsoft/POIFSContainerDetector.java | 16 +++++----
.../tika/parser/microsoft/SummaryExtractor.java | 5 +++
.../tika/parser/microsoft/ExcelParserTest.java | 6 ++++
.../testEXCEL_WORKBOOK_in_capitals.xls | Bin 0 -> 64512 bytes
5 files changed, 52 insertions(+), 13 deletions(-)
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 0dd86ba..3ccd019 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -35,6 +35,7 @@ import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.extractor.OldExcelExtractor;
+import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
@@ -92,7 +93,6 @@ import org.xml.sax.SAXException;
*/
public class ExcelExtractor extends AbstractPOIFSExtractor {
- private static final String WORKBOOK_ENTRY = "Workbook";
private static final String BOOK_ENTRY = "Book";
/**
* <code>true</code> if the HSSFListener should be registered
@@ -147,7 +147,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
protected void parse(
DirectoryNode root, XHTMLContentHandler xhtml,
Locale locale) throws IOException, SAXException, TikaException {
- if (!root.hasEntry(WORKBOOK_ENTRY)) {
+ String workbookEntryName = findWorkbookEntry(root);
+ if (workbookEntryName == null) {
if (root.hasEntry(BOOK_ENTRY)) {
// Excel 5 / Excel 95 file
// Records are in a different structure so needs a
@@ -156,8 +157,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
OldExcelParser.parse(extractor, xhtml);
return;
} else {
- // Corrupt file / very old file, just skip text extraction
- return;
+ // Corrupt file / very old file
+ throw new TikaException("Couldn't find workbook entry");
}
}
@@ -165,7 +166,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
Biff8EncryptionKey.setCurrentUserPassword(getPassword());
// Have the file processed in event mode
- TikaHSSFListener listener = new TikaHSSFListener(xhtml, locale, this, officeParserConfig);
+ TikaHSSFListener listener = new TikaHSSFListener(workbookEntryName,
+ xhtml, locale, this, officeParserConfig);
listener.processFile(root, isListenForAllRecords());
listener.throwStoredException();
@@ -181,6 +183,22 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
}
}
+ /**
+ * Looks for one of the variant names for the workbook entry;
+ * returns null if not found.
+ *
+ * @param root directory root to search
+ * @return workbook entry or null
+ */
+ private static String findWorkbookEntry(DirectoryNode root) {
+ for (String workbookDirEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
+ if (root.hasEntry(workbookDirEntryName)) {
+ return workbookDirEntryName;
+ }
+ }
+ return null;
+ }
+
// ======================================================================
/**
@@ -222,6 +240,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
*/
private FormatTrackingHSSFListener formatListener;
private final TikaExcelDataFormatter tikaExcelDataFormatter;
+ private final String workbookEntryName;
/**
* List of worksheet names.
@@ -255,7 +274,11 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
*
* @param handler Destination to write the parsed output to
*/
- private TikaHSSFListener(XHTMLContentHandler handler, Locale locale, AbstractPOIFSExtractor extractor, OfficeParserConfig officeParserConfig) {
+ private TikaHSSFListener(String workbookEntryName,
+ XHTMLContentHandler handler, Locale locale,
+ AbstractPOIFSExtractor extractor,
+ OfficeParserConfig officeParserConfig) {
+ this.workbookEntryName = workbookEntryName;
this.handler = handler;
this.extractor = extractor;
this.format = NumberFormat.getInstance(locale);
@@ -310,7 +333,8 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
}
// Create event factory and process Workbook (fire events)
- DocumentInputStream documentInputStream = root.createDocumentInputStream(WORKBOOK_ENTRY);
+ DocumentInputStream documentInputStream = root.createDocumentInputStream(
+ workbookEntryName);
HSSFEventFactory eventFactory = new HSSFEventFactory();
try {
eventFactory.processEvents(hssfRequest, documentInputStream);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
index 64953e7..7f44913 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/POIFSContainerDetector.java
@@ -29,6 +29,7 @@ import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
+import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
@@ -208,6 +209,15 @@ public class POIFSContainerDetector implements Detector {
*/
protected static MediaType detect(Set<String> names, DirectoryEntry root) {
if (names != null) {
+ for (String workbookEntryName : InternalWorkbook.WORKBOOK_DIR_ENTRY_NAMES) {
+ if (names.contains(workbookEntryName)) {
+ MediaType tmp = processCompObjFormatType(root);
+ if (tmp.equals(MS_GRAPH_CHART)) {
+ return MS_GRAPH_CHART;
+ }
+ return XLS;
+ }
+ }
if (names.contains("SwDocContentMgr") && names.contains("SwDocMgrTempStorage")) {
return SLDWORKS;
} else if (names.contains("StarCalcDocument")) {
@@ -237,12 +247,6 @@ public class POIFSContainerDetector implements Detector {
// Works 7.0 spreadsheet files contain both
// we want to avoid classifying this as Excel
return XLR;
- } else if (names.contains("Workbook") || names.contains("WORKBOOK")) {
- MediaType tmp = processCompObjFormatType(root);
- if (tmp.equals(MS_GRAPH_CHART)) {
- return MS_GRAPH_CHART;
- }
- return XLS;
} else if (names.contains("Book")) {
// Excel 95 or older, we won't be able to parse this....
return XLS;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
index 8017184..ba98c0e 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/SummaryExtractor.java
@@ -78,6 +78,9 @@ public class SummaryExtractor {
DirectoryNode root, String entryName)
throws IOException, TikaException {
try {
+ if (! root.hasEntry(entryName)) {
+ return;
+ }
DocumentEntry entry =
(DocumentEntry) root.getEntry(entryName);
PropertySet properties =
@@ -94,6 +97,8 @@ public class SummaryExtractor {
// no property stream, just skip it
} catch (UnexpectedPropertySetTypeException e) {
throw new TikaException("Unexpected HPSF document", e);
+ } catch (SecurityException e) {
+ throw e;
} catch (Exception e) {
LOG.warn("Ignoring unexpected exception while parsing summary entry {}", entryName, e);
}
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index bcddecf..c6b79b6 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -573,4 +573,10 @@ public class ExcelParserTest extends TikaTest {
String xml = getXML("testEXCEL_labels-govdocs-515858.xls").xml;
assertContains("Morocco", xml);
}
+
+ @Test
+ public void testWorkBookInCapitals() throws Exception {
+ String xml = getXML("testEXCEL_WORKBOOK_in_capitals.xls").xml;
+ assertContains("Inventarliste", xml);
+ }
}
diff --git a/tika-parsers/src/test/resources/test-documents/testEXCEL_WORKBOOK_in_capitals.xls b/tika-parsers/src/test/resources/test-documents/testEXCEL_WORKBOOK_in_capitals.xls
new file mode 100644
index 0000000..1c3ef61
Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testEXCEL_WORKBOOK_in_capitals.xls differ