This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 2d199aa0d TIKA-4430 -- improve extraction of metadata from xls (#2240)
2d199aa0d is described below
commit 2d199aa0da8421f88bf424d84c106d092751e46b
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 3 09:10:24 2025 -0400
TIKA-4430 -- improve extraction of metadata from xls (#2240)
---
.../tika/parser/microsoft/ExcelExtractor.java | 81 ++++++++++++++++++++-
.../tika/parser/microsoft/ExcelParserTest.java | 15 ++++
.../test-documents/testEXCEL_extra_metadata.xls | Bin 0 -> 12800 bytes
3 files changed, 94 insertions(+), 2 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 2aac29d91..41a1a840e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -24,8 +24,10 @@ import java.util.Comparator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
+import java.util.TreeSet;
import org.apache.poi.ddf.EscherBSERecord;
import org.apache.poi.ddf.EscherBlipRecord;
@@ -39,6 +41,7 @@ import org.apache.poi.hssf.model.InternalWorkbook;
import org.apache.poi.hssf.record.BOFRecord;
import org.apache.poi.hssf.record.BoundSheetRecord;
import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.ColumnInfoRecord;
import org.apache.poi.hssf.record.CountryRecord;
import org.apache.poi.hssf.record.DateWindow1904Record;
import org.apache.poi.hssf.record.DrawingGroupRecord;
@@ -51,9 +54,12 @@ import org.apache.poi.hssf.record.HeaderRecord;
import org.apache.poi.hssf.record.HyperlinkRecord;
import org.apache.poi.hssf.record.LabelRecord;
import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NoteRecord;
import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.ProtectRecord;
import org.apache.poi.hssf.record.RKRecord;
import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.RowRecord;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hssf.record.StringRecord;
import org.apache.poi.hssf.record.TextObjectRecord;
@@ -73,8 +79,10 @@ import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
/**
* Excel parser implementation which uses POI's Event API
@@ -188,6 +196,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
new TikaHSSFListener(workbookEntryName, xhtml, locale, this,
officeParserConfig);
listener.processFile(root, isListenForAllRecords());
listener.throwStoredException();
+ updateMetadata(listener);
for (Entry entry : root) {
if (entry.getName().startsWith("MBD") && entry instanceof
DirectoryEntry) {
@@ -200,6 +209,36 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
}
}
+ /**
+ * Copies the findings collected by the listener into
+ * {@code parentMetadata}.
+ * <p>
+ * Each boolean flag is written only when the corresponding condition was
+ * observed; nothing is written for a negative finding. Author and sheet
+ * names are added one value at a time, followed by the matching "has" flag.
+ *
+ * @param listener listener whose collected fields are read
+ */
+ private void updateMetadata(TikaHSSFListener listener) {
+ if (listener.hasProtectedSheet) {
+ parentMetadata.set(Office.PROTECTED_WORKSHEET, true);
+ }
+ if (listener.hasHiddenColumn) {
+ parentMetadata.set(Office.HAS_HIDDEN_COLUMNS, true);
+ }
+ if (listener.hasHiddenRow) {
+ parentMetadata.set(Office.HAS_HIDDEN_ROWS, true);
+ }
+ // NOTE(review): HAS_COMMENTS is set only when at least one author name
+ // was collected -- confirm whether comments without an author should
+ // also set the flag.
+ if (! listener.commentAuthors.isEmpty()) {
+ for (String author : listener.commentAuthors) {
+ parentMetadata.add(Office.COMMENT_PERSONS, author);
+ }
+ parentMetadata.set(Office.HAS_COMMENTS, true);
+ }
+ // Hidden and very hidden sheets are reported under separate keys.
+ if (! listener.hiddenSheets.isEmpty()) {
+ for (String sheetName : listener.hiddenSheets) {
+ parentMetadata.add(Office.HIDDEN_SHEET_NAMES, sheetName);
+ }
+ parentMetadata.set(Office.HAS_HIDDEN_SHEETS, true);
+ }
+ if (! listener.veryHiddenSheets.isEmpty()) {
+ for (String sheetName : listener.veryHiddenSheets) {
+ parentMetadata.add(Office.VERY_HIDDEN_SHEET_NAMES, sheetName);
+ }
+ parentMetadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
+ }
+ }
+
// ======================================================================
/**
@@ -266,7 +305,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
* depend on continue records that aren't always
* contiguous. Collect them for later processing.
*/
- private List<DrawingGroupRecord> drawingGroups = new ArrayList<>();
+ private final List<DrawingGroupRecord> drawingGroups = new
ArrayList<>();
+
+ private final List<String> hiddenSheets = new ArrayList<>();
+ private final List<String> veryHiddenSheets = new ArrayList<>();
+ private final Set<String> commentAuthors = new TreeSet<>();
+ private boolean hasHiddenColumn = false;
+ private boolean hasHiddenRow = false;
+ private boolean hasProtectedSheet = false;
/**
* Construct a new listener instance outputting parsed data to
@@ -328,6 +374,10 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
hssfRequest.addListener(formatListener, FormatRecord.sid);
hssfRequest.addListener(formatListener,
ExtendedFormatRecord.sid);
hssfRequest.addListener(formatListener,
DrawingGroupRecord.sid);
+ hssfRequest.addListener(formatListener, ProtectRecord.sid);
+ hssfRequest.addListener(formatListener, ColumnInfoRecord.sid);
+ hssfRequest.addListener(formatListener, RowRecord.sid);
+ hssfRequest.addListener(formatListener, NoteRecord.sid);
if (extractor.officeParserConfig.isIncludeHeadersAndFooters())
{
hssfRequest.addListener(formatListener, HeaderRecord.sid);
hssfRequest.addListener(formatListener, FooterRecord.sid);
@@ -419,6 +469,12 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
case BoundSheetRecord.sid: // Worksheet index record
BoundSheetRecord boundSheetRecord = (BoundSheetRecord)
record;
+ if (boundSheetRecord.isHidden()) {
+ hiddenSheets.add(boundSheetRecord.getSheetname());
+ }
+ if (boundSheetRecord.isVeryHidden()) {
+ veryHiddenSheets.add(boundSheetRecord.getSheetname());
+ }
sheetNames.add(boundSheetRecord.getSheetname());
break;
@@ -524,6 +580,28 @@ public class ExcelExtractor extends AbstractPOIFSExtractor
{
addTextCell(record, footerRecord.getText());
}
break;
+ case ProtectRecord.sid:
+ if (((ProtectRecord)record).getProtect()) {
+ //TODO -- associate this worksheet name
+ hasProtectedSheet = true;
+ }
+ break;
+ case ColumnInfoRecord.sid:
+ if (((ColumnInfoRecord)record).getHidden()) {
+ hasHiddenColumn = true;
+ }
+ break;
+ case NoteRecord.sid:
+ String author = ((NoteRecord)record).getAuthor();
+ if (!StringUtils.isBlank(author)) {
+ commentAuthors.add(author);
+ }
+ break;
+ case RowRecord.sid:
+ if (((RowRecord)record).getZeroHeight()) {
+ hasHiddenRow = true;
+ }
+ break;
}
previousSid = record.getSid();
@@ -680,7 +758,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
@Override
public void processRecord(Record record) {
-// System.out.println(record.getClass() + " :
"+record.toString());
super.processRecord(record);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 600194407..a90d79445 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.fail;
import java.io.InputStream;
import java.text.DecimalFormatSymbols;
+import java.util.List;
import java.util.Locale;
import org.apache.poi.util.LocaleUtil;
@@ -577,4 +578,18 @@ public class ExcelParserTest extends TikaTest {
assertContains("1996-08-10", xml);
}
}
+
+ /**
+ * Checks the metadata reported for testEXCEL_extra_metadata.xls: comment
+ * author, hidden rows and columns, worksheet protection, and the names
+ * and flags for hidden and very hidden sheets.
+ */
+ @Test
+ public void testExtraMetadata() throws Exception {
+ List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_extra_metadata.xls");
+ // Index 0 is asserted against the workbook-level values below.
+ Metadata m = metadataList.get(0);
+ assertEquals("Unknown Author", m.getValues(Office.COMMENT_PERSONS)[0]);
+ assertEquals("true", m.get(Office.HAS_COMMENTS));
+ assertEquals("true", m.get(Office.HAS_HIDDEN_COLUMNS));
+ assertEquals("true", m.get(Office.HAS_HIDDEN_ROWS));
+ assertEquals("true", m.get(Office.PROTECTED_WORKSHEET));
+ // Each sheet-name list is expected to be accompanied by its boolean flag.
+ assertEquals("hidden-sheet", m.getValues(Office.HIDDEN_SHEET_NAMES)[0]);
+ assertEquals("true", m.get(Office.HAS_HIDDEN_SHEETS));
+ assertEquals("very-hidden-sheet", m.getValues(Office.VERY_HIDDEN_SHEET_NAMES)[0]);
+ assertEquals("true", m.get(Office.HAS_VERY_HIDDEN_SHEETS));
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls
new file mode 100644
index 000000000..e624857b7
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls
differ