This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 2d199aa0d TIKA-4430 -- improve extraction of metadata from xls (#2240)
2d199aa0d is described below

commit 2d199aa0da8421f88bf424d84c106d092751e46b
Author: Tim Allison <[email protected]>
AuthorDate: Tue Jun 3 09:10:24 2025 -0400

    TIKA-4430 -- improve extraction of metadata from xls (#2240)
---
 .../tika/parser/microsoft/ExcelExtractor.java      |  81 ++++++++++++++++++++-
 .../tika/parser/microsoft/ExcelParserTest.java     |  15 ++++
 .../test-documents/testEXCEL_extra_metadata.xls    | Bin 0 -> 12800 bytes
 3 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 2aac29d91..41a1a840e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -24,8 +24,10 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
+import java.util.TreeSet;
 
 import org.apache.poi.ddf.EscherBSERecord;
 import org.apache.poi.ddf.EscherBlipRecord;
@@ -39,6 +41,7 @@ import org.apache.poi.hssf.model.InternalWorkbook;
 import org.apache.poi.hssf.record.BOFRecord;
 import org.apache.poi.hssf.record.BoundSheetRecord;
 import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.ColumnInfoRecord;
 import org.apache.poi.hssf.record.CountryRecord;
 import org.apache.poi.hssf.record.DateWindow1904Record;
 import org.apache.poi.hssf.record.DrawingGroupRecord;
@@ -51,9 +54,12 @@ import org.apache.poi.hssf.record.HeaderRecord;
 import org.apache.poi.hssf.record.HyperlinkRecord;
 import org.apache.poi.hssf.record.LabelRecord;
 import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NoteRecord;
 import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.ProtectRecord;
 import org.apache.poi.hssf.record.RKRecord;
 import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.RowRecord;
 import org.apache.poi.hssf.record.SSTRecord;
 import org.apache.poi.hssf.record.StringRecord;
 import org.apache.poi.hssf.record.TextObjectRecord;
@@ -73,8 +79,10 @@ import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * Excel parser implementation which uses POI's Event API
@@ -188,6 +196,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                 new TikaHSSFListener(workbookEntryName, xhtml, locale, this, officeParserConfig);
         listener.processFile(root, isListenForAllRecords());
         listener.throwStoredException();
+        updateMetadata(listener);
 
         for (Entry entry : root) {
             if (entry.getName().startsWith("MBD") && entry instanceof DirectoryEntry) {
@@ -200,6 +209,36 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
         }
     }
 
+    private void updateMetadata(TikaHSSFListener listener) {
+        if (listener.hasProtectedSheet) {
+            parentMetadata.set(Office.PROTECTED_WORKSHEET, true);
+        }
+        if (listener.hasHiddenColumn) {
+            parentMetadata.set(Office.HAS_HIDDEN_COLUMNS, true);
+        }
+        if (listener.hasHiddenRow) {
+            parentMetadata.set(Office.HAS_HIDDEN_ROWS, true);
+        }
+        if (! listener.commentAuthors.isEmpty()) {
+            for (String author : listener.commentAuthors) {
+                parentMetadata.add(Office.COMMENT_PERSONS, author);
+            }
+            parentMetadata.set(Office.HAS_COMMENTS, true);
+        }
+        if (! listener.hiddenSheets.isEmpty()) {
+            for (String sheetName : listener.hiddenSheets) {
+                parentMetadata.add(Office.HIDDEN_SHEET_NAMES, sheetName);
+            }
+            parentMetadata.set(Office.HAS_HIDDEN_SHEETS, true);
+        }
+        if (! listener.veryHiddenSheets.isEmpty()) {
+            for (String sheetName : listener.veryHiddenSheets) {
+                parentMetadata.add(Office.VERY_HIDDEN_SHEET_NAMES, sheetName);
+            }
+            parentMetadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
+        }
+    }
+
     // ======================================================================
 
     /**
@@ -266,7 +305,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
          * depend on continue records that aren't always
          * contiguous. Collect them for later processing.
          */
-        private List<DrawingGroupRecord> drawingGroups = new ArrayList<>();
+        private final List<DrawingGroupRecord> drawingGroups = new ArrayList<>();
+
+        private final List<String> hiddenSheets = new ArrayList<>();
+        private final List<String> veryHiddenSheets = new ArrayList<>();
+        private final Set<String> commentAuthors = new TreeSet<>();
+        private boolean hasHiddenColumn = false;
+        private boolean hasHiddenRow = false;
+        private boolean hasProtectedSheet = false;
 
         /**
          * Construct a new listener instance outputting parsed data to
@@ -328,6 +374,10 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                 hssfRequest.addListener(formatListener, FormatRecord.sid);
                 hssfRequest.addListener(formatListener, ExtendedFormatRecord.sid);
                 hssfRequest.addListener(formatListener, DrawingGroupRecord.sid);
+                hssfRequest.addListener(formatListener, ProtectRecord.sid);
+                hssfRequest.addListener(formatListener, ColumnInfoRecord.sid);
+                hssfRequest.addListener(formatListener, RowRecord.sid);
+                hssfRequest.addListener(formatListener, NoteRecord.sid);
                 if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) {
                     hssfRequest.addListener(formatListener, HeaderRecord.sid);
                     hssfRequest.addListener(formatListener, FooterRecord.sid);
@@ -419,6 +469,12 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
 
                 case BoundSheetRecord.sid: // Worksheet index record
                     BoundSheetRecord boundSheetRecord = (BoundSheetRecord) record;
+                    if (boundSheetRecord.isHidden()) {
+                        hiddenSheets.add(boundSheetRecord.getSheetname());
+                    }
+                    if (boundSheetRecord.isVeryHidden()) {
+                        veryHiddenSheets.add(boundSheetRecord.getSheetname());
+                    }
                     sheetNames.add(boundSheetRecord.getSheetname());
                     break;
 
@@ -524,6 +580,28 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                         addTextCell(record, footerRecord.getText());
                     }
                     break;
+                case ProtectRecord.sid:
+                    if (((ProtectRecord)record).getProtect()) {
+                        //TODO -- associate this worksheet name
+                        hasProtectedSheet = true;
+                    }
+                    break;
+                case ColumnInfoRecord.sid:
+                    if (((ColumnInfoRecord)record).getHidden()) {
+                        hasHiddenColumn = true;
+                    }
+                    break;
+                case NoteRecord.sid:
+                    String author = ((NoteRecord)record).getAuthor();
+                    if (!StringUtils.isBlank(author)) {
+                        commentAuthors.add(author);
+                    }
+                    break;
+                case RowRecord.sid:
+                    if (((RowRecord)record).getZeroHeight()) {
+                        hasHiddenRow = true;
+                    }
+                    break;
             }
 
             previousSid = record.getSid();
@@ -680,7 +758,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
 
             @Override
             public void processRecord(Record record) {
-//                System.out.println(record.getClass() + " : "+record.toString());
                 super.processRecord(record);
             }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 600194407..a90d79445 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.fail;
 
 import java.io.InputStream;
 import java.text.DecimalFormatSymbols;
+import java.util.List;
 import java.util.Locale;
 
 import org.apache.poi.util.LocaleUtil;
@@ -577,4 +578,18 @@ public class ExcelParserTest extends TikaTest {
             assertContains("1996-08-10", xml);
         }
     }
+
+    @Test
+    public void testExtraMetadata() throws Exception {
+        List<Metadata> metadataList = getRecursiveMetadata("testEXCEL_extra_metadata.xls");
+        Metadata m = metadataList.get(0);
+        assertEquals("Unknown Author", m.getValues(Office.COMMENT_PERSONS)[0]);
+        assertEquals("true", m.get(Office.HAS_HIDDEN_COLUMNS));
+        assertEquals("true", m.get(Office.HAS_HIDDEN_ROWS));
+        assertEquals("true", m.get(Office.PROTECTED_WORKSHEET));
+        assertEquals("hidden-sheet", m.getValues(Office.HIDDEN_SHEET_NAMES)[0]);
+        assertEquals("very-hidden-sheet", m.getValues(Office.VERY_HIDDEN_SHEET_NAMES)[0]);
+        assertEquals("true", m.get(Office.HAS_COMMENTS));
+        assertEquals("true", m.get(Office.HAS_HIDDEN_COLUMNS));
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls
new file mode 100644
index 000000000..e624857b7
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls differ

Reply via email to