This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_3x by this push:
     new d375b4c14 TIKA-4410 (#2226)
d375b4c14 is described below

commit d375b4c145a97451069a5f0dd30ede98165c990d
Author: tallison <[email protected]>
AuthorDate: Tue Jun 3 08:49:38 2025 -0400

    TIKA-4410 (#2226)
    
    TIKA-4430 -- improve extraction of metadata from xls
---
 .../main/java/org/apache/tika/metadata/Office.java |  21 +++
 .../tika/parser/microsoft/ExcelExtractor.java      |  81 ++++++++++-
 .../microsoft/ooxml/CommentPersonHandler.java      |  47 +++++++
 .../parser/microsoft/ooxml/OPCPackageWrapper.java  |   3 +
 .../ooxml/XSSFExcelExtractorDecorator.java         | 149 ++++++++++++++++++++-
 .../tika/parser/microsoft/ExcelParserTest.java     |  15 +++
 .../test-documents/testEXCEL_extra_metadata.xls    | Bin 0 -> 12800 bytes
 7 files changed, 309 insertions(+), 7 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 2a9e428eb..39607445f 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -184,4 +184,25 @@ public interface Office {
             TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-importance");
     Property MAPI_IS_FLAGGED = Property.internalBoolean(PREFIX_DOC_META +
             TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER + "mapi-is-flagged");
+
+    Property HAS_HIDDEN_SHEETS = 
Property.internalBoolean("msoffice:excel:has-hidden-sheets");
+
+    Property HAS_HIDDEN_COLUMNS = 
Property.internalBoolean("msoffice:excel:has-hidden-cols");
+
+    Property HAS_HIDDEN_ROWS = 
Property.internalBoolean("msoffice:excel:has-hidden-rows");
+
+    Property HAS_VERY_HIDDEN_SHEETS = 
Property.internalBoolean("msoffice:excel:has-very-hidden-sheets");
+
+    Property HIDDEN_SHEET_NAMES = 
Property.internalTextBag("msoffice:excel:hidden-sheet-names");
+
+    Property VERY_HIDDEN_SHEET_NAMES = 
Property.internalTextBag("msoffice:excel:very-hidden-sheet-names");
+
+    Property PROTECTED_WORKSHEET = 
Property.internalBoolean("msoffice:excel:protected-worksheet");
+
+    Property WORKBOOK_CODENAME = 
Property.internalText("msoffice:excel:workbook-codename");
+
+    Property HAS_COMMENTS = Property.internalBoolean("msoffice:has-comments");
+
+    Property COMMENT_PERSONS = 
Property.internalTextBag("msoffice:comment-person-display-name");
+
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
index 2aac29d91..41a1a840e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ExcelExtractor.java
@@ -24,8 +24,10 @@ import java.util.Comparator;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
+import java.util.TreeSet;
 
 import org.apache.poi.ddf.EscherBSERecord;
 import org.apache.poi.ddf.EscherBlipRecord;
@@ -39,6 +41,7 @@ import org.apache.poi.hssf.model.InternalWorkbook;
 import org.apache.poi.hssf.record.BOFRecord;
 import org.apache.poi.hssf.record.BoundSheetRecord;
 import org.apache.poi.hssf.record.CellValueRecordInterface;
+import org.apache.poi.hssf.record.ColumnInfoRecord;
 import org.apache.poi.hssf.record.CountryRecord;
 import org.apache.poi.hssf.record.DateWindow1904Record;
 import org.apache.poi.hssf.record.DrawingGroupRecord;
@@ -51,9 +54,12 @@ import org.apache.poi.hssf.record.HeaderRecord;
 import org.apache.poi.hssf.record.HyperlinkRecord;
 import org.apache.poi.hssf.record.LabelRecord;
 import org.apache.poi.hssf.record.LabelSSTRecord;
+import org.apache.poi.hssf.record.NoteRecord;
 import org.apache.poi.hssf.record.NumberRecord;
+import org.apache.poi.hssf.record.ProtectRecord;
 import org.apache.poi.hssf.record.RKRecord;
 import org.apache.poi.hssf.record.Record;
+import org.apache.poi.hssf.record.RowRecord;
 import org.apache.poi.hssf.record.SSTRecord;
 import org.apache.poi.hssf.record.StringRecord;
 import org.apache.poi.hssf.record.TextObjectRecord;
@@ -73,8 +79,10 @@ import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * Excel parser implementation which uses POI's Event API
@@ -188,6 +196,7 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
                 new TikaHSSFListener(workbookEntryName, xhtml, locale, this, 
officeParserConfig);
         listener.processFile(root, isListenForAllRecords());
         listener.throwStoredException();
+        updateMetadata(listener);
 
         for (Entry entry : root) {
             if (entry.getName().startsWith("MBD") && entry instanceof 
DirectoryEntry) {
@@ -200,6 +209,36 @@ public class ExcelExtractor extends AbstractPOIFSExtractor 
{
         }
     }
 
+    private void updateMetadata(TikaHSSFListener listener) {
+        if (listener.hasProtectedSheet) {
+            parentMetadata.set(Office.PROTECTED_WORKSHEET, true);
+        }
+        if (listener.hasHiddenColumn) {
+            parentMetadata.set(Office.HAS_HIDDEN_COLUMNS, true);
+        }
+        if (listener.hasHiddenRow) {
+            parentMetadata.set(Office.HAS_HIDDEN_ROWS, true);
+        }
+        if (! listener.commentAuthors.isEmpty()) {
+            for (String author : listener.commentAuthors) {
+                parentMetadata.add(Office.COMMENT_PERSONS, author);
+            }
+            parentMetadata.set(Office.HAS_COMMENTS, true);
+        }
+        if (! listener.hiddenSheets.isEmpty()) {
+            for (String sheetName : listener.hiddenSheets) {
+                parentMetadata.add(Office.HIDDEN_SHEET_NAMES, sheetName);
+            }
+            parentMetadata.set(Office.HAS_HIDDEN_SHEETS, true);
+        }
+        if (! listener.veryHiddenSheets.isEmpty()) {
+            for (String sheetName : listener.veryHiddenSheets) {
+                parentMetadata.add(Office.VERY_HIDDEN_SHEET_NAMES, sheetName);
+            }
+            parentMetadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
+        }
+    }
+
     // ======================================================================
 
     /**
@@ -266,7 +305,14 @@ public class ExcelExtractor extends AbstractPOIFSExtractor 
{
          * depend on continue records that aren't always
          * contiguous. Collect them for later processing.
          */
-        private List<DrawingGroupRecord> drawingGroups = new ArrayList<>();
+        private final List<DrawingGroupRecord> drawingGroups = new 
ArrayList<>();
+
+        private final List<String> hiddenSheets = new ArrayList<>();
+        private final List<String> veryHiddenSheets = new ArrayList<>();
+        private final Set<String> commentAuthors = new TreeSet<>();
+        private boolean hasHiddenColumn = false;
+        private boolean hasHiddenRow = false;
+        private boolean hasProtectedSheet = false;
 
         /**
          * Construct a new listener instance outputting parsed data to
@@ -328,6 +374,10 @@ public class ExcelExtractor extends AbstractPOIFSExtractor 
{
                 hssfRequest.addListener(formatListener, FormatRecord.sid);
                 hssfRequest.addListener(formatListener, 
ExtendedFormatRecord.sid);
                 hssfRequest.addListener(formatListener, 
DrawingGroupRecord.sid);
+                hssfRequest.addListener(formatListener, ProtectRecord.sid);
+                hssfRequest.addListener(formatListener, ColumnInfoRecord.sid);
+                hssfRequest.addListener(formatListener, RowRecord.sid);
+                hssfRequest.addListener(formatListener, NoteRecord.sid);
                 if (extractor.officeParserConfig.isIncludeHeadersAndFooters()) 
{
                     hssfRequest.addListener(formatListener, HeaderRecord.sid);
                     hssfRequest.addListener(formatListener, FooterRecord.sid);
@@ -419,6 +469,12 @@ public class ExcelExtractor extends AbstractPOIFSExtractor 
{
 
                 case BoundSheetRecord.sid: // Worksheet index record
                     BoundSheetRecord boundSheetRecord = (BoundSheetRecord) 
record;
+                    if (boundSheetRecord.isHidden()) {
+                        hiddenSheets.add(boundSheetRecord.getSheetname());
+                    }
+                    if (boundSheetRecord.isVeryHidden()) {
+                        veryHiddenSheets.add(boundSheetRecord.getSheetname());
+                    }
                     sheetNames.add(boundSheetRecord.getSheetname());
                     break;
 
@@ -524,6 +580,28 @@ public class ExcelExtractor extends AbstractPOIFSExtractor 
{
                         addTextCell(record, footerRecord.getText());
                     }
                     break;
+                case ProtectRecord.sid:
+                    if (((ProtectRecord)record).getProtect()) {
+                        //TODO -- associate this worksheet name
+                        hasProtectedSheet = true;
+                    }
+                    break;
+                case ColumnInfoRecord.sid:
+                    if (((ColumnInfoRecord)record).getHidden()) {
+                        hasHiddenColumn = true;
+                    }
+                    break;
+                case NoteRecord.sid:
+                    String author = ((NoteRecord)record).getAuthor();
+                    if (!StringUtils.isBlank(author)) {
+                        commentAuthors.add(author);
+                    }
+                    break;
+                case RowRecord.sid:
+                    if (((RowRecord)record).getZeroHeight()) {
+                        hasHiddenRow = true;
+                    }
+                    break;
             }
 
             previousSid = record.getSid();
@@ -680,7 +758,6 @@ public class ExcelExtractor extends AbstractPOIFSExtractor {
 
             @Override
             public void processRecord(Record record) {
-//                System.out.println(record.getClass() + " : 
"+record.toString());
                 super.processRecord(record);
             }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java
new file mode 100644
index 000000000..c7efda1ae
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentPersonHandler.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
+import org.apache.tika.utils.StringUtils;
+import org.apache.tika.utils.XMLReaderUtils;
+
+public class CommentPersonHandler extends DefaultHandler {
+
+    private final Metadata metadata;
+
+    CommentPersonHandler(Metadata metadata) {
+        this.metadata = metadata;
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, 
Attributes atts) throws SAXException {
+        //what else do we want?
+        //<person displayName="Wiley Coyote" 
id="{11111111-2234-2342-2342-23498237923}" userId="55bbdf23486284" 
providerId="Windows Live"/>
+        if ("person".equals(localName)) {
+            String displayName = XMLReaderUtils.getAttrValue("displayName", 
atts);
+            if (!StringUtils.isBlank(displayName)) {
+                metadata.add(Office.COMMENT_PERSONS, displayName);
+            }
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
index 2cfd24f92..1fb0b8e40 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
@@ -29,6 +29,9 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
  */
 public class OPCPackageWrapper implements Closeable {
 
+    public static final String PERSON_RELATION = 
"http://schemas.microsoft.com/office/2017/10/relationships/person";
+    public static final String THREADED_COMMENT_RELATION = 
"http://schemas.microsoft.com/office/2017/10/relationships/threadedComment";
+
     private final OPCPackage opcPackage;
 
     public OPCPackageWrapper(OPCPackage opcPackage) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index da5357937..873242927 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -35,6 +35,7 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 import org.apache.poi.openxml4j.opc.PackagePart;
 import org.apache.poi.openxml4j.opc.PackagePartName;
 import org.apache.poi.openxml4j.opc.PackageRelationship;
+import org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackageRelationshipTypes;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
@@ -68,11 +69,13 @@ import org.xml.sax.helpers.DefaultHandler;
 import org.apache.tika.exception.RuntimeSAXException;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.TikaExcelDataFormatter;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 import org.apache.tika.utils.XMLReaderUtils;
 
 public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor {
@@ -159,9 +162,12 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                 sheetParts.add(sheetPart);
 
                 Comments comments = iter.getSheetComments();
+                if (comments != null && comments.getNumberOfComments() > 0) {
+                    metadata.set(Office.HAS_COMMENTS, true);
+                }
 
                 // Start, and output the sheet name
-                xhtml.startElement("div");
+                xhtml.startElement("div", "class", "sheet");
                 xhtml.element("h1", iter.getSheetName());
 
                 // Extract the main sheet contents
@@ -169,9 +175,14 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                 xhtml.startElement("tbody");
 
                 processSheet(sheetExtractor, comments, styles, strings, 
stream);
+                try {
+                    getThreadedComments(container, sheetPart, xhtml);
+                } catch (InvalidFormatException | TikaException | IOException 
e) {
+                    //swallow
+                }
+                xhtml.endElement("tbody");
+                xhtml.endElement("table");
             }
-            xhtml.endElement("tbody");
-            xhtml.endElement("table");
 
             // Output any headers and footers
             // (Need to process the sheet to get them, so we can't
@@ -201,13 +212,63 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
         //consider adding this back to POI
         try (InputStream wbData = xssfReader.getWorkbookData()) {
             XMLReaderUtils
-                    .parseSAX(wbData, new AbsPathExtractorHandler(),
+                    .parseSAX(wbData, new WorkbookMetadataHandler(),
                             parseContext);
         } catch (InvalidFormatException | TikaException e) {
             //swallow
         }
+        try {
+            getPersons(container, metadata);
+        } catch (InvalidFormatException | TikaException | IOException | 
SAXException e) {
+            //swallow
+        }
+
     }
 
+    private void getThreadedComments(OPCPackage container, PackagePart 
sheetPart, XHTMLContentHandler xhtml) throws TikaException,
+            InvalidFormatException, SAXException, IOException {
+        //consider caching the person id -> person names in getPersons and 
injecting that into the xhtml per comment?
+        PackageRelationshipCollection coll = 
sheetPart.getRelationshipsByType(OPCPackageWrapper.THREADED_COMMENT_RELATION);
+        if (coll == null || coll.isEmpty()) {
+            return;
+        }
+        for (PackageRelationship rel : coll) {
+            PackagePart threadedCommentPart = sheetPart.getRelatedPart(rel);
+            if (threadedCommentPart == null) {
+                continue;
+            }
+            try (InputStream is = threadedCommentPart.getInputStream()) {
+                XMLReaderUtils.parseSAX(is, new ThreadedCommentHandler(xhtml), 
parseContext);
+            }
+        }
+    }
+
+    private void getPersons(OPCPackage container, Metadata metadata) throws 
TikaException, InvalidFormatException,
+            IOException, SAXException {
+        PackageRelationship coreDocRelationship = 
container.getRelationshipsByType(
+                PackageRelationshipTypes.CORE_DOCUMENT).getRelationship(0);
+        if (coreDocRelationship == null) {
+            return;
+        }
+        // Get the part that holds the workbook
+        PackagePart workbookPart = container.getPart(coreDocRelationship);
+        if (workbookPart == null) {
+            return;
+        }
+        PackageRelationshipCollection coll = 
workbookPart.getRelationshipsByType(OPCPackageWrapper.PERSON_RELATION);
+        if (coll == null) {
+            return;
+        }
+        for (PackageRelationship rel : coll) {
+            PackagePart personsPart = workbookPart.getRelatedPart(rel);
+            if (personsPart == null) {
+                continue;
+            }
+            try (InputStream is = personsPart.getInputStream()) {
+                XMLReaderUtils.parseSAX(is, new 
CommentPersonHandler(metadata), parseContext);
+            }
+        }
+    }
 
     protected void addDrawingHyperLinks(PackagePart sheetPart) {
         try {
@@ -357,6 +418,12 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
             if (handler.hasProtection) {
                 metadata.set(TikaCoreProperties.PROTECTED, "true");
             }
+            if (handler.hasHiddenColumn) {
+                metadata.set(Office.HAS_HIDDEN_COLUMNS, true);
+            }
+            if (handler.hasHiddenRow) {
+                metadata.set(Office.HAS_HIDDEN_ROWS, true);
+            }
         } catch (TikaException e) {
             throw new RuntimeException("SAX parser appears to be broken - " + 
e.getMessage());
         }
@@ -536,6 +603,8 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
     protected static class XSSFSheetInterestingPartsCapturer extends 
DefaultHandler {
         private ContentHandler delegate;
         private boolean hasProtection = false;
+        private boolean hasHiddenRow = false;
+        private boolean hasHiddenColumn = false;
 
         protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
             this.delegate = delegate;
@@ -546,6 +615,18 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
             if ("sheetProtection".equals(qName)) {
                 hasProtection = true;
             }
+            if (! hasHiddenRow && "row".equals(localName)) {
+                String v = atts.getValue("hidden");
+                if ("true".equals(v) || "1".equals(v)) {
+                    hasHiddenRow = true;
+                }
+            }
+            if (! hasHiddenColumn && "col".equals(localName)) {
+                String v = atts.getValue("hidden");
+                if ("true".equals(v) || "1".equals(v)) {
+                    hasHiddenColumn = true;
+                }
+            }
             delegate.startElement(uri, localName, qName, atts);
         }
 
@@ -590,7 +671,7 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
         }
     }
 
-    private class AbsPathExtractorHandler extends DefaultHandler {
+    private class WorkbookMetadataHandler extends DefaultHandler {
         @Override
         public void startElement(String uri, String localName, String qName, 
Attributes atts)
                 throws SAXException {
@@ -604,6 +685,64 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                         return;
                     }
                 }
+            } else if ("sheet".equals(localName)) {
+                String n = XMLReaderUtils.getAttrValue("name", atts);
+                String state = XMLReaderUtils.getAttrValue("state", atts);
+                if ("hidden".equals(state)) {
+                    metadata.set(Office.HAS_HIDDEN_SHEETS, true);
+                    metadata.add(Office.HIDDEN_SHEET_NAMES, n);
+                } else if ("veryHidden".equals(state)) {
+                    metadata.set(Office.HAS_VERY_HIDDEN_SHEETS, true);
+                    metadata.set(Office.VERY_HIDDEN_SHEET_NAMES, n);
+                }
+            } else if ("workbookPr".equals(localName)) {
+                String codeName = XMLReaderUtils.getAttrValue("codeName", 
atts);
+                if (!StringUtils.isBlank(codeName)) {
+                    metadata.set(Office.WORKBOOK_CODENAME, codeName);
+                }
+            }
+            // file version? <fileVersion appName="xl" lastEdited="7" 
lowestEdited="7" rupBuild="28526"/>
+        }
+    }
+
+    private static class ThreadedCommentHandler extends DefaultHandler {
+        private final XHTMLContentHandler xhtml;
+        StringBuilder sb = new StringBuilder();
+        boolean inText = false;
+        public ThreadedCommentHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, 
Attributes atts) throws SAXException {
+            if ("text".equals(localName)) {
+                inText = true;
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) 
throws SAXException {
+            if ("text".equals(localName)) {
+                xhtml.startElement("div", "class", "threaded-comment");
+                xhtml.startElement("p");
+                xhtml.characters(sb.toString());
+                xhtml.endElement("p");
+                xhtml.endElement("div");
+                sb.setLength(0);
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws 
SAXException {
+            if (inText) {
+                sb.append(ch, start, length);
+            }
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length) 
throws SAXException {
+            if (inText) {
+                sb.append(ch, start, length);
             }
         }
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
index 600194407..a90d79445 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ExcelParserTest.java
@@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.fail;
 
 import java.io.InputStream;
 import java.text.DecimalFormatSymbols;
+import java.util.List;
 import java.util.Locale;
 
 import org.apache.poi.util.LocaleUtil;
@@ -577,4 +578,18 @@ public class ExcelParserTest extends TikaTest {
             assertContains("1996-08-10", xml);
         }
     }
+
+    @Test
+    public void testExtraMetadata() throws Exception {
+        List<Metadata> metadataList = 
getRecursiveMetadata("testEXCEL_extra_metadata.xls");
+        Metadata m = metadataList.get(0);
+        assertEquals("Unknown Author", m.getValues(Office.COMMENT_PERSONS)[0]);
+        assertEquals("true", m.get(Office.HAS_HIDDEN_COLUMNS));
+        assertEquals("true", m.get(Office.HAS_HIDDEN_ROWS));
+        assertEquals("true", m.get(Office.PROTECTED_WORKSHEET));
+        assertEquals("hidden-sheet", 
m.getValues(Office.HIDDEN_SHEET_NAMES)[0]);
+        assertEquals("very-hidden-sheet", 
m.getValues(Office.VERY_HIDDEN_SHEET_NAMES)[0]);
+        assertEquals("true", m.get(Office.HAS_COMMENTS));
+        assertEquals("true", m.get(Office.HAS_HIDDEN_COLUMNS));
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls
new file mode 100644
index 000000000..e624857b7
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/resources/test-documents/testEXCEL_extra_metadata.xls differ

Reply via email to