This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4410b
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1cec0bb3a0aeda6aa2b4d1465ce2370ad58a3da7
Author: tallison <[email protected]>
AuthorDate: Fri May 30 18:35:52 2025 -0400

    TIKA-4410 -- further additions
---
 .../main/java/org/apache/tika/metadata/Office.java |  5 ++
 .../parser/microsoft/ooxml/OPCPackageWrapper.java  |  2 +
 .../ooxml/XSSFExcelExtractorDecorator.java         | 92 +++++++++++++++++++++-
 3 files changed, 96 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java 
b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
index 9d5442b67..7883df999 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java
@@ -157,9 +157,14 @@ public interface Office {
 
     Property HAS_HIDDEN_SHEETS = 
Property.internalBoolean("msoffice:excel:has-hidden-sheets");
 
+    Property HAS_HIDDEN_COLUMNS = 
Property.internalBoolean("msoffice:excel:has-hidden-cols");
+
+    Property HAS_HIDDEN_ROWS = 
Property.internalBoolean("msoffice:excel:has-hidden-rows");
+
     Property HAS_VERY_HIDDEN_SHEETS = 
Property.internalBoolean("msoffice:excel:has-very-hidden-sheets");
 
     Property HIDDEN_SHEET_NAMES = 
Property.internalTextBag("msoffice:excel:hidden-sheet-names");
+
     Property VERY_HIDDEN_SHEET_NAMES = 
Property.internalTextBag("msoffice:excel:very-hidden-sheet-names");
 
     Property PROTECTED_WORKSHEET = 
Property.internalBoolean("msoffice:excel:protected-worksheet");
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
index 34834a416..1fb0b8e40 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java
@@ -30,6 +30,8 @@ import org.apache.poi.openxml4j.opc.OPCPackage;
 public class OPCPackageWrapper implements Closeable {
 
     public static final String PERSON_RELATION = 
"http://schemas.microsoft.com/office/2017/10/relationships/person";
+    public static final String THREADED_COMMENT_RELATION = 
"http://schemas.microsoft.com/office/2017/10/relationships/threadedComment";
+
     private final OPCPackage opcPackage;
 
     public OPCPackageWrapper(OPCPackage opcPackage) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
index 97c629b6a..b3a70c09c 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java
@@ -166,7 +166,7 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                 }
 
                 // Start, and output the sheet name
-                xhtml.startElement("div");
+                xhtml.startElement("div", "class", "sheet");
                 xhtml.element("h1", iter.getSheetName());
 
                 // Extract the main sheet contents
@@ -174,9 +174,14 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
                 xhtml.startElement("tbody");
 
                 processSheet(sheetExtractor, comments, styles, strings, 
stream);
+                try {
+                    getThreadedComments(container, sheetPart, xhtml);
+                } catch (InvalidFormatException | TikaException | IOException 
e) {
+                    //swallow
+                }
+                xhtml.endElement("tbody");
+                xhtml.endElement("table");
             }
-            xhtml.endElement("tbody");
-            xhtml.endElement("table");
 
             // Output any headers and footers
             // (Need to process the sheet to get them, so we can't
@@ -216,6 +221,25 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
         } catch (InvalidFormatException | TikaException | IOException | 
SAXException e) {
             //swallow
         }
+
+    }
+
+    /**
+     * Parses every part that is related to the given sheet part through
+     * {@link OPCPackageWrapper#THREADED_COMMENT_RELATION} and hands the SAX events
+     * to a {@link ThreadedCommentHandler} that writes to {@code xhtml}.
+     * Does nothing when the sheet has no such relationships.
+     *
+     * @param container the package being parsed (not used by this method)
+     * @param sheetPart the sheet part whose threaded-comment relationships are followed
+     * @param xhtml     the handler that receives the extracted comment text
+     */
+    private void getThreadedComments(OPCPackage container, PackagePart sheetPart,
+            XHTMLContentHandler xhtml)
+            throws TikaException, InvalidFormatException, SAXException, IOException {
+        //consider caching the person id -> person names in getPersons and
+        //injecting that into the xhtml per comment?
+        PackageRelationshipCollection threadedRels =
+                sheetPart.getRelationshipsByType(OPCPackageWrapper.THREADED_COMMENT_RELATION);
+        if (threadedRels != null && !threadedRels.isEmpty()) {
+            for (PackageRelationship threadedRel : threadedRels) {
+                PackagePart commentPart = sheetPart.getRelatedPart(threadedRel);
+                // a relationship without a resolvable part is skipped
+                if (commentPart != null) {
+                    try (InputStream commentStream = commentPart.getInputStream()) {
+                        XMLReaderUtils.parseSAX(commentStream,
+                                new ThreadedCommentHandler(xhtml), parseContext);
+                    }
+                }
+            }
+        }
     }
 
     private void getPersons(OPCPackage container, Metadata metadata) throws 
TikaException, InvalidFormatException,
@@ -393,6 +417,12 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
             if (handler.hasProtection) {
                 metadata.set(Office.PROTECTED_WORKSHEET, true);
             }
+            if (handler.hasHiddenColumn) {
+                metadata.set(Office.HAS_HIDDEN_COLUMNS, true);
+            }
+            if (handler.hasHiddenRow) {
+                metadata.set(Office.HAS_HIDDEN_ROWS, true);
+            }
         } catch (TikaException e) {
             throw new RuntimeException("SAX parser appears to be broken - " + 
e.getMessage());
         }
@@ -572,6 +602,8 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
     protected static class XSSFSheetInterestingPartsCapturer extends 
DefaultHandler {
         private ContentHandler delegate;
         private boolean hasProtection = false;
+        private boolean hasHiddenRow = false;
+        private boolean hasHiddenColumn = false;
 
         protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) {
             this.delegate = delegate;
@@ -582,6 +614,18 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
             if ("sheetProtection".equals(qName)) {
                 hasProtection = true;
             }
+            if (! hasHiddenRow && "row".equals(localName)) {
+                String v = atts.getValue("hidden");
+                if ("true".equals(v) || "1".equals(v)) {
+                    hasHiddenRow = true;
+                }
+            }
+            if (! hasHiddenColumn && "col".equals(localName)) {
+                String v = atts.getValue("hidden");
+                if ("true".equals(v) || "1".equals(v)) {
+                    hasHiddenColumn = true;
+                }
+            }
             delegate.startElement(uri, localName, qName, atts);
         }
 
@@ -659,4 +703,46 @@ public class XSSFExcelExtractorDecorator extends 
AbstractOOXMLExtractor {
             // file version? <fileVersion appName="xl" lastEdited="7" 
lowestEdited="7" rupBuild="28526"/>
         }
     }
+
+    private static class ThreadedCommentHandler extends DefaultHandler {
+        private final XHTMLContentHandler xhtml;
+        StringBuilder sb = new StringBuilder();
+        boolean inText = false;
+        public ThreadedCommentHandler(XHTMLContentHandler xhtml) {
+            this.xhtml = xhtml;
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String qName, 
Attributes atts) throws SAXException {
+            if ("text".equals(localName)) {
+                inText = true;
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String qName) 
throws SAXException {
+            if ("text".equals(localName)) {
+                xhtml.startElement("div", "class", "threaded-comment");
+                xhtml.startElement("p");
+                xhtml.characters(sb.toString());
+                xhtml.endElement("p");
+                xhtml.endElement("div");
+                sb.setLength(0);
+            }
+        }
+
+        @Override
+        public void characters(char[] ch, int start, int length) throws 
SAXException {
+            if (inText) {
+                sb.append(ch, start, length);
+            }
+        }
+
+        @Override
+        public void ignorableWhitespace(char[] ch, int start, int length) 
throws SAXException {
+            if (inText) {
+                sb.append(ch, start, length);
+            }
+        }
+    }
 }

Reply via email to