This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4410b in repository https://gitbox.apache.org/repos/asf/tika.git
commit 1cec0bb3a0aeda6aa2b4d1465ce2370ad58a3da7 Author: tallison <[email protected]> AuthorDate: Fri May 30 18:35:52 2025 -0400 TIKA-4410 -- further additions --- .../main/java/org/apache/tika/metadata/Office.java | 5 ++ .../parser/microsoft/ooxml/OPCPackageWrapper.java | 2 + .../ooxml/XSSFExcelExtractorDecorator.java | 92 +++++++++++++++++++++- 3 files changed, 96 insertions(+), 3 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/Office.java b/tika-core/src/main/java/org/apache/tika/metadata/Office.java index 9d5442b67..7883df999 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/Office.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/Office.java @@ -157,9 +157,14 @@ public interface Office { Property HAS_HIDDEN_SHEETS = Property.internalBoolean("msoffice:excel:has-hidden-sheets"); + Property HAS_HIDDEN_COLUMNS = Property.internalBoolean("msoffice:excel:has-hidden-cols"); + + Property HAS_HIDDEN_ROWS = Property.internalBoolean("msoffice:excel:has-hidden-rows"); + Property HAS_VERY_HIDDEN_SHEETS = Property.internalBoolean("msoffice:excel:has-very-hidden-sheets"); Property HIDDEN_SHEET_NAMES = Property.internalTextBag("msoffice:excel:hidden-sheet-names"); + Property VERY_HIDDEN_SHEET_NAMES = Property.internalTextBag("msoffice:excel:very-hidden-sheet-names"); Property PROTECTED_WORKSHEET = Property.internalBoolean("msoffice:excel:protected-worksheet"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java index 34834a416..1fb0b8e40 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OPCPackageWrapper.java @@ -30,6 +30,8 @@ import org.apache.poi.openxml4j.opc.OPCPackage; public class OPCPackageWrapper implements Closeable { public static final String PERSON_RELATION = "http://schemas.microsoft.com/office/2017/10/relationships/person"; + public static final String THREADED_COMMENT_RELATION = "http://schemas.microsoft.com/office/2017/10/relationships/threadedComment"; + private final OPCPackage opcPackage; public OPCPackageWrapper(OPCPackage opcPackage) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index 97c629b6a..b3a70c09c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -166,7 +166,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } // Start, and output the sheet name - xhtml.startElement("div"); + xhtml.startElement("div", "class", "sheet"); xhtml.element("h1", iter.getSheetName()); // Extract the main sheet contents @@ -174,9 +174,14 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { xhtml.startElement("tbody"); processSheet(sheetExtractor, comments, styles, strings, stream); + try { + getThreadedComments(container, sheetPart, xhtml); + } catch (InvalidFormatException | TikaException | IOException e) { + //swallow + } + xhtml.endElement("tbody"); + xhtml.endElement("table"); } - xhtml.endElement("tbody"); - xhtml.endElement("table"); // Output any headers and footers // (Need to process the sheet to get them, so we can't @@ -216,6 +221,25 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } catch (InvalidFormatException | TikaException | IOException | SAXException e) { //swallow } + + } + + private void getThreadedComments(OPCPackage container, PackagePart sheetPart, XHTMLContentHandler xhtml) throws TikaException, + InvalidFormatException, SAXException, IOException { + //consider caching the person id -> person names in getPersons and injecting that into the xhtml per comment? + PackageRelationshipCollection coll = sheetPart.getRelationshipsByType(OPCPackageWrapper.THREADED_COMMENT_RELATION); + if (coll == null || coll.isEmpty()) { + return; + } + for (PackageRelationship rel : coll) { + PackagePart threadedCommentPart = sheetPart.getRelatedPart(rel); + if (threadedCommentPart == null) { + continue; + } + try (InputStream is = threadedCommentPart.getInputStream()) { + XMLReaderUtils.parseSAX(is, new ThreadedCommentHandler(xhtml), parseContext); + } + } } private void getPersons(OPCPackage container, Metadata metadata) throws TikaException, InvalidFormatException, @@ -393,6 +417,12 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { if (handler.hasProtection) { metadata.set(Office.PROTECTED_WORKSHEET, true); } + if (handler.hasHiddenColumn) { + metadata.set(Office.HAS_HIDDEN_COLUMNS, true); + } + if (handler.hasHiddenRow) { + metadata.set(Office.HAS_HIDDEN_ROWS, true); + } } catch (TikaException e) { throw new RuntimeException("SAX parser appears to be broken - " + e.getMessage()); } @@ -572,6 +602,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { protected static class XSSFSheetInterestingPartsCapturer extends DefaultHandler { private ContentHandler delegate; private boolean hasProtection = false; + private boolean hasHiddenRow = false; + private boolean hasHiddenColumn = false; protected XSSFSheetInterestingPartsCapturer(ContentHandler delegate) { this.delegate = delegate; @@ -582,6 +614,18 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { if ("sheetProtection".equals(qName)) { hasProtection = true; } + if (! hasHiddenRow && "row".equals(localName)) { + String v = atts.getValue("hidden"); + if ("true".equals(v) || "1".equals(v)) { + hasHiddenRow = true; + } + } + if (! hasHiddenColumn && "col".equals(localName)) { + String v = atts.getValue("hidden"); + if ("true".equals(v) || "1".equals(v)) { + hasHiddenColumn = true; + } + } delegate.startElement(uri, localName, qName, atts); } @@ -659,4 +703,46 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { // file version? <fileVersion appName="xl" lastEdited="7" lowestEdited="7" rupBuild="28526"/> } } + + private static class ThreadedCommentHandler extends DefaultHandler { + private final XHTMLContentHandler xhtml; + StringBuilder sb = new StringBuilder(); + boolean inText = false; + public ThreadedCommentHandler(XHTMLContentHandler xhtml) { + this.xhtml = xhtml; + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) throws SAXException { + if ("text".equals(localName)) { + inText = true; + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if ("text".equals(localName)) { + xhtml.startElement("div", "class", "threaded-comment"); + xhtml.startElement("p"); + xhtml.characters(sb.toString()); + xhtml.endElement("p"); + xhtml.endElement("div"); + sb.setLength(0); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (inText) { + sb.append(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (inText) { + sb.append(ch, start, length); + } + } + } }
