This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4708-refactor-xlsx in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6f7eca02f55262de29e7c45c79f88d4b5ec993c2 Author: tallison <[email protected]> AuthorDate: Fri Apr 3 07:31:20 2026 -0400 refactor xlsx - WIP --- .../microsoft/ooxml/TikaSheetContentsHandler.java | 36 ++++ .../microsoft/ooxml/TikaSheetXMLHandler.java | 34 ++-- .../parser/microsoft/ooxml/XSSFCommentsShim.java | 187 +++++++++++++++++++++ .../ooxml/XSSFExcelExtractorDecorator.java | 68 +++++++- 4 files changed, 297 insertions(+), 28 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java new file mode 100644 index 0000000000..44173ec322 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetContentsHandler.java @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +/** + * Sheet contents handler that uses {@link XSSFCommentsShim.CommentData} + * instead of POI's XMLBeans-dependent {@code XSSFComment}. + */ +interface TikaSheetContentsHandler { + + void startRow(int rowNum); + + void endRow(int rowNum); + + void cell(String cellRef, String formattedValue, XSSFCommentsShim.CommentData comment); + + default void headerFooter(String text, boolean isHeader, String tagName) { + } + + default void endSheet() { + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java index c7276e92a5..3ba83dd255 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/TikaSheetXMLHandler.java @@ -23,9 +23,6 @@ import java.util.Queue; import org.apache.poi.ss.usermodel.BuiltinFormats; import org.apache.poi.ss.usermodel.DataFormatter; import org.apache.poi.ss.util.CellAddress; -import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; -import org.apache.poi.xssf.model.Comments; -import org.apache.poi.xssf.usermodel.XSSFComment; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.Attributes; @@ -34,11 +31,10 @@ import org.xml.sax.helpers.DefaultHandler; /** * Sheet XML handler for XLSX event-based parsing that uses {@link XSSFStylesShim} - * instead of POI's XMLBeans-dependent {@code StylesTable}. + * and {@link XSSFCommentsShim} instead of POI's XMLBeans-dependent + * {@code StylesTable} and {@code CommentsTable}. * <p> * Adapted from Apache POI's {@code XSSFSheetXMLHandler} (Apache 2.0 license). - * The only structural change is replacing the {@code Styles}/{@code XSSFCellStyle} - * lookup with a direct call to our SAX-based styles shim for format resolution. */ class TikaSheetXMLHandler extends DefaultHandler { @@ -57,9 +53,9 @@ class TikaSheetXMLHandler extends DefaultHandler { } private final XSSFStylesShim stylesShim; - private final Comments comments; + private final XSSFCommentsShim commentsShim; private final XSSFSharedStringsShim sharedStringsShim; - private final SheetContentsHandler output; + private final TikaSheetContentsHandler output; private final DataFormatter formatter; private final boolean formulasNotResults; @@ -83,34 +79,34 @@ class TikaSheetXMLHandler extends DefaultHandler { private Queue<CellAddress> commentCellRefs; TikaSheetXMLHandler(XSSFStylesShim stylesShim, - Comments comments, + XSSFCommentsShim commentsShim, XSSFSharedStringsShim sharedStringsShim, - SheetContentsHandler sheetContentsHandler, + TikaSheetContentsHandler sheetContentsHandler, DataFormatter dataFormatter, boolean formulasNotResults) { this.stylesShim = stylesShim; - this.comments = comments; + this.commentsShim = commentsShim; this.sharedStringsShim = sharedStringsShim; this.output = sheetContentsHandler; this.formatter = dataFormatter; this.formulasNotResults = formulasNotResults; this.nextDataType = XssfDataType.NUMBER; - initComments(comments); + initComments(commentsShim); } TikaSheetXMLHandler(XSSFStylesShim stylesShim, XSSFSharedStringsShim sharedStringsShim, - SheetContentsHandler sheetContentsHandler, + TikaSheetContentsHandler sheetContentsHandler, DataFormatter dataFormatter, boolean formulasNotResults) { this(stylesShim, null, sharedStringsShim, sheetContentsHandler, dataFormatter, formulasNotResults); } - private void initComments(Comments commentsTable) { - if (commentsTable != null) { + private void initComments(XSSFCommentsShim commentsShim) { + if (commentsShim != null) { commentCellRefs = new LinkedList<>(); - for (Iterator<CellAddress> iter = commentsTable.getCellAddresses(); + for (Iterator<CellAddress> iter = commentsShim.getCellAddresses(); iter.hasNext(); ) { commentCellRefs.add(iter.next()); } @@ -333,8 +329,8 @@ class TikaSheetXMLHandler extends DefaultHandler { } checkForEmptyCellComments(EmptyCellCommentsCheckType.CELL); - XSSFComment comment = comments != null ? - comments.findCellComment(new CellAddress(cellRef)) : null; + XSSFCommentsShim.CommentData comment = commentsShim != null ? + commentsShim.findCellComment(new CellAddress(cellRef)) : null; output.cell(cellRef, thisStr, comment); } @@ -393,7 +389,7 @@ class TikaSheetXMLHandler extends DefaultHandler { } private void outputEmptyCellComment(CellAddress cellRef) { - XSSFComment comment = comments.findCellComment(cellRef); + XSSFCommentsShim.CommentData comment = commentsShim.findCellComment(cellRef); output.cell(cellRef.formatAsString(), null, comment); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java new file mode 100644 index 0000000000..f3293a0d3c --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFCommentsShim.java @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Iterator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +import org.apache.poi.ss.util.CellAddress; +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.exception.TikaException; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.utils.XMLReaderUtils; + +/** + * SAX-based shim that parses {@code xl/commentsN.xml} without XMLBeans. + * Replaces POI's {@code CommentsTable} (which depends on poi-ooxml-lite) + * for Tika's text extraction needs. + * + * <p>Only extracts what Tika needs: cell reference → (author, text) mapping.</p> + */ +class XSSFCommentsShim { + + private final Map<CellAddress, CommentData> commentsByCell; + + /** + * Simple holder for comment data needed by Tika. + */ + static class CommentData { + private final String author; + private final String text; + + CommentData(String author, String text) { + this.author = author; + this.text = text; + } + + public String getAuthor() { + return author; + } + + public String getText() { + return text; + } + } + + /** + * Parse a comments XML stream. + * + * @param is the {@code xl/commentsN.xml} stream (may be null) + * @param parseContext parse context for SAX parser configuration + */ + XSSFCommentsShim(InputStream is, ParseContext parseContext) + throws IOException, TikaException, SAXException { + commentsByCell = new LinkedHashMap<>(); + if (is != null) { + CommentsHandler handler = new CommentsHandler(); + XMLReaderUtils.parseSAX(is, handler, parseContext); + } + } + + /** + * @return the number of comments parsed + */ + int getNumberOfComments() { + return commentsByCell.size(); + } + + /** + * Find comment data for a given cell address. + * + * @return CommentData or null if no comment at that cell + */ + CommentData findCellComment(CellAddress cellAddress) { + return commentsByCell.get(cellAddress); + } + + /** + * @return iterator over all cell addresses that have comments, in document order + */ + Iterator<CellAddress> getCellAddresses() { + return commentsByCell.keySet().iterator(); + } + + /** + * SAX handler for comments XML. Structure: + * <pre> + * <comments> + * <authors> + * <author>Name</author> + * </authors> + * <commentList> + * <comment ref="A1" authorId="0"> + * <text> + * <r><t>Comment text</t></r> + * or plain <t>Comment text</t> + * </text> + * </comment> + * </commentList> + * </comments> + * </pre> + */ + private class CommentsHandler extends DefaultHandler { + + private final List<String> authors = new ArrayList<>(); + private final StringBuilder textBuffer = new StringBuilder(); + + private boolean inAuthor; + private boolean inT; + private boolean inText; + + private String currentRef; + private int currentAuthorId; + private final StringBuilder commentText = new StringBuilder(); + + @Override + public void startElement(String uri, String localName, String qName, + Attributes atts) { + if ("author".equals(localName)) { + inAuthor = true; + textBuffer.setLength(0); + } else if ("comment".equals(localName)) { + currentRef = atts.getValue("ref"); + String authorIdStr = atts.getValue("authorId"); + currentAuthorId = authorIdStr != null ? Integer.parseInt(authorIdStr) : -1; + commentText.setLength(0); + } else if ("text".equals(localName)) { + inText = true; + } else if ("t".equals(localName) && inText) { + inT = true; + textBuffer.setLength(0); + } + } + + @Override + public void endElement(String uri, String localName, String qName) { + if ("author".equals(localName)) { + inAuthor = false; + authors.add(textBuffer.toString()); + } else if ("t".equals(localName) && inT) { + inT = false; + if (commentText.length() > 0) { + commentText.append(' '); + } + commentText.append(textBuffer); + } else if ("text".equals(localName)) { + inText = false; + } else if ("comment".equals(localName)) { + if (currentRef != null) { + String author = (currentAuthorId >= 0 && currentAuthorId < authors.size()) + ? authors.get(currentAuthorId) : ""; + commentsByCell.put(new CellAddress(currentRef), + new CommentData(author, commentText.toString())); + } + currentRef = null; + } + } + + @Override + public void characters(char[] ch, int start, int length) { + if (inAuthor || inT) { + textBuffer.append(ch, start, length); + } + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java index fab2601e4a..3ebccba8e5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSSFExcelExtractorDecorator.java @@ -43,7 +43,6 @@ import org.apache.poi.ss.util.CellReference; import org.apache.poi.xssf.eventusermodel.XSSFReader; import org.apache.poi.xssf.eventusermodel.XSSFSheetXMLHandler.SheetContentsHandler; import org.apache.poi.xssf.extractor.XSSFEventBasedExcelExtractor; -import org.apache.poi.xssf.model.Comments; import org.apache.poi.xssf.usermodel.XSSFComment; import org.apache.poi.xssf.usermodel.helpers.HeaderFooterHelper; import org.xml.sax.Attributes; @@ -90,6 +89,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { "http://schemas.openxmlformats.org/officeDocument/2006/relationships"; private static final String RELATION_VML_DRAWING = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/vmlDrawing"; + private static final String RELATION_COMMENTS = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/comments"; /** * Allows access to headers/footers from raw xml strings @@ -173,8 +174,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { addDrawingHyperLinks(sheetPart); sheetParts.add(sheetPart); - Comments comments = iter.getSheetComments(); - if (comments != null && comments.getNumberOfComments() > 0) { + XSSFCommentsShim commentsShim = parseSheetComments(sheetPart); + if (commentsShim != null && commentsShim.getNumberOfComments() > 0) { metadata.set(Office.HAS_COMMENTS, true); } @@ -186,7 +187,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { xhtml.startElement("table"); xhtml.startElement("tbody"); - processSheet(sheetExtractor, comments, stylesShim, stringsShim, stream); + processSheet(sheetExtractor, commentsShim, stylesShim, stringsShim, stream); try { getThreadedComments(container, sheetPart, xhtml); } catch (InvalidFormatException | TikaException | IOException e) { @@ -822,12 +823,13 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } - public void processSheet(SheetContentsHandler sheetContentsHandler, Comments comments, + public void processSheet(TikaSheetContentsHandler sheetContentsHandler, + XSSFCommentsShim commentsShim, XSSFStylesShim stylesShim, XSSFSharedStringsShim stringsShim, InputStream sheetInputStream) throws IOException, SAXException { try { XSSFSheetInterestingPartsCapturer handler = new XSSFSheetInterestingPartsCapturer( - new TikaSheetXMLHandler(stylesShim, comments, stringsShim, + new TikaSheetXMLHandler(stylesShim, commentsShim, stringsShim, sheetContentsHandler, formatter, false)); XMLReaderUtils.parseSAX(sheetInputStream, handler, parseContext); sheetInputStream.close(); @@ -846,6 +848,32 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } + /** + * Parse the comments XML for a sheet part via SAX, avoiding XMLBeans. + */ + private XSSFCommentsShim parseSheetComments(PackagePart sheetPart) { + try { + PackageRelationshipCollection rels = + sheetPart.getRelationshipsByType(RELATION_COMMENTS); + if (rels.isEmpty()) { + return null; + } + PackageRelationship rel = rels.getRelationship(0); + PackagePartName partName = + PackagingURIHelper.createPartName(rel.getTargetURI()); + PackagePart commentsPart = rel.getPackage().getPart(partName); + if (commentsPart == null) { + return null; + } + try (InputStream is = commentsPart.getInputStream()) { + return new XSSFCommentsShim(is, parseContext); + } + } catch (InvalidFormatException | IOException | TikaException | SAXException e) { + //swallow — comments are not critical + return null; + } + } + /** * In Excel files, sheets have things embedded in them, @@ -892,7 +920,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { /** * Turns formatted sheet events into HTML */ - protected static class SheetTextAsHTML implements SheetContentsHandler { + protected static class SheetTextAsHTML + implements TikaSheetContentsHandler, SheetContentsHandler { private final boolean includeHeadersFooters; private final boolean includeMissingRows; protected List<String> headers; @@ -939,7 +968,8 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } - public void cell(String cellRef, String formattedValue, XSSFComment comment) { + public void cell(String cellRef, String formattedValue, + XSSFCommentsShim.CommentData comment) { try { // Handle any missing cells int colNum = @@ -964,7 +994,7 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { xhtml.endElement("br"); xhtml.characters(comment.getAuthor()); xhtml.characters(": "); - xhtml.characters(comment.getString().getString()); + xhtml.characters(comment.getText()); } xhtml.endElement("td"); @@ -973,6 +1003,21 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { } } + /** + * Bridge for POI's {@link SheetContentsHandler} interface, used by the + * XLSB (binary) path via {@link org.apache.poi.xssf.binary.XSSFBSheetHandler}. + */ + public void cell(String cellRef, String formattedValue, XSSFComment comment) { + XSSFCommentsShim.CommentData commentData = null; + if (comment != null) { + String text = comment.getString() != null ? + comment.getString().getString() : ""; + commentData = new XSSFCommentsShim.CommentData( + comment.getAuthor(), text); + } + cell(cellRef, formattedValue, commentData); + } + public void headerFooter(String text, boolean isHeader, String tagName) { if (!includeHeadersFooters) { return; @@ -983,6 +1028,11 @@ public class XSSFExcelExtractorDecorator extends AbstractOOXMLExtractor { footers.add(text); } } + + @Override + public void endSheet() { + // no-op — satisfies both TikaSheetContentsHandler and SheetContentsHandler + } } protected static class HeaderFooterFromString implements HeaderFooter {
