This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit d1c8f4ebd55c2fbd40504e5590703aa82928df61 Author: tallison <[email protected]> AuthorDate: Wed Mar 18 09:01:47 2026 -0400 improve sax ooxml - comments and other components - WIP --- .../parser/microsoft/ooxml/CommentAuthors.java | 54 +++++++ .../microsoft/ooxml/FormattingTagManager.java | 136 ++++++++++++++++ .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 102 +----------- .../parser/microsoft/ooxml/PlaceHolderSkipper.java | 72 +++++++++ .../ooxml/SXSLFPowerPointExtractorDecorator.java | 173 +-------------------- .../microsoft/ooxml/XSLFCommentAuthorHandler.java | 54 +++++++ .../microsoft/ooxml/XSLFCommentsHandler.java | 88 +++++++++++ 7 files changed, 413 insertions(+), 266 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentAuthors.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentAuthors.java new file mode 100644 index 0000000000..654eceee23 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentAuthors.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import java.util.HashMap; +import java.util.Map; + +/** + * Stores comment author names and initials by ID for PPTX comment handling. + */ +class CommentAuthors { + final Map<String, String> nameMap = new HashMap<>(); + final Map<String, String> initialMap = new HashMap<>(); + + void add(String id, String name, String initials) { + if (id == null) { + return; + } + if (name != null) { + nameMap.put(id, name); + } + if (initials != null) { + initialMap.put(id, initials); + } + } + + String getName(String id) { + if (id == null) { + return null; + } + return nameMap.get(id); + } + + String getInitials(String id) { + if (id == null) { + return null; + } + return initialMap.get(id); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java new file mode 100644 index 0000000000..cf8dfefce9 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import org.apache.poi.xwpf.usermodel.UnderlinePatterns; +import org.xml.sax.SAXException; + +import org.apache.tika.sax.XHTMLContentHandler; + +/** + * Manages XHTML formatting tags (b, i, u, strike) as a state machine, + * ensuring proper nesting. Tags are always ordered from outermost to innermost: + * {@code <b><i><strike><u>text</u></strike></i></b>}. + * <p> + * When a formatting change occurs, all tags that are "inside" the changing tag + * must be closed first, then the change applied, then inner tags reopened. + * This avoids generating malformed XHTML with overlapping tags. + */ +class FormattingTagManager { + + private final XHTMLContentHandler xhtml; + + private boolean isBold = false; + private boolean isItalics = false; + private boolean isStrikeThrough = false; + private boolean isUnderline = false; + + FormattingTagManager(XHTMLContentHandler xhtml) { + this.xhtml = xhtml; + } + + /** + * Reconciles the current formatting state with the given run properties, + * opening and closing XHTML tags as needed to maintain proper nesting. + */ + void applyFormatting(RunProperties runProperties) throws SAXException { + if (runProperties.isBold() != isBold) { + // Bold is outermost — close everything inside it + if (isStrikeThrough) { + xhtml.endElement("strike"); + isStrikeThrough = false; + } + if (isUnderline) { + xhtml.endElement("u"); + isUnderline = false; + } + if (isItalics) { + xhtml.endElement("i"); + isItalics = false; + } + if (runProperties.isBold()) { + xhtml.startElement("b"); + } else { + xhtml.endElement("b"); + } + isBold = runProperties.isBold(); + } + + if (runProperties.isItalics() != isItalics) { + if (isStrikeThrough) { + xhtml.endElement("strike"); + isStrikeThrough = false; + } + if (isUnderline) { + xhtml.endElement("u"); + isUnderline = false; + } + if (runProperties.isItalics()) { + xhtml.startElement("i"); + } else { + xhtml.endElement("i"); + } + isItalics = runProperties.isItalics(); + } + + if (runProperties.isStrikeThrough() != isStrikeThrough) { + if (isUnderline) { + xhtml.endElement("u"); + isUnderline = false; + } + if (runProperties.isStrikeThrough()) { + xhtml.startElement("strike"); + } else { + xhtml.endElement("strike"); + } + isStrikeThrough = runProperties.isStrikeThrough(); + } + + boolean runIsUnderlined = runProperties.getUnderline() != UnderlinePatterns.NONE; + if (runIsUnderlined != isUnderline) { + if (runIsUnderlined) { + xhtml.startElement("u"); + } else { + xhtml.endElement("u"); + } + isUnderline = runIsUnderlined; + } + } + + /** + * Closes all currently open formatting tags in proper nesting order + * (innermost first: u, strike, i, b). + */ + void closeAll() throws SAXException { + if (isStrikeThrough) { + xhtml.endElement("strike"); + isStrikeThrough = false; + } + if (isUnderline) { + xhtml.endElement("u"); + isUnderline = false; + } + if (isItalics) { + xhtml.endElement("i"); + isItalics = false; + } + if (isBold) { + xhtml.endElement("b"); + isBold = false; + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java index 95082f8840..0e33acc12c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java @@ -23,7 +23,6 @@ import java.math.BigInteger; import java.util.Date; import java.util.Map; -import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.xml.sax.SAXException; import org.xml.sax.helpers.AttributesImpl; @@ -55,10 +54,7 @@ public class OOXMLTikaBodyPartHandler private int pDepth = 0; //paragraph depth private int tableDepth = 0;//table depth private int sdtDepth = 0;// - private boolean isItalics = false; - private boolean isBold = false; - private boolean isUnderline = false; - private boolean isStrikeThrough = false; + private FormattingTagManager formattingTags; private boolean wroteHyperlinkStart = false; //TODO: fix this @@ -85,6 +81,7 @@ public class OOXMLTikaBodyPartHandler public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, Metadata metadata) { this.xhtml = xhtml; this.metadata = metadata; + this.formattingTags = new FormattingTagManager(xhtml); this.styles = XWPFStylesShim.EMPTY_STYLES; this.listManager = XWPFListManager.EMPTY_LIST; this.includeDeletedText = false; @@ -102,6 +99,7 @@ public class OOXMLTikaBodyPartHandler OfficeParserConfig parserConfig, Metadata metadata) { this.xhtml = xhtml; this.metadata = metadata; + this.formattingTags = new FormattingTagManager(xhtml); this.styles = styles; this.listManager = listManager; this.includeDeletedText = parserConfig.isIncludeDeletedContent(); @@ -121,71 +119,8 @@ public class OOXMLTikaBodyPartHandler @Override public void run(RunProperties runProperties, String contents) throws SAXException { - - // True if we are currently in the named style tag: - if (runProperties.isBold() != isBold) { - if (isStrikeThrough) { - xhtml.endElement("strike"); - isStrikeThrough = false; - } - if (isUnderline) { - xhtml.endElement("u"); - isUnderline = false; - } - if (isItalics) { - xhtml.endElement("i"); - isItalics = false; - } - if (runProperties.isBold()) { - xhtml.startElement("b"); - } else { - xhtml.endElement("b"); - } - isBold = runProperties.isBold(); - } - - if (runProperties.isItalics() != isItalics) { - if (isStrikeThrough) { - xhtml.endElement("strike"); - isStrikeThrough = false; - } - if (isUnderline) { - xhtml.endElement("u"); - isUnderline = false; - } - if (runProperties.isItalics()) { - xhtml.startElement("i"); - } else { - xhtml.endElement("i"); - } - isItalics = runProperties.isItalics(); - } - - if (runProperties.isStrikeThrough() != isStrikeThrough) { - if (isUnderline) { - xhtml.endElement("u"); - isUnderline = false; - } - if (runProperties.isStrikeThrough()) { - xhtml.startElement("strike"); - } else { - xhtml.endElement("strike"); - } - isStrikeThrough = runProperties.isStrikeThrough(); - } - - boolean runIsUnderlined = runProperties.getUnderline() != UnderlinePatterns.NONE; - if (runIsUnderlined != isUnderline) { - if (runIsUnderlined) { - xhtml.startElement("u"); - } else { - xhtml.endElement("u"); - } - isUnderline = runIsUnderlined; - } - + formattingTags.applyFormatting(runProperties); xhtml.characters(contents); - } @Override @@ -199,7 +134,7 @@ public class OOXMLTikaBodyPartHandler @Override public void hyperlinkEnd() throws SAXException { if (wroteHyperlinkStart) { - closeStyleTags(); + formattingTags.closeAll(); wroteHyperlinkStart = false; xhtml.endElement("a"); } @@ -244,7 +179,7 @@ public class OOXMLTikaBodyPartHandler @Override public void endParagraph() throws SAXException { - closeStyleTags(); + formattingTags.closeAll(); if (pDepth == 1 && tableDepth == 0) { xhtml.endElement(paragraphTag); } else if (tableCellDepth > 0 && pWithinCell > 0) { @@ -326,7 +261,7 @@ public class OOXMLTikaBodyPartHandler @Override public void startSDT() throws SAXException { - closeStyleTags(); + formattingTags.closeAll(); sdtDepth++; } @@ -499,29 +434,6 @@ public class OOXMLTikaBodyPartHandler //no-op } - private void closeStyleTags() throws SAXException { - - if (isStrikeThrough) { - xhtml.endElement("strike"); - isStrikeThrough = false; - } - - if (isUnderline) { - xhtml.endElement("u"); - isUnderline = false; - } - - if (isItalics) { - xhtml.endElement("i"); - isItalics = false; - } - - if (isBold) { - xhtml.endElement("b"); - isBold = false; - } - } - private void writeParagraphNumber(int numId, int ilvl, XWPFListManager listManager, XHTMLContentHandler xhtml) throws SAXException { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/PlaceHolderSkipper.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/PlaceHolderSkipper.java new file mode 100644 index 0000000000..bad784967b --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/PlaceHolderSkipper.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import org.xml.sax.Attributes; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * SAX handler wrapper that skips content inside placeholder ({@code <ph>}) + * elements in PPTX slide masters and layouts. Delegates all non-placeholder + * events to the wrapped handler. + */ +class PlaceHolderSkipper extends DefaultHandler { + + private final ContentHandler wrappedHandler; + private boolean inPH = false; + + PlaceHolderSkipper(ContentHandler wrappedHandler) { + this.wrappedHandler = wrappedHandler; + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + if ("ph".equals(localName)) { + inPH = true; + } + if (!inPH) { + wrappedHandler.startElement(uri, localName, qName, atts); + } + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if (!inPH) { + wrappedHandler.endElement(uri, localName, qName); + } + if ("sp".equals(localName)) { + inPH = false; + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + if (!inPH) { + wrappedHandler.characters(ch, start, length); + } + } + + @Override + public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { + if (!inPH) { + wrappedHandler.characters(ch, start, length); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java index e8c112241b..7697452a41 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java @@ -33,10 +33,7 @@ import org.apache.poi.openxml4j.opc.PackageRelationshipCollection; import org.apache.poi.openxml4j.opc.PackagingURIHelper; import org.apache.poi.openxml4j.opc.TargetMode; import org.apache.poi.xslf.usermodel.XSLFRelation; -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; @@ -165,7 +162,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { } try (InputStream stream = commentAuthorsPart.getInputStream()) { XMLReaderUtils.parseSAX(stream, - new XSLFCommentAuthorHandler(), context); + new XSLFCommentAuthorHandler(commentAuthors), context); } catch (TikaException | SAXException | IOException e) { metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING, @@ -232,7 +229,7 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { } } handleGeneralTextContainingPart(XSLFRelation.COMMENTS.getRelation(), null, slidePart, - metadata, new XSLFCommentsHandler(xhtml)); + metadata, new XSLFCommentsHandler(xhtml, commentAuthors)); handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA, "diagram-data", slidePart, metadata, @@ -339,170 +336,4 @@ public class SXSLFPowerPointExtractorDecorator extends AbstractOOXMLExtractor { } - private static class PlaceHolderSkipper extends DefaultHandler { - - private final ContentHandler wrappedHandler; - boolean inPH = false; - - PlaceHolderSkipper(ContentHandler wrappedHandler) { - this.wrappedHandler = wrappedHandler; - } - - @Override - public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { - if ("ph".equals(localName)) { - inPH = true; - } - if (!inPH) { - wrappedHandler.startElement(uri, localName, qName, atts); - } - } - - @Override - public void endElement(String uri, String localName, String qName) throws SAXException { - - if (!inPH) { - wrappedHandler.endElement(uri, localName, qName); - } - if ("sp".equals(localName)) { - inPH = false; - } - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - if (!inPH) { - wrappedHandler.characters(ch, start, length); - } - } - - @Override - public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException { - if (!inPH) { - wrappedHandler.characters(ch, start, length); - } - } - - - } - - private class XSLFCommentsHandler extends DefaultHandler { - - private String commentAuthorId = null; - private StringBuilder commentBuffer = new StringBuilder(); - private XHTMLContentHandler xhtml; - - XSLFCommentsHandler(XHTMLContentHandler xhtml) { - this.xhtml = xhtml; - } - - @Override - public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { - if ("cm".equals(localName)) { - commentAuthorId = atts.getValue("", "authorId"); - //get date (dt)? - } - } - - @Override - public void characters(char[] ch, int start, int length) throws SAXException { - //TODO: require that we're in <p:text>? - commentBuffer.append(ch, start, length); - } - - @Override - public void endElement(String uri, String localName, String qName) throws SAXException { - if ("cm".equals(localName)) { - - xhtml.startElement("p", "class", "slide-comment"); - - String authorString = commentAuthors.getName(commentAuthorId); - String authorInitials = commentAuthors.getInitials(commentAuthorId); - if (authorString != null || authorInitials != null) { - xhtml.startElement("b"); - boolean authorExists = false; - if (authorString != null) { - xhtml.characters(authorString); - authorExists = true; - } - if (authorExists && authorInitials != null) { - xhtml.characters(" ("); - } - if (authorInitials != null) { - xhtml.characters(authorInitials); - } - if (authorExists && authorInitials != null) { - xhtml.characters(")"); - } - xhtml.endElement("b"); - } - xhtml.characters(commentBuffer.toString()); - xhtml.endElement("p"); - - commentBuffer.setLength(0); - commentAuthorId = null; - } - } - } - - private class XSLFCommentAuthorHandler extends DefaultHandler { - String id = null; - String name = null; - String initials = null; - - @Override - public void startElement(String uri, String localName, String qName, Attributes atts) - throws SAXException { - if ("cmAuthor".equals(localName)) { - for (int i = 0; i < atts.getLength(); i++) { - if ("id".equals(atts.getLocalName(i))) { - id = atts.getValue(i); - } else if ("name".equals(atts.getLocalName(i))) { - name = atts.getValue(i); - } else if ("initials".equals(atts.getLocalName(i))) { - initials = atts.getValue(i); - } - } - commentAuthors.add(id, name, initials); - //clear out - id = null; - name = null; - initials = null; - } - } - - } - - private static class CommentAuthors { - Map<String, String> nameMap = new HashMap<>(); - Map<String, String> initialMap = new HashMap<>(); - - void add(String id, String name, String initials) { - if (id == null) { - return; - } - if (name != null) { - nameMap.put(id, name); - } - if (initials != null) { - initialMap.put(id, initials); - } - } - - String getName(String id) { - if (id == null) { - return null; - } - return nameMap.get(id); - } - - String getInitials(String id) { - if (id == null) { - return null; - } - return initialMap.get(id); - } - } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentAuthorHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentAuthorHandler.java new file mode 100644 index 0000000000..5b0d055472 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentAuthorHandler.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +/** + * SAX handler that parses PPTX commentAuthors.xml and populates + * a {@link CommentAuthors} map with author names and initials by ID. + */ +class XSLFCommentAuthorHandler extends DefaultHandler { + + private final CommentAuthors commentAuthors; + + XSLFCommentAuthorHandler(CommentAuthors commentAuthors) { + this.commentAuthors = commentAuthors; + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + if ("cmAuthor".equals(localName)) { + String id = null; + String name = null; + String initials = null; + for (int i = 0; i < atts.getLength(); i++) { + if ("id".equals(atts.getLocalName(i))) { + id = atts.getValue(i); + } else if ("name".equals(atts.getLocalName(i))) { + name = atts.getValue(i); + } else if ("initials".equals(atts.getLocalName(i))) { + initials = atts.getValue(i); + } + } + commentAuthors.add(id, name, initials); + } + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentsHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentsHandler.java new file mode 100644 index 0000000000..51a3d4c862 --- /dev/null +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentsHandler.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.parser.microsoft.ooxml; + +import org.xml.sax.Attributes; +import org.xml.sax.SAXException; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.sax.XHTMLContentHandler; + +/** + * SAX handler that parses PPTX comments XML and emits comment content + * with author attribution to an XHTML stream. + */ +class XSLFCommentsHandler extends DefaultHandler { + + private final XHTMLContentHandler xhtml; + private final CommentAuthors commentAuthors; + + private String commentAuthorId = null; + private final StringBuilder commentBuffer = new StringBuilder(); + + XSLFCommentsHandler(XHTMLContentHandler xhtml, CommentAuthors commentAuthors) { + this.xhtml = xhtml; + this.commentAuthors = commentAuthors; + } + + @Override + public void startElement(String uri, String localName, String qName, Attributes atts) + throws SAXException { + if ("cm".equals(localName)) { + commentAuthorId = atts.getValue("", "authorId"); + } + } + + @Override + public void characters(char[] ch, int start, int length) throws SAXException { + //TODO: require that we're in <p:text>? + commentBuffer.append(ch, start, length); + } + + @Override + public void endElement(String uri, String localName, String qName) throws SAXException { + if ("cm".equals(localName)) { + xhtml.startElement("p", "class", "slide-comment"); + + String authorString = commentAuthors.getName(commentAuthorId); + String authorInitials = commentAuthors.getInitials(commentAuthorId); + if (authorString != null || authorInitials != null) { + xhtml.startElement("b"); + boolean authorExists = false; + if (authorString != null) { + xhtml.characters(authorString); + authorExists = true; + } + if (authorExists && authorInitials != null) { + xhtml.characters(" ("); + } + if (authorInitials != null) { + xhtml.characters(authorInitials); + } + if (authorExists && authorInitials != null) { + xhtml.characters(")"); + } + xhtml.endElement("b"); + } + xhtml.characters(commentBuffer.toString()); + xhtml.endElement("p"); + + commentBuffer.setLength(0); + commentAuthorId = null; + } + } +}
