This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4692-improve-ooxml-sax-parsers
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d1c8f4ebd55c2fbd40504e5590703aa82928df61
Author: tallison <[email protected]>
AuthorDate: Wed Mar 18 09:01:47 2026 -0400

    improve sax ooxml - comments and other components - WIP
---
 .../parser/microsoft/ooxml/CommentAuthors.java     |  54 +++++++
 .../microsoft/ooxml/FormattingTagManager.java      | 136 ++++++++++++++++
 .../microsoft/ooxml/OOXMLTikaBodyPartHandler.java  | 102 +-----------
 .../parser/microsoft/ooxml/PlaceHolderSkipper.java |  72 +++++++++
 .../ooxml/SXSLFPowerPointExtractorDecorator.java   | 173 +--------------------
 .../microsoft/ooxml/XSLFCommentAuthorHandler.java  |  54 +++++++
 .../microsoft/ooxml/XSLFCommentsHandler.java       |  88 +++++++++++
 7 files changed, 413 insertions(+), 266 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentAuthors.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentAuthors.java
new file mode 100644
index 0000000000..654eceee23
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/CommentAuthors.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Stores comment author names and initials by ID for PPTX comment handling.
+ */
+class CommentAuthors {
+    final Map<String, String> nameMap = new HashMap<>();
+    final Map<String, String> initialMap = new HashMap<>();
+
+    void add(String id, String name, String initials) {
+        if (id == null) {
+            return;
+        }
+        if (name != null) {
+            nameMap.put(id, name);
+        }
+        if (initials != null) {
+            initialMap.put(id, initials);
+        }
+    }
+
+    String getName(String id) {
+        if (id == null) {
+            return null;
+        }
+        return nameMap.get(id);
+    }
+
+    String getInitials(String id) {
+        if (id == null) {
+            return null;
+        }
+        return initialMap.get(id);
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
new file mode 100644
index 0000000000..cf8dfefce9
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
+import org.xml.sax.SAXException;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * Manages XHTML formatting tags (b, i, u, strike) as a state machine,
+ * ensuring proper nesting. Tags are always ordered from outermost to 
innermost:
+ * {@code <b><i><strike><u>text</u></strike></i></b>}.
+ * <p>
+ * When a formatting change occurs, all tags that are "inside" the changing tag
+ * must be closed first, then the change applied, then inner tags reopened.
+ * This avoids generating malformed XHTML with overlapping tags.
+ */
+class FormattingTagManager {
+
+    private final XHTMLContentHandler xhtml;
+
+    private boolean isBold = false;
+    private boolean isItalics = false;
+    private boolean isStrikeThrough = false;
+    private boolean isUnderline = false;
+
+    FormattingTagManager(XHTMLContentHandler xhtml) {
+        this.xhtml = xhtml;
+    }
+
+    /**
+     * Reconciles the current formatting state with the given run properties,
+     * opening and closing XHTML tags as needed to maintain proper nesting.
+     */
+    void applyFormatting(RunProperties runProperties) throws SAXException {
+        if (runProperties.isBold() != isBold) {
+            // Bold is outermost — close everything inside it
+            if (isStrikeThrough) {
+                xhtml.endElement("strike");
+                isStrikeThrough = false;
+            }
+            if (isUnderline) {
+                xhtml.endElement("u");
+                isUnderline = false;
+            }
+            if (isItalics) {
+                xhtml.endElement("i");
+                isItalics = false;
+            }
+            if (runProperties.isBold()) {
+                xhtml.startElement("b");
+            } else {
+                xhtml.endElement("b");
+            }
+            isBold = runProperties.isBold();
+        }
+
+        if (runProperties.isItalics() != isItalics) {
+            if (isStrikeThrough) {
+                xhtml.endElement("strike");
+                isStrikeThrough = false;
+            }
+            if (isUnderline) {
+                xhtml.endElement("u");
+                isUnderline = false;
+            }
+            if (runProperties.isItalics()) {
+                xhtml.startElement("i");
+            } else {
+                xhtml.endElement("i");
+            }
+            isItalics = runProperties.isItalics();
+        }
+
+        if (runProperties.isStrikeThrough() != isStrikeThrough) {
+            if (isUnderline) {
+                xhtml.endElement("u");
+                isUnderline = false;
+            }
+            if (runProperties.isStrikeThrough()) {
+                xhtml.startElement("strike");
+            } else {
+                xhtml.endElement("strike");
+            }
+            isStrikeThrough = runProperties.isStrikeThrough();
+        }
+
+        boolean runIsUnderlined = runProperties.getUnderline() != 
UnderlinePatterns.NONE;
+        if (runIsUnderlined != isUnderline) {
+            if (runIsUnderlined) {
+                xhtml.startElement("u");
+            } else {
+                xhtml.endElement("u");
+            }
+            isUnderline = runIsUnderlined;
+        }
+    }
+
+    /**
+     * Closes all currently open formatting tags in proper nesting order
+     * (innermost first: u, strike, i, b).
+     */
+    void closeAll() throws SAXException {
+        if (isStrikeThrough) {
+            xhtml.endElement("strike");
+            isStrikeThrough = false;
+        }
+        if (isUnderline) {
+            xhtml.endElement("u");
+            isUnderline = false;
+        }
+        if (isItalics) {
+            xhtml.endElement("i");
+            isItalics = false;
+        }
+        if (isBold) {
+            xhtml.endElement("b");
+            isBold = false;
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index 95082f8840..0e33acc12c 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -23,7 +23,6 @@ import java.math.BigInteger;
 import java.util.Date;
 import java.util.Map;
 
-import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.AttributesImpl;
 
@@ -55,10 +54,7 @@ public class OOXMLTikaBodyPartHandler
     private int pDepth = 0; //paragraph depth
     private int tableDepth = 0;//table depth
     private int sdtDepth = 0;//
-    private boolean isItalics = false;
-    private boolean isBold = false;
-    private boolean isUnderline = false;
-    private boolean isStrikeThrough = false;
+    private FormattingTagManager formattingTags;
     private boolean wroteHyperlinkStart = false;
 
     //TODO: fix this
@@ -85,6 +81,7 @@ public class OOXMLTikaBodyPartHandler
     public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, Metadata 
metadata) {
         this.xhtml = xhtml;
         this.metadata = metadata;
+        this.formattingTags = new FormattingTagManager(xhtml);
         this.styles = XWPFStylesShim.EMPTY_STYLES;
         this.listManager = XWPFListManager.EMPTY_LIST;
         this.includeDeletedText = false;
@@ -102,6 +99,7 @@ public class OOXMLTikaBodyPartHandler
                                     OfficeParserConfig parserConfig, Metadata 
metadata) {
         this.xhtml = xhtml;
         this.metadata = metadata;
+        this.formattingTags = new FormattingTagManager(xhtml);
         this.styles = styles;
         this.listManager = listManager;
         this.includeDeletedText = parserConfig.isIncludeDeletedContent();
@@ -121,71 +119,8 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void run(RunProperties runProperties, String contents) throws 
SAXException {
-
-        // True if we are currently in the named style tag:
-        if (runProperties.isBold() != isBold) {
-            if (isStrikeThrough) {
-                xhtml.endElement("strike");
-                isStrikeThrough = false;
-            }
-            if (isUnderline) {
-                xhtml.endElement("u");
-                isUnderline = false;
-            }
-            if (isItalics) {
-                xhtml.endElement("i");
-                isItalics = false;
-            }
-            if (runProperties.isBold()) {
-                xhtml.startElement("b");
-            } else {
-                xhtml.endElement("b");
-            }
-            isBold = runProperties.isBold();
-        }
-
-        if (runProperties.isItalics() != isItalics) {
-            if (isStrikeThrough) {
-                xhtml.endElement("strike");
-                isStrikeThrough = false;
-            }
-            if (isUnderline) {
-                xhtml.endElement("u");
-                isUnderline = false;
-            }
-            if (runProperties.isItalics()) {
-                xhtml.startElement("i");
-            } else {
-                xhtml.endElement("i");
-            }
-            isItalics = runProperties.isItalics();
-        }
-
-        if (runProperties.isStrikeThrough() != isStrikeThrough) {
-            if (isUnderline) {
-                xhtml.endElement("u");
-                isUnderline = false;
-            }
-            if (runProperties.isStrikeThrough()) {
-                xhtml.startElement("strike");
-            } else {
-                xhtml.endElement("strike");
-            }
-            isStrikeThrough = runProperties.isStrikeThrough();
-        }
-
-        boolean runIsUnderlined = runProperties.getUnderline() != 
UnderlinePatterns.NONE;
-        if (runIsUnderlined != isUnderline) {
-            if (runIsUnderlined) {
-                xhtml.startElement("u");
-            } else {
-                xhtml.endElement("u");
-            }
-            isUnderline = runIsUnderlined;
-        }
-
+        formattingTags.applyFormatting(runProperties);
         xhtml.characters(contents);
-
     }
 
     @Override
@@ -199,7 +134,7 @@ public class OOXMLTikaBodyPartHandler
     @Override
     public void hyperlinkEnd() throws SAXException {
         if (wroteHyperlinkStart) {
-            closeStyleTags();
+            formattingTags.closeAll();
             wroteHyperlinkStart = false;
             xhtml.endElement("a");
         }
@@ -244,7 +179,7 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void endParagraph() throws SAXException {
-        closeStyleTags();
+        formattingTags.closeAll();
         if (pDepth == 1 && tableDepth == 0) {
             xhtml.endElement(paragraphTag);
         } else if (tableCellDepth > 0 && pWithinCell > 0) {
@@ -326,7 +261,7 @@ public class OOXMLTikaBodyPartHandler
 
     @Override
     public void startSDT() throws SAXException {
-        closeStyleTags();
+        formattingTags.closeAll();
         sdtDepth++;
     }
 
@@ -499,29 +434,6 @@ public class OOXMLTikaBodyPartHandler
         //no-op
     }
 
-    private void closeStyleTags() throws SAXException {
-
-        if (isStrikeThrough) {
-            xhtml.endElement("strike");
-            isStrikeThrough = false;
-        }
-
-        if (isUnderline) {
-            xhtml.endElement("u");
-            isUnderline = false;
-        }
-
-        if (isItalics) {
-            xhtml.endElement("i");
-            isItalics = false;
-        }
-
-        if (isBold) {
-            xhtml.endElement("b");
-            isBold = false;
-        }
-    }
-
     private void writeParagraphNumber(int numId, int ilvl, XWPFListManager 
listManager,
                                       XHTMLContentHandler xhtml) throws 
SAXException {
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/PlaceHolderSkipper.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/PlaceHolderSkipper.java
new file mode 100644
index 0000000000..bad784967b
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/PlaceHolderSkipper.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler wrapper that skips content inside placeholder ({@code <ph>})
+ * elements in PPTX slide masters and layouts. Delegates all non-placeholder
+ * events to the wrapped handler.
+ */
+class PlaceHolderSkipper extends DefaultHandler {
+
+    private final ContentHandler wrappedHandler;
+    private boolean inPH = false;
+
+    PlaceHolderSkipper(ContentHandler wrappedHandler) {
+        this.wrappedHandler = wrappedHandler;
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, 
Attributes atts)
+            throws SAXException {
+        if ("ph".equals(localName)) {
+            inPH = true;
+        }
+        if (!inPH) {
+            wrappedHandler.startElement(uri, localName, qName, atts);
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws 
SAXException {
+        if (!inPH) {
+            wrappedHandler.endElement(uri, localName, qName);
+        }
+        if ("sp".equals(localName)) {
+            inPH = false;
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws 
SAXException {
+        if (!inPH) {
+            wrappedHandler.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws 
SAXException {
+        if (!inPH) {
+            wrappedHandler.characters(ch, start, length);
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index e8c112241b..7697452a41 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -33,10 +33,7 @@ import 
org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
 import org.apache.poi.openxml4j.opc.PackagingURIHelper;
 import org.apache.poi.openxml4j.opc.TargetMode;
 import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
@@ -165,7 +162,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
             }
             try (InputStream stream = commentAuthorsPart.getInputStream()) {
                 XMLReaderUtils.parseSAX(stream,
-                        new XSLFCommentAuthorHandler(), context);
+                        new XSLFCommentAuthorHandler(commentAuthors), context);
 
             } catch (TikaException | SAXException | IOException e) {
                 metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
@@ -232,7 +229,7 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
             }
         }
         handleGeneralTextContainingPart(XSLFRelation.COMMENTS.getRelation(), 
null, slidePart,
-                metadata, new XSLFCommentsHandler(xhtml));
+                metadata, new XSLFCommentsHandler(xhtml, commentAuthors));
 
         
handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
                 "diagram-data", slidePart, metadata,
@@ -339,170 +336,4 @@ public class SXSLFPowerPointExtractorDecorator extends 
AbstractOOXMLExtractor {
 
     }
 
-    private static class PlaceHolderSkipper extends DefaultHandler {
-
-        private final ContentHandler wrappedHandler;
-        boolean inPH = false;
-
-        PlaceHolderSkipper(ContentHandler wrappedHandler) {
-            this.wrappedHandler = wrappedHandler;
-        }
-
-        @Override
-        public void startElement(String uri, String localName, String qName, 
Attributes atts)
-                throws SAXException {
-            if ("ph".equals(localName)) {
-                inPH = true;
-            }
-            if (!inPH) {
-                wrappedHandler.startElement(uri, localName, qName, atts);
-            }
-        }
-
-        @Override
-        public void endElement(String uri, String localName, String qName) 
throws SAXException {
-
-            if (!inPH) {
-                wrappedHandler.endElement(uri, localName, qName);
-            }
-            if ("sp".equals(localName)) {
-                inPH = false;
-            }
-        }
-
-        @Override
-        public void characters(char[] ch, int start, int length) throws 
SAXException {
-            if (!inPH) {
-                wrappedHandler.characters(ch, start, length);
-            }
-        }
-
-        @Override
-        public void ignorableWhitespace(char[] ch, int start, int length) 
throws SAXException {
-            if (!inPH) {
-                wrappedHandler.characters(ch, start, length);
-            }
-        }
-
-
-    }
-
-    private class XSLFCommentsHandler extends DefaultHandler {
-
-        private String commentAuthorId = null;
-        private StringBuilder commentBuffer = new StringBuilder();
-        private XHTMLContentHandler xhtml;
-
-        XSLFCommentsHandler(XHTMLContentHandler xhtml) {
-            this.xhtml = xhtml;
-        }
-
-        @Override
-        public void startElement(String uri, String localName, String qName, 
Attributes atts)
-                throws SAXException {
-            if ("cm".equals(localName)) {
-                commentAuthorId = atts.getValue("", "authorId");
-                //get date (dt)?
-            }
-        }
-
-        @Override
-        public void characters(char[] ch, int start, int length) throws 
SAXException {
-            //TODO: require that we're in <p:text>?
-            commentBuffer.append(ch, start, length);
-        }
-
-        @Override
-        public void endElement(String uri, String localName, String qName) 
throws SAXException {
-            if ("cm".equals(localName)) {
-
-                xhtml.startElement("p", "class", "slide-comment");
-
-                String authorString = commentAuthors.getName(commentAuthorId);
-                String authorInitials = 
commentAuthors.getInitials(commentAuthorId);
-                if (authorString != null || authorInitials != null) {
-                    xhtml.startElement("b");
-                    boolean authorExists = false;
-                    if (authorString != null) {
-                        xhtml.characters(authorString);
-                        authorExists = true;
-                    }
-                    if (authorExists && authorInitials != null) {
-                        xhtml.characters(" (");
-                    }
-                    if (authorInitials != null) {
-                        xhtml.characters(authorInitials);
-                    }
-                    if (authorExists && authorInitials != null) {
-                        xhtml.characters(")");
-                    }
-                    xhtml.endElement("b");
-                }
-                xhtml.characters(commentBuffer.toString());
-                xhtml.endElement("p");
-
-                commentBuffer.setLength(0);
-                commentAuthorId = null;
-            }
-        }
-    }
-
-    private class XSLFCommentAuthorHandler extends DefaultHandler {
-        String id = null;
-        String name = null;
-        String initials = null;
-
-        @Override
-        public void startElement(String uri, String localName, String qName, 
Attributes atts)
-                throws SAXException {
-            if ("cmAuthor".equals(localName)) {
-                for (int i = 0; i < atts.getLength(); i++) {
-                    if ("id".equals(atts.getLocalName(i))) {
-                        id = atts.getValue(i);
-                    } else if ("name".equals(atts.getLocalName(i))) {
-                        name = atts.getValue(i);
-                    } else if ("initials".equals(atts.getLocalName(i))) {
-                        initials = atts.getValue(i);
-                    }
-                }
-                commentAuthors.add(id, name, initials);
-                //clear out
-                id = null;
-                name = null;
-                initials = null;
-            }
-        }
-
-    }
-
-    private static class CommentAuthors {
-        Map<String, String> nameMap = new HashMap<>();
-        Map<String, String> initialMap = new HashMap<>();
-
-        void add(String id, String name, String initials) {
-            if (id == null) {
-                return;
-            }
-            if (name != null) {
-                nameMap.put(id, name);
-            }
-            if (initials != null) {
-                initialMap.put(id, initials);
-            }
-        }
-
-        String getName(String id) {
-            if (id == null) {
-                return null;
-            }
-            return nameMap.get(id);
-        }
-
-        String getInitials(String id) {
-            if (id == null) {
-                return null;
-            }
-            return initialMap.get(id);
-        }
-    }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentAuthorHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentAuthorHandler.java
new file mode 100644
index 0000000000..5b0d055472
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentAuthorHandler.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * SAX handler that parses PPTX commentAuthors.xml and populates
+ * a {@link CommentAuthors} map with author names and initials by ID.
+ */
+class XSLFCommentAuthorHandler extends DefaultHandler {
+
+    private final CommentAuthors commentAuthors;
+
+    XSLFCommentAuthorHandler(CommentAuthors commentAuthors) {
+        this.commentAuthors = commentAuthors;
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, 
Attributes atts)
+            throws SAXException {
+        if ("cmAuthor".equals(localName)) {
+            String id = null;
+            String name = null;
+            String initials = null;
+            for (int i = 0; i < atts.getLength(); i++) {
+                if ("id".equals(atts.getLocalName(i))) {
+                    id = atts.getValue(i);
+                } else if ("name".equals(atts.getLocalName(i))) {
+                    name = atts.getValue(i);
+                } else if ("initials".equals(atts.getLocalName(i))) {
+                    initials = atts.getValue(i);
+                }
+            }
+            commentAuthors.add(id, name, initials);
+        }
+    }
+}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentsHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentsHandler.java
new file mode 100644
index 0000000000..51a3d4c862
--- /dev/null
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XSLFCommentsHandler.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.microsoft.ooxml;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.sax.XHTMLContentHandler;
+
+/**
+ * SAX handler that parses PPTX comments XML and emits comment content
+ * with author attribution to an XHTML stream.
+ */
+class XSLFCommentsHandler extends DefaultHandler {
+
+    private final XHTMLContentHandler xhtml;
+    private final CommentAuthors commentAuthors;
+
+    private String commentAuthorId = null;
+    private final StringBuilder commentBuffer = new StringBuilder();
+
+    XSLFCommentsHandler(XHTMLContentHandler xhtml, CommentAuthors 
commentAuthors) {
+        this.xhtml = xhtml;
+        this.commentAuthors = commentAuthors;
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String qName, 
Attributes atts)
+            throws SAXException {
+        if ("cm".equals(localName)) {
+            commentAuthorId = atts.getValue("", "authorId");
+        }
+    }
+
+    @Override
+    public void characters(char[] ch, int start, int length) throws 
SAXException {
+        //TODO: require that we're in <p:text>?
+        commentBuffer.append(ch, start, length);
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String qName) throws 
SAXException {
+        if ("cm".equals(localName)) {
+            xhtml.startElement("p", "class", "slide-comment");
+
+            String authorString = commentAuthors.getName(commentAuthorId);
+            String authorInitials = 
commentAuthors.getInitials(commentAuthorId);
+            if (authorString != null || authorInitials != null) {
+                xhtml.startElement("b");
+                boolean authorExists = false;
+                if (authorString != null) {
+                    xhtml.characters(authorString);
+                    authorExists = true;
+                }
+                if (authorExists && authorInitials != null) {
+                    xhtml.characters(" (");
+                }
+                if (authorInitials != null) {
+                    xhtml.characters(authorInitials);
+                }
+                if (authorExists && authorInitials != null) {
+                    xhtml.characters(")");
+                }
+                xhtml.endElement("b");
+            }
+            xhtml.characters(commentBuffer.toString());
+            xhtml.endElement("p");
+
+            commentBuffer.setLength(0);
+            commentAuthorId = null;
+        }
+    }
+}

Reply via email to