This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 19c3b8e682 TIKA-4692-improve-ooxml-sax-parsers (#2731)
19c3b8e682 is described below
commit 19c3b8e682399ce6cde2e6513a6ac31f32a6f689
Author: Tim Allison <[email protected]>
AuthorDate: Fri Apr 3 18:28:40 2026 -0400
TIKA-4692-improve-ooxml-sax-parsers (#2731)
---
.../org/apache/tika/sax/XHTMLContentHandler.java | 1 +
.../tika/parser/microsoft/OfficeParserConfig.java | 18 ++
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 26 ++-
...neTagManager.java => FormattingTagManager.java} | 140 +++++++-------
.../microsoft/ooxml/OOXMLPartContentCollector.java | 48 ++---
.../microsoft/ooxml/OOXMLTikaBodyPartHandler.java | 179 ++++-------------
.../ooxml/OOXMLWordAndPowerPointTextHandler.java | 97 +++++-----
.../tika/parser/microsoft/ooxml/RunProperties.java | 11 ++
.../ooxml/SXSLFPowerPointExtractorDecorator.java | 213 ++++++---------------
.../ooxml/SXWPFWordExtractorDecorator.java | 89 ++++-----
.../microsoft/ooxml/XWPFBodyContentsHandler.java | 11 --
.../ooxml/xwpf/XWPFEventBasedWordExtractor.java | 53 +++--
.../parser/microsoft/ooxml/OOXMLDocxSAXTest.java | 31 ++-
13 files changed, 393 insertions(+), 524 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
index 3fd7766d03..bae8c4b885 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/XHTMLContentHandler.java
@@ -94,6 +94,7 @@ public class XHTMLContentHandler extends SafeContentHandler {
private boolean headStarted = false;
private boolean headEnded = false;
private boolean useFrameset = false;
+
public XHTMLContentHandler(ContentHandler handler, Metadata metadata) {
this(handler, metadata, null);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
index c8886e5fdf..9f21b0b798 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParserConfig.java
@@ -39,6 +39,7 @@ public class OfficeParserConfig implements Serializable {
private boolean writeSelectHeadersInBody = false;
+ private boolean includeGlossary = true;
private String dateOverrideFormat = null;
private int maxOverride = 0;//ignore
@@ -213,6 +214,23 @@ public class OfficeParserConfig implements Serializable {
this.concatenatePhoneticRuns = concatenatePhoneticRuns;
}
+ public boolean isIncludeGlossary() {
+ return includeGlossary;
+ }
+
+ /**
+ * Whether or not to include the glossary (building blocks / AutoText) document
+ * from docx files. The glossary can contain template content such as form field
+ * placeholders that may duplicate content already present in the main body.
+ * <p>
+ * Default: <code>true</code>
+ *
+ * @param includeGlossary whether or not to include glossary content
+ */
+ public void setIncludeGlossary(boolean includeGlossary) {
+ this.includeGlossary = includeGlossary;
+ }
+
public boolean isIncludeMissingRows() {
return includeMissingRows;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index dd7c5eafaf..6beef7c1c4 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -623,6 +623,27 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
* @param parentPart
* @param contentHandler
*/
+ /**
+ * Safely resolves a related part, returning {@code null} if the part cannot be found
+ * instead of throwing {@link IllegalArgumentException}.
+ */
+ public static PackagePart safeGetRelatedPart(PackagePart source,
+ PackageRelationship relationship)
+ throws InvalidFormatException {
+ if (source == null || relationship == null) {
+ return null;
+ }
+ if (!source.isRelationshipExists(relationship)) {
+ return null;
+ }
+ try {
+ return source.getRelatedPart(relationship);
+ } catch (IllegalArgumentException e) {
+ // Relationship exists but target part is missing from the package
+ return null;
+ }
+ }
+
void handleGeneralTextContainingPart(String contentType, String
xhtmlClassLabel,
PackagePart parentPart, Metadata
parentMetadata,
ContentHandler contentHandler) throws
SAXException {
@@ -646,7 +667,10 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
relatedPartPRC.getRelationship(i);
try {
PackagePart relatedPartPart =
-
parentPart.getRelatedPart(relatedPartPackageRelationship);
+ safeGetRelatedPart(parentPart,
relatedPartPackageRelationship);
+ if (relatedPartPart == null) {
+ continue;
+ }
try (InputStream stream =
relatedPartPart.getInputStream()) {
XMLReaderUtils.parseSAX(stream,
new EmbeddedContentHandler(contentHandler),
context);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
similarity index 58%
rename from
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java
rename to
tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
index 45eee33b57..0545cd0037 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/InlineTagManager.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/FormattingTagManager.java
@@ -16,94 +16,112 @@
*/
package org.apache.tika.parser.microsoft.ooxml;
+import java.util.Objects;
+
import org.apache.poi.xwpf.usermodel.UnderlinePatterns;
import org.xml.sax.SAXException;
import org.apache.tika.sax.XHTMLContentHandler;
/**
- * Manages all inline XHTML elements (hyperlinks and formatting tags) as a
- * unified state machine, ensuring proper nesting. The nesting order from
- * outermost to innermost is:
- * <pre>
- * {@code <a> <b> <i> <s> <u> text </u> </s> </i> </b> </a>}
- * </pre>
- * <p>
- * When a change occurs to an outer element, all inner elements are closed
- * first, the change is applied, then inner elements are reopened as needed.
- * This prevents generating malformed XHTML with overlapping or unbalanced
tags.
+ * Single owner of all run-scoped XHTML wrapper tags, ensuring proper nesting.
+ * Nesting order from outermost to innermost:
+ * {@code <a href="..."><b><i><s><u>text</u></s></i></b></a>}.
* <p>
- * This class replaces the separate {@code FormattingTagManager} and the
- * {@code wroteHyperlinkStart} boolean that were previously tracked
independently
- * in {@link OOXMLTikaBodyPartHandler}.
+ * Hyperlinks come from two OOXML sources with different lifecycles:
+ * <ul>
+ * <li><b>Wrapper hyperlinks</b> (DOCX {@code <w:hyperlink>}, field-code HYPERLINK):
+ * opened/closed explicitly via {@link #openHyperlink}/{@link #closeHyperlink},
+ * and may span multiple runs.</li>
+ * <li><b>Run-property hyperlinks</b> (PPTX {@code <a:hlinkClick>}):
+ * set on {@link RunProperties#setHlinkClickUrl}, managed automatically
+ * by {@link #applyFormatting} per-run.</li>
+ * </ul>
+ * Both emit the same {@code <a href="...">} XHTML. Wrapper hyperlinks take
+ * precedence — run properties cannot override an active wrapper.
*/
-class InlineTagManager {
+class FormattingTagManager {
private final XHTMLContentHandler xhtml;
- private boolean hyperlinkOpen = false;
+ // Outermost to innermost: hyperlink > bold > italic > strike > underline
+ private String currentHyperlink = null;
+ private boolean wrapperHyperlinkActive = false;
private boolean isBold = false;
private boolean isItalics = false;
private boolean isStrikeThrough = false;
private boolean isUnderline = false;
- InlineTagManager(XHTMLContentHandler xhtml) {
+ FormattingTagManager(XHTMLContentHandler xhtml) {
this.xhtml = xhtml;
}
/**
- * Opens a hyperlink. Since {@code <a>} is the outermost inline element,
- * any existing inline elements (including a prior hyperlink) are closed
- * first.
- *
- * @param href the link target; if {@code null} this is a no-op
+ * Opens a wrapper-style hyperlink (DOCX {@code <w:hyperlink>} or field-code).
+ * Closes open formatting tags and any already-open hyperlink first to keep nesting valid.
+ * No-op if url is null.
*/
- void openHyperlink(String href) throws SAXException {
- if (href == null) {
+ void openHyperlink(String url) throws SAXException {
+ if (url == null) {
return;
}
- // Close everything — formatting then any existing hyperlink
- closeAll();
- xhtml.startElement("a", "href", href);
- hyperlinkOpen = true;
+ closeFormattingTags();
+ if (currentHyperlink != null) {
+ xhtml.endElement("a");
+ }
+ xhtml.startElement("a", "href", url);
+ currentHyperlink = url;
+ wrapperHyperlinkActive = true;
}
/**
- * Closes the current hyperlink and all formatting inside it.
- * No-op if no hyperlink is open.
+ * Closes the active wrapper-style hyperlink. No-op if none was opened.
*/
void closeHyperlink() throws SAXException {
- if (!hyperlinkOpen) {
- return;
+ if (currentHyperlink != null && wrapperHyperlinkActive) {
+ closeFormattingTags();
+ xhtml.endElement("a");
+ currentHyperlink = null;
+ wrapperHyperlinkActive = false;
}
- closeFormatting();
- xhtml.endElement("a");
- hyperlinkOpen = false;
}
/**
- * Returns {@code true} if a hyperlink is currently open.
+ * Returns true if any hyperlink (wrapper or run-property) is currently open.
*/
- boolean isHyperlinkOpen() {
- return hyperlinkOpen;
+ boolean isHyperlinkActive() {
+ return currentHyperlink != null;
}
/**
* Reconciles the current formatting state with the given run properties,
* opening and closing XHTML tags as needed to maintain proper nesting.
- * The nesting order for formatting is: {@code <b> <i> <s> <u>}.
*/
void applyFormatting(RunProperties runProperties) throws SAXException {
- if (runProperties.isBold() != isBold) {
- // Bold is outermost formatting — close everything inside it
- if (isUnderline) {
- xhtml.endElement("u");
- isUnderline = false;
+ // Run-property hyperlinks only when no wrapper is active
+ if (!wrapperHyperlinkActive) {
+ String newHyperlink = runProperties.getHlinkClickUrl();
+ if (!Objects.equals(newHyperlink, currentHyperlink)) {
+ closeFormattingTags();
+ if (currentHyperlink != null) {
+ xhtml.endElement("a");
+ }
+ if (newHyperlink != null) {
+ xhtml.startElement("a", "href", newHyperlink);
+ }
+ currentHyperlink = newHyperlink;
}
+ }
+
+ if (runProperties.isBold() != isBold) {
if (isStrikeThrough) {
xhtml.endElement("s");
isStrikeThrough = false;
}
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;
+ }
if (isItalics) {
xhtml.endElement("i");
isItalics = false;
@@ -117,14 +135,14 @@ class InlineTagManager {
}
if (runProperties.isItalics() != isItalics) {
- if (isUnderline) {
- xhtml.endElement("u");
- isUnderline = false;
- }
if (isStrikeThrough) {
xhtml.endElement("s");
isStrikeThrough = false;
}
+ if (isUnderline) {
+ xhtml.endElement("u");
+ isUnderline = false;
+ }
if (runProperties.isItalics()) {
xhtml.startElement("i");
} else {
@@ -158,10 +176,18 @@ class InlineTagManager {
}
/**
- * Closes all currently open formatting tags in proper nesting order
- * (innermost first: u, s, i, b). Does NOT close the hyperlink.
+ * Closes all currently open tags in proper nesting order.
*/
- void closeFormatting() throws SAXException {
+ void closeAll() throws SAXException {
+ closeFormattingTags();
+ if (currentHyperlink != null) {
+ xhtml.endElement("a");
+ currentHyperlink = null;
+ wrapperHyperlinkActive = false;
+ }
+ }
+
+ private void closeFormattingTags() throws SAXException {
if (isUnderline) {
xhtml.endElement("u");
isUnderline = false;
@@ -179,18 +205,4 @@ class InlineTagManager {
isBold = false;
}
}
-
- /**
- * Closes ALL open inline elements — formatting first, then hyperlink.
- * This is the primary safety mechanism: call at every structural boundary
- * (end of paragraph, table cell, table row, table, etc.) to guarantee
- * well-formed XHTML.
- */
- void closeAll() throws SAXException {
- closeFormatting();
- if (hyperlinkOpen) {
- xhtml.endElement("a");
- hyperlinkOpen = false;
- }
- }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
index 4b21831638..6cece158e8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLPartContentCollector.java
@@ -54,9 +54,6 @@ class OOXMLPartContentCollector extends DefaultHandler {
private String currentId = null;
private ByteArrayOutputStream buffer = null;
private int depth = 0;
- // Prefix mappings that fired since the last startElement — need to be
- // emitted as xmlns declarations on the next element inside a collected
fragment.
- private final java.util.List<String[]> pendingPrefixMappings = new
java.util.ArrayList<>();
/**
* @param wrapperElementNames local names of wrapper elements to collect
@@ -79,12 +76,6 @@ class OOXMLPartContentCollector extends DefaultHandler {
@Override
public void startPrefixMapping(String prefix, String uri) {
namespaceMappings.put(prefix, uri);
- // Track prefix mappings that fire within a collected fragment —
- // these need to be emitted as xmlns declarations on the next
- // startElement so that re-parsed fragments have valid namespace
bindings.
- if (currentId != null) {
- pendingPrefixMappings.add(new String[]{prefix, uri});
- }
}
Map<String, byte[]> getContentMap() {
@@ -105,7 +96,9 @@ class OOXMLPartContentCollector extends DefaultHandler {
if (id != null && !skipIds.contains(id)) {
currentId = id;
buffer = new ByteArrayOutputStream();
- writeString(buildWrapperOpenTag());
+ // Don't write wrapper open tag yet — inline xmlns declarations
+ // (e.g., xmlns:a on nested elements) haven't been captured via
+ // startPrefixMapping. Defer to endElement when all are known.
depth = 0;
}
}
@@ -119,8 +112,16 @@ class OOXMLPartContentCollector extends DefaultHandler {
}
if (depth == 0) {
- writeString("</w:body>");
- contentMap.put(currentId, buffer.toByteArray());
+ // Build the wrapper now — all startPrefixMapping calls from nested
+ // elements have been captured, so inline xmlns declarations are included.
+ byte[] wrapperOpen =
buildWrapperOpenTag().getBytes(StandardCharsets.UTF_8);
+ byte[] content = buffer.toByteArray();
+ ByteArrayOutputStream combined =
+ new ByteArrayOutputStream(wrapperOpen.length +
content.length + 16);
+ combined.write(wrapperOpen, 0, wrapperOpen.length);
+ combined.write(content, 0, content.length);
+ writeString(combined, "</w:body>");
+ contentMap.put(currentId, combined.toByteArray());
currentId = null;
buffer = null;
return;
@@ -166,23 +167,6 @@ class OOXMLPartContentCollector extends DefaultHandler {
String tagName = (qName != null && !qName.isEmpty()) ? qName :
localName;
StringBuilder sb = new StringBuilder();
sb.append('<').append(tagName);
- // Emit any namespace declarations that fired since the last element.
- // In namespace-aware SAX, xmlns:prefix attributes are reported as
- // startPrefixMapping events, NOT as attributes — so they must be
- // re-serialized explicitly for the fragment to be re-parseable.
- if (!pendingPrefixMappings.isEmpty()) {
- for (String[] mapping : pendingPrefixMappings) {
- String prefix = mapping[0];
- String nsUri = mapping[1];
- if (prefix == null || prefix.isEmpty()) {
- sb.append(" xmlns=\"").append(escape(nsUri)).append("\"");
- } else {
- sb.append(" xmlns:").append(prefix).append("=\"")
- .append(escape(nsUri)).append("\"");
- }
- }
- pendingPrefixMappings.clear();
- }
for (int i = 0; i < atts.getLength(); i++) {
String attName = atts.getQName(i);
if (attName == null || attName.isEmpty()) {
@@ -197,8 +181,12 @@ class OOXMLPartContentCollector extends DefaultHandler {
}
private void writeString(String s) {
+ writeString(buffer, s);
+ }
+
+ private static void writeString(ByteArrayOutputStream target, String s) {
byte[] bytes = s.getBytes(StandardCharsets.UTF_8);
- buffer.write(bytes, 0, bytes.length);
+ target.write(bytes, 0, bytes.length);
}
static String escape(String s) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
index ac46090d1c..a18f52a4d2 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLTikaBodyPartHandler.java
@@ -55,7 +55,7 @@ public class OOXMLTikaBodyPartHandler
private int pDepth = 0; //paragraph depth
private int tableDepth = 0;//table depth
private int sdtDepth = 0;//
- private final InlineTagManager inlineTags;
+ private FormattingTagManager formattingTags;
//TODO: fix this
//pWithinCell should be an array/stack of given cell depths
@@ -68,14 +68,9 @@ public class OOXMLTikaBodyPartHandler
//will need to replace this with a stack
//if we're marking more that the first level <p/> element
private String paragraphTag = null;
- private boolean pendingParagraph = false;
- private boolean paragraphTagOpen = false;
- private ParagraphProperties pendingParagraphProperties = null;
- private String pendingHyperlinkHref = null;
private OOXMLInlineBodyPartMap inlinePartMap =
OOXMLInlineBodyPartMap.EMPTY;
private ParseContext parseContext = null;
- private final java.util.List<String[]> pendingNoteIds = new
java.util.ArrayList<>();
private final java.util.List<String> pendingCommentIds = new
java.util.ArrayList<>();
private final java.util.Set<String> emittedCommentIds = new
java.util.HashSet<>();
private final Map<String, EmbeddedPartMetadata> embeddedPartMetadataMap =
new HashMap<>();
@@ -87,7 +82,7 @@ public class OOXMLTikaBodyPartHandler
public OOXMLTikaBodyPartHandler(XHTMLContentHandler xhtml, Metadata
metadata) {
this.xhtml = xhtml;
this.metadata = metadata;
- this.inlineTags = new InlineTagManager(xhtml);
+ this.formattingTags = new FormattingTagManager(xhtml);
this.styles = XWPFStylesShim.EMPTY_STYLES;
this.listManager = XWPFListManager.EMPTY_LIST;
this.includeDeletedText = false;
@@ -105,7 +100,7 @@ public class OOXMLTikaBodyPartHandler
OfficeParserConfig parserConfig, Metadata
metadata) {
this.xhtml = xhtml;
this.metadata = metadata;
- this.inlineTags = new InlineTagManager(xhtml);
+ this.formattingTags = new FormattingTagManager(xhtml);
this.styles = styles;
this.listManager = listManager;
this.includeDeletedText = parserConfig.isIncludeDeletedContent();
@@ -125,100 +120,35 @@ public class OOXMLTikaBodyPartHandler
@Override
public void run(RunProperties runProperties, String contents) throws
SAXException {
- ensureParagraphOpen();
- flushPendingHyperlink();
- inlineTags.applyFormatting(runProperties);
+ formattingTags.applyFormatting(runProperties);
xhtml.characters(contents);
}
- private void flushPendingHyperlink() throws SAXException {
- if (pendingHyperlinkHref != null) {
- inlineTags.openHyperlink(pendingHyperlinkHref);
- pendingHyperlinkHref = null;
- }
- }
-
@Override
public void hyperlinkStart(String link) throws SAXException {
- // Defer hyperlink opening if no paragraph is open yet.
- // Shape-level hyperlinks (cNvPr/hlinkClick) fire before any <p>,
- // so we store the link and open it when the paragraph opens.
- if (pendingParagraph || pDepth == 0) {
- pendingHyperlinkHref = link;
- } else {
- inlineTags.openHyperlink(link);
- }
+ formattingTags.openHyperlink(link);
}
@Override
public void hyperlinkEnd() throws SAXException {
- if (pendingHyperlinkHref != null) {
- pendingHyperlinkHref = null;
- } else {
- inlineTags.closeHyperlink();
- }
- }
-
- /**
- * Closes any open inline elements (hyperlinks, formatting tags) in
- * the correct nesting order. Called before closing any structural
- * element (paragraph, table cell, table row, table, etc.) to ensure
- * well-formed XHTML.
- */
- void closeInlineElements() throws SAXException {
- inlineTags.closeAll();
+ formattingTags.closeHyperlink();
}
-
@Override
public void startParagraph(ParagraphProperties paragraphProperties) throws
SAXException {
+
//if you're in a table cell and your after the first paragraph
//make sure to prepend a \n
if (tableCellDepth > 0 && pWithinCell > 0) {
xhtml.characters(NEWLINE, 0, 1);
}
- // If we're about to nest a paragraph (e.g. inside a text box / shape),
- // force-open the outer paragraph first so that inner content ends up
- // inside the outer <p> tag rather than floating as raw text.
- if (pendingParagraph && pDepth > 0) {
- ensureParagraphOpen();
- }
- // Record the paragraph as pending — don't emit <p> yet.
- // We defer opening until the first content arrives (via
ensureParagraphOpen)
- // so that style info from pPr is available.
- pendingParagraph = true;
- pendingParagraphProperties = paragraphProperties;
- pDepth++;
- }
-
- @Override
- public void setParagraphProperties(ParagraphProperties paragraphProperties)
- throws SAXException {
- // Copy the properties — the caller may reset the object after this
call.
- // The <p> tag hasn't been emitted yet, so this style will be applied
when it opens.
- if (pendingParagraph) {
- pendingParagraphProperties = new
ParagraphProperties(paragraphProperties);
- }
- }
-
- /**
- * Ensures the current paragraph's XHTML tag is open. Called before any
- * content is written (runs, hyperlinks, etc.) so that the deferred
- * {@code <p>} tag is emitted with the correct style.
- */
- private void ensureParagraphOpen() throws SAXException {
- if (!pendingParagraph) {
- return;
- }
- pendingParagraph = false;
- if (pDepth == 1 && tableDepth == 0 && sdtDepth == 0) {
+ if (pDepth == 0 && tableDepth == 0 && sdtDepth == 0) {
paragraphTag = P;
String styleClass = null;
- ParagraphProperties pp = pendingParagraphProperties;
//TIKA-2144 check that styles is not null
- if (pp != null && pp.getStyleID() != null && styles != null) {
- String styleName = styles.getStyleName(pp.getStyleID());
+ if (paragraphProperties.getStyleID() != null && styles != null) {
+ String styleName =
styles.getStyleName(paragraphProperties.getStyleID());
if (styleName != null) {
WordExtractor.TagAndStyle tas =
WordExtractor.buildParagraphTagAndStyle(styleName,
false);
@@ -227,39 +157,33 @@ public class OOXMLTikaBodyPartHandler
}
}
+
if (styleClass == null) {
xhtml.startElement(paragraphTag);
} else {
xhtml.startElement(paragraphTag, "class", styleClass);
}
- paragraphTagOpen = true;
}
- if (pendingParagraphProperties != null) {
- writeParagraphNumber(pendingParagraphProperties.getNumId(),
- pendingParagraphProperties.getIlvl(), listManager, xhtml);
- }
- pendingParagraphProperties = null;
+ writeParagraphNumber(paragraphProperties.getNumId(),
paragraphProperties.getIlvl(),
+ listManager, xhtml);
+ pDepth++;
}
+
@Override
public void endParagraph() throws SAXException {
- ensureParagraphOpen();
- closeInlineElements();
- if (paragraphTagOpen) {
+ formattingTags.closeAll();
+ if (pDepth == 1 && tableDepth == 0) {
xhtml.endElement(paragraphTag);
- paragraphTagOpen = false;
} else if (tableCellDepth > 0 && pWithinCell > 0) {
xhtml.characters(NEWLINE, 0, 1);
} else if (tableCellDepth == 0) {
xhtml.characters(NEWLINE, 0, 1);
}
- // Emit any pending footnote/endnote and comment content after the
- // paragraph closes. Inlining mid-paragraph would create <div> inside
- // <p>, and the inner handler's endParagraph() would close the outer
- // <p> tag, corrupting state.
- emitPendingNotes();
+ // Emit any pending comment content after the paragraph closes
+ // (matching the DOM parser's behavior of appending comments after paragraphs)
emitPendingComments();
if (tableCellDepth > 0) {
@@ -268,27 +192,6 @@ public class OOXMLTikaBodyPartHandler
pDepth--;
}
- private void emitPendingNotes() throws SAXException {
- if (pendingNoteIds.isEmpty()) {
- return;
- }
- for (String[] noteTypeAndId : pendingNoteIds) {
- String noteType = noteTypeAndId[0];
- String id = noteTypeAndId[1];
- byte[] xml = "footnote".equals(noteType)
- ? inlinePartMap.getFootnote(id)
- : inlinePartMap.getEndnote(id);
- if (xml != null) {
- inlineNoteContent(xml, noteType);
- } else {
- xhtml.characters("[");
- xhtml.characters(id);
- xhtml.characters("]");
- }
- }
- pendingNoteIds.clear();
- }
-
private void emitPendingComments() throws SAXException {
if (pendingCommentIds.isEmpty()) {
return;
@@ -313,12 +216,7 @@ public class OOXMLTikaBodyPartHandler
@Override
public void startTable() throws SAXException {
- // Close any open paragraph — <table> can't nest inside <p> in XHTML
- closeInlineElements();
- if (paragraphTagOpen) {
- xhtml.endElement(paragraphTag);
- paragraphTagOpen = false;
- }
+
xhtml.startElement("table");
tableDepth++;
@@ -326,7 +224,7 @@ public class OOXMLTikaBodyPartHandler
@Override
public void endTable() throws SAXException {
- closeInlineElements();
+
xhtml.endElement("table");
tableDepth--;
@@ -339,7 +237,6 @@ public class OOXMLTikaBodyPartHandler
@Override
public void endTableRow() throws SAXException {
- closeInlineElements();
xhtml.endElement("tr");
}
@@ -351,7 +248,6 @@ public class OOXMLTikaBodyPartHandler
@Override
public void endTableCell() throws SAXException {
- closeInlineElements();
xhtml.endElement("td");
pWithinCell = 0;
tableCellDepth--;
@@ -359,7 +255,7 @@ public class OOXMLTikaBodyPartHandler
@Override
public void startSDT() throws SAXException {
- inlineTags.closeAll();
+ formattingTags.closeAll();
sdtDepth++;
}
@@ -389,10 +285,14 @@ public class OOXMLTikaBodyPartHandler
if (id == null) {
return;
}
- // Defer footnote emission to after the paragraph closes.
- // Inlining mid-paragraph creates <div> inside <p>, and the inner
- // handler's endParagraph() closes the outer <p> tag, corrupting state.
- pendingNoteIds.add(new String[]{"footnote", id});
+ byte[] xml = inlinePartMap.getFootnote(id);
+ if (xml != null) {
+ inlineNoteContent(xml, "footnote");
+ } else {
+ xhtml.characters("[");
+ xhtml.characters(id);
+ xhtml.characters("]");
+ }
}
@Override
@@ -400,7 +300,14 @@ public class OOXMLTikaBodyPartHandler
if (id == null) {
return;
}
- pendingNoteIds.add(new String[]{"endnote", id});
+ byte[] xml = inlinePartMap.getEndnote(id);
+ if (xml != null) {
+ inlineNoteContent(xml, "endnote");
+ } else {
+ xhtml.characters("[");
+ xhtml.characters(id);
+ xhtml.characters("]");
+ }
}
@Override
@@ -411,25 +318,19 @@ public class OOXMLTikaBodyPartHandler
}
private void inlineNoteContent(byte[] xml, String cssClass) throws
SAXException {
- // Close any open inline elements before inlining note content
- // to ensure the <div> nests correctly
- closeInlineElements();
// Use the inline part map's relationship map which includes
relationships
// from the footnote/endnote parts (needed for picture resolution)
Map<String, String> noteRelationships =
inlinePartMap.getLinkedRelationships();
xhtml.startElement("div", "class", cssClass);
- OOXMLTikaBodyPartHandler innerHandler = new
OOXMLTikaBodyPartHandler(xhtml);
try {
XMLReaderUtils.parseSAX(new ByteArrayInputStream(xml),
new EmbeddedContentHandler(
new OOXMLWordAndPowerPointTextHandler(
- innerHandler,
+ new OOXMLTikaBodyPartHandler(xhtml),
noteRelationships)),
parseContext);
} catch (TikaException | IOException e) {
xhtml.characters("[" + cssClass + " parse error]");
- } finally {
- innerHandler.closeInlineElements();
}
xhtml.endElement("div");
}
@@ -529,7 +430,7 @@ public class OOXMLTikaBodyPartHandler
@Override
public void startBookmark(String id, String name) throws SAXException {
//skip bookmarks within hyperlinks
- if (name != null && !inlineTags.isHyperlinkOpen()) {
+ if (name != null && !formattingTags.isHyperlinkActive()) {
xhtml.startElement("a", "name", name);
xhtml.endElement("a");
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
index 00f1ff6c4a..46e25b299d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLWordAndPowerPointTextHandler.java
@@ -111,10 +111,6 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private final static String COMMENT_REFERENCE = "commentReference";
private static final String TEXTBOX = "textbox";
private static final String TXBX = "txbx"; // DrawingML text box (wps:txbx
in mc:Choice)
- private static final String SDT = "sdt";
- private static final String SDT_PR = "sdtPr";
- private static final String SDT_CONTENT = "sdtContent";
- private static final String SHOWING_PLCHDR = "showingPlcHdr";
private final static String FLD_CHAR = "fldChar";
private final static String INSTR_TEXT = "instrText";
private final static String FLD_CHAR_TYPE = "fldCharType";
@@ -141,7 +137,14 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private boolean inRPr = false;
private boolean inNumPr = false;
private boolean inRt = false;
- private boolean inPPr = false;
+ //mechanism used to determine when to
+ //signal the start of the p, and still
+ //handle p with pPr and those without
+ private boolean lastStartElementWasP = false;
+ //have we signaled the start of a p?
+ //pPr can happen multiple times within a p
+ //<p><pPr/><r><t>text</t></r><pPr/></p>
+ private boolean pStarted = false;
//alternate content can be embedded in itself.
//need to track depth.
//preferACChoice controls which branch is processed:
@@ -151,9 +154,13 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private int inACFallbackDepth = 0;
private boolean inDelText = false;
//buffers rt in ruby sections (see 17.3.3.25)
- private boolean inHlinkClick = false;
private boolean inTextBox = false;
private boolean inV = false; //in c:v in chart file
+ // True when we're inside a <pPr> that was a direct child of <p> (the
first child).
+ // Only those pPr elements should trigger startParagraph on close.
+ // pPr elements nested inside other elements (e.g., <a:pPr> inside <a:fld>)
+ // must not be treated as paragraph-level properties.
+ private boolean inParagraphLevelPPr = false;
// Field code tracking for instrText-based hyperlinks
private boolean inField = false;
private boolean inInstrText = false;
@@ -164,10 +171,6 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private DateUtils dateUtils = new DateUtils();
private boolean hiddenSlide = false;
- // SDT (structured document tag) placeholder tracking
- private boolean inSdtPr = false;
- private boolean sdtIsPlaceholder = false;
- private int sdtPlaceholderDepth = 0;
private boolean hasAnimations = false;
public OOXMLWordAndPowerPointTextHandler(XWPFBodyContentsHandler
bodyContentsHandler,
@@ -226,6 +229,17 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
throws SAXException {
//TODO: checkBox, textBox, sym, headerReference, footerReference,
commentRangeEnd
+ if (lastStartElementWasP && PPR.equals(localName)) {
+ // pPr is the first child of <p> — this is a paragraph-level pPr.
+ // Defer startParagraph until </pPr> so properties (style,
numbering) are set first.
+ inParagraphLevelPPr = true;
+ } else if (lastStartElementWasP) {
+ // First child of <p> is not pPr — start paragraph immediately
with defaults.
+ bodyContentsHandler.startParagraph(currPProperties);
+ }
+
+ lastStartElementWasP = false;
+
if (uri != null && uri.equals(MC_NS)) {
if (CHOICE.equals(localName)) {
inACChoiceDepth++;
@@ -254,9 +268,7 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
} else if (TAB.equals(localName)) {
runBuffer.append(TAB_CHAR);
} else if (P.equals(localName)) {
- bodyContentsHandler.startParagraph(currPProperties);
- } else if (PPR.equals(localName)) {
- inPPr = true;
+ lastStartElementWasP = true;
} else if (B.equals(localName)) { //TODO: add bCs
if (inR && inRPr) {
currRunProperties.setBold(true);
@@ -318,8 +330,14 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
String hyperlink = null;
if (hyperlinkId != null) {
hyperlink = linkedRelationships.get(hyperlinkId);
- bodyContentsHandler.hyperlinkStart(hyperlink);
- inHlinkClick = true;
+ if (inR) {
+ // hlinkClick inside a run — treat as run property.
+ // FormattingTagManager opens/closes <a> with the run
lifecycle.
+ currRunProperties.setHlinkClickUrl(hyperlink);
+ } else if (hyperlink != null) {
+ // hlinkClick on a shape/picture (not in a run) — emit as
self-closing ref
+ bodyContentsHandler.externalRef("hlinkClick", hyperlink);
+ }
}
} else if (TBL.equals(localName)) {
bodyContentsHandler.startTable();
@@ -329,20 +347,8 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
pictureTracker.setDescription(atts.getValue("", "descr"));
} else if (PIC.equals(localName)) {
pictureTracker.startPic(); //check for PIC_NS?
- } else if (SDT.equals(localName)) {
- // SDTs can nest; only track placeholder at outermost level
- if (sdtPlaceholderDepth == 0) {
- sdtIsPlaceholder = false;
- }
- } else if (SDT_PR.equals(localName)) {
- inSdtPr = true;
- } else if (SHOWING_PLCHDR.equals(localName) && inSdtPr) {
- sdtIsPlaceholder = true;
- } else if (SDT_CONTENT.equals(localName)) {
- if (sdtIsPlaceholder) {
- sdtPlaceholderDepth++;
- }
- } else if (FOOTNOTE_REFERENCE.equals(localName)) {
+ } //TODO: add sdt, sdtPr, sdtContent; statistically, they go here
+ else if (FOOTNOTE_REFERENCE.equals(localName)) {
String id = atts.getValue(W_NS, "id");
bodyContentsHandler.footnoteReference(id);
} else if (IMAGEDATA.equals(localName)) {
@@ -500,10 +506,6 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
}
if (PIC.equals(localName)) { //PIC_NS
pictureTracker.endPicture();
- if (inHlinkClick) {
- bodyContentsHandler.hyperlinkEnd();
- inHlinkClick = false;
- }
return;
} else if (RPR.equals(localName)) {
inRPr = false;
@@ -511,10 +513,15 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
handleEndOfRun();
} else if (T.equals(localName)) {
inT = false;
- } else if (PPR.equals(localName)) {
- inPPr = false;
- bodyContentsHandler.setParagraphProperties(currPProperties);
+ } else if (PPR.equals(localName) && inParagraphLevelPPr) {
+ // Only process as paragraph properties if this pPr was a direct
child of <p>.
+ // pPr inside other elements (e.g., <a:fld> fields) must be
ignored.
+ if (!pStarted) {
+ bodyContentsHandler.startParagraph(currPProperties);
+ pStarted = true;
+ }
currPProperties.reset();
+ inParagraphLevelPPr = false;
} else if (P.equals(localName)) {
if (runBuffer.length() > 0) {
//<p><tab></p>...this will treat that as if it were
@@ -522,6 +529,7 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
bodyContentsHandler.run(currRunProperties,
runBuffer.toString());
runBuffer.setLength(0);
}
+ pStarted = false;
bodyContentsHandler.endParagraph();
} else if (TC.equals(localName)) {
bodyContentsHandler.endTableCell();
@@ -538,14 +546,6 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
editType = EditType.NONE;
} else if (HYPERLINK.equals(localName)) {
bodyContentsHandler.hyperlinkEnd();
- } else if (SDT_PR.equals(localName)) {
- inSdtPr = false;
- } else if (SDT_CONTENT.equals(localName)) {
- if (sdtPlaceholderDepth > 0) {
- sdtPlaceholderDepth--;
- }
- } else if (SDT.equals(localName)) {
- sdtIsPlaceholder = false;
} else if (PICT.equals(localName)) {
pictureTracker.endPicture();
} else if (V.equals(localName) && C_NS.equals(uri)) { // in value in a
chart
@@ -571,16 +571,13 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
private void handleEndOfRun() throws SAXException {
bodyContentsHandler.run(currRunProperties, runBuffer.toString());
- if (inHlinkClick) {
- bodyContentsHandler.hyperlinkEnd();
- inHlinkClick = false;
- }
inR = false;
runBuffer.setLength(0);
currRunProperties.setBold(false);
currRunProperties.setItalics(false);
currRunProperties.setStrike(false);
currRunProperties.setUnderline(UnderlinePatterns.NONE.name());
+ currRunProperties.setHlinkClickUrl(null);
}
@Override
@@ -590,8 +587,6 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
return;
} else if (!includeTextBox && inTextBox) {
return;
- } else if (sdtPlaceholderDepth > 0) {
- return;
}
if (editType.equals(EditType.MOVE_FROM) && inT) {
@@ -617,8 +612,6 @@ public class OOXMLWordAndPowerPointTextHandler extends
DefaultHandler {
return;
} else if (!includeTextBox && inTextBox) {
return;
- } else if (sdtPlaceholderDepth > 0) {
- return;
}
if (inT) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
index 54d149f333..efed9c1348 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/RunProperties.java
@@ -30,6 +30,9 @@ public class RunProperties {
UnderlinePatterns underline = UnderlinePatterns.NONE;
+ // PPTX hlinkClick hyperlink URL — set from <a:hlinkClick> inside <a:rPr>
+ String hlinkClickUrl = null;
+
public boolean isItalics() {
return italics;
}
@@ -68,4 +71,12 @@ public class RunProperties {
underline = UnderlinePatterns.SINGLE;
}
}
+
+ public String getHlinkClickUrl() {
+ return hlinkClickUrl;
+ }
+
+ public void setHlinkClickUrl(String url) {
+ this.hlinkClickUrl = url;
+ }
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
index d6f5b9759d..7d8030afad 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXSLFPowerPointExtractorDecorator.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
-import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.zip.ZipException;
@@ -34,9 +33,7 @@ import
org.apache.poi.openxml4j.opc.PackageRelationshipCollection;
import org.apache.poi.openxml4j.opc.PackagingURIHelper;
import org.apache.poi.openxml4j.opc.TargetMode;
import org.apache.poi.xslf.usermodel.XSLFRelation;
-import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
@@ -106,15 +103,28 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
loadCommentAuthors();
addCommentAuthorMetadata();
- List<PackagePart> orderedSlides = getOrderedSlideParts();
+ PackageRelationshipCollection slidesPRC = null;
+ try {
+ slidesPRC =
mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
+ } catch (InvalidFormatException e) {
+ metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ }
int hiddenSlideCount = 0;
- for (PackagePart slidePart : orderedSlides) {
- try {
- hiddenSlideCount += handleSlidePart(slidePart, xhtml);
- } catch (ZipException e) {
- metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
- ExceptionUtils.getStackTrace(e));
+ if (slidesPRC != null && slidesPRC.size() > 0) {
+ for (int i = 0; i < slidesPRC.size(); i++) {
+ try {
+ PackagePart slidePart =
+ safeGetRelatedPart(mainDocument,
slidesPRC.getRelationship(i));
+ if (slidePart == null) {
+ continue;
+ }
+ hiddenSlideCount += handleSlidePart(slidePart, xhtml);
+ } catch (InvalidFormatException | ZipException e) {
+
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
+ ExceptionUtils.getStackTrace(e));
+ }
}
}
if (hiddenSlideCount > 0) {
@@ -122,9 +132,16 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
}
if (config.isIncludeSlideMasterContent()) {
- // Handout master is presentation-level, not per-slide
- handleTextPartWithCleanup(HANDOUT_MASTER, "slide-handout-master",
mainDocument,
- xhtml, new HashMap<>(), false);
+
handleGeneralTextContainingPart(XSLFRelation.SLIDE_MASTER.getRelation(),
"slide-master",
+ mainDocument, metadata, new PlaceHolderSkipper(
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<>())));
+
+ handleGeneralTextContainingPart(HANDOUT_MASTER,
"slide-handout-master", mainDocument,
+ metadata,
+ new OOXMLWordAndPowerPointTextHandler(new
OOXMLTikaBodyPartHandler(xhtml),
+ new HashMap<>()));
}
}
@@ -143,7 +160,7 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
for (int i = 0; i < prc.size(); i++) {
PackagePart commentAuthorsPart = null;
try {
- commentAuthorsPart =
mainDocument.getRelatedPart(prc.getRelationship(i));
+ commentAuthorsPart = safeGetRelatedPart(mainDocument,
prc.getRelationship(i));
} catch (InvalidFormatException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
@@ -171,103 +188,6 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
- /**
- * Returns the first related part for the given relationship type,
- * or null if none found.
- */
- private PackagePart getRelatedPartByType(PackagePart source, String
relationType) {
- try {
- PackageRelationshipCollection prc =
source.getRelationshipsByType(relationType);
- if (prc != null && prc.size() > 0) {
- return source.getRelatedPart(prc.getRelationship(0));
- }
- } catch (InvalidFormatException | IllegalArgumentException e) {
- // missing part
- }
- return null;
- }
-
- /**
- * Returns slide parts in presentation order by parsing the sldIdLst
- * from presentation.xml. Any slides found in .rels but not in
- * the sldIdLst are appended at the end.
- */
- private List<PackagePart> getOrderedSlideParts() {
- // Step 1: parse presentation.xml to get ordered rIds from sldIdLst
- List<String> orderedRIds = new ArrayList<>();
- try (InputStream is = mainDocument.getInputStream()) {
- XMLReaderUtils.parseSAX(is, new DefaultHandler() {
- private boolean inSldIdLst = false;
-
- @Override
- public void startElement(String uri, String localName, String
qName,
- Attributes atts) {
- if ("sldIdLst".equals(localName)) {
- inSldIdLst = true;
- } else if (inSldIdLst && "sldId".equals(localName)) {
- String rId = atts.getValue(
-
"http://schemas.openxmlformats.org/officeDocument/2006/relationships",
- "id");
- if (rId != null) {
- orderedRIds.add(rId);
- }
- }
- }
-
- @Override
- public void endElement(String uri, String localName, String
qName) {
- if ("sldIdLst".equals(localName)) {
- inSldIdLst = false;
- }
- }
- }, context);
- } catch (Exception e) {
- metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
- ExceptionUtils.getStackTrace(e));
- }
-
- // Step 2: build rId -> PackagePart map from relationships
- Map<String, PackagePart> rIdToSlide = new LinkedHashMap<>();
- try {
- PackageRelationshipCollection slidesPRC =
-
mainDocument.getRelationshipsByType(XSLFRelation.SLIDE.getRelation());
- if (slidesPRC != null) {
- for (int i = 0; i < slidesPRC.size(); i++) {
- PackageRelationship rel = slidesPRC.getRelationship(i);
- try {
- PackagePart part = mainDocument.getRelatedPart(rel);
- if (part != null) {
- rIdToSlide.put(rel.getId(), part);
- }
- } catch (InvalidFormatException | IllegalArgumentException
e) {
- // skip missing parts
- }
- }
- }
- } catch (InvalidFormatException e) {
- metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
- ExceptionUtils.getStackTrace(e));
- }
-
- // Step 3: assemble in presentation order, then append orphans
- List<PackagePart> result = new ArrayList<>();
- for (String rId : orderedRIds) {
- PackagePart part = rIdToSlide.remove(rId);
- if (part != null) {
- result.add(part);
- }
- }
- // append any slides in .rels but not in sldIdLst
- if (!rIdToSlide.isEmpty()) {
- metadata.set(Office.NUM_UNLISTED_SLIDES, rIdToSlide.size());
- for (PackagePart part : rIdToSlide.values()) {
- metadata.add(Office.UNLISTED_SLIDE_NAMES,
part.getPartName().getName());
- }
- result.addAll(rIdToSlide.values());
- }
- return result;
- }
-
/**
* @return 1 if the slide is hidden, 0 otherwise
*/
@@ -278,10 +198,9 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
int hidden = 0;
xhtml.startElement("div", "class", "slide-content");
- OOXMLTikaBodyPartHandler bodyHandler = new
OOXMLTikaBodyPartHandler(xhtml, metadata);
try (InputStream stream = slidePart.getInputStream()) {
OOXMLWordAndPowerPointTextHandler wordAndPPTHandler = new
OOXMLWordAndPowerPointTextHandler(
- bodyHandler, linkedRelationships);
+ new OOXMLTikaBodyPartHandler(xhtml, metadata),
linkedRelationships);
XMLReaderUtils.parseSAX(stream,
new EmbeddedContentHandler(wordAndPPTHandler), context);
if (wordAndPPTHandler.isHiddenSlide()) {
@@ -295,64 +214,42 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
}
- bodyHandler.closeInlineElements();
+
xhtml.endElement("div");
if (config.isIncludeSlideMasterContent()) {
- // Extract the slide layout (per-slide)
- PackagePart layoutPart = getRelatedPartByType(slidePart,
- XSLFRelation.SLIDE_LAYOUT.getRelation());
- if (layoutPart != null) {
-
handleTextPartWithCleanup(XSLFRelation.SLIDE_LAYOUT.getRelation(),
- "slide-master-content", slidePart, xhtml,
linkedRelationships, true);
- // Follow layout → slide master chain
-
handleTextPartWithCleanup(XSLFRelation.SLIDE_MASTER.getRelation(),
- "slide-master-content", layoutPart, xhtml,
linkedRelationships, true);
- }
+
handleGeneralTextContainingPart(XSLFRelation.SLIDE_LAYOUT.getRelation(),
+ "slide-master-content", slidePart, metadata, new
PlaceHolderSkipper(
+ new OOXMLWordAndPowerPointTextHandler(
+ new OOXMLTikaBodyPartHandler(xhtml),
linkedRelationships)));
}
if (config.isIncludeSlideNotes()) {
- handleTextPartWithCleanup(XSLFRelation.NOTES.getRelation(),
"slide-notes",
- slidePart, xhtml, linkedRelationships, false);
+ handleGeneralTextContainingPart(XSLFRelation.NOTES.getRelation(),
"slide-notes",
+ slidePart, metadata,
+ new OOXMLWordAndPowerPointTextHandler(new
OOXMLTikaBodyPartHandler(xhtml),
+ linkedRelationships));
if (config.isIncludeSlideMasterContent()) {
-
handleTextPartWithCleanup(XSLFRelation.NOTES_MASTER.getRelation(),
- "slide-notes-master", slidePart, xhtml,
linkedRelationships, false);
+
handleGeneralTextContainingPart(XSLFRelation.NOTES_MASTER.getRelation(),
+ "slide-notes-master", slidePart, metadata,
+ new OOXMLWordAndPowerPointTextHandler(new
OOXMLTikaBodyPartHandler(xhtml),
+ linkedRelationships));
+
}
}
handleGeneralTextContainingPart(XSLFRelation.COMMENTS.getRelation(),
null, slidePart,
metadata, new XSLFCommentsHandler(xhtml, commentAuthors));
- handleTextPartWithCleanup(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
- "diagram-data", slidePart, xhtml, linkedRelationships, false);
+
handleGeneralTextContainingPart(AbstractOOXMLExtractor.RELATION_DIAGRAM_DATA,
+ "diagram-data", slidePart, metadata,
+ new OOXMLWordAndPowerPointTextHandler(new
OOXMLTikaBodyPartHandler(xhtml),
+ linkedRelationships));
- handleTextPartWithCleanup(XSLFRelation.CHART.getRelation(), "chart",
slidePart,
- xhtml, linkedRelationships, false);
+ handleGeneralTextContainingPart(XSLFRelation.CHART.getRelation(),
"chart", slidePart,
+ metadata, new OOXMLWordAndPowerPointTextHandler(new
OOXMLTikaBodyPartHandler(xhtml),
+ linkedRelationships));
return hidden;
}
- /**
- * Handles a text-containing part with guaranteed inline element cleanup.
- * Creates an OOXMLTikaBodyPartHandler, parses the part, then calls
- * closeInlineElements() to ensure no unclosed tags leak into subsequent
output.
- *
- * @param usePlaceholderSkipper if true, wraps the handler in a
PlaceHolderSkipper
- */
- private void handleTextPartWithCleanup(String contentType, String
xhtmlClassLabel,
- PackagePart parentPart,
XHTMLContentHandler xhtml,
- Map<String, String>
linkedRelationships,
- boolean usePlaceholderSkipper)
throws SAXException {
- OOXMLTikaBodyPartHandler bodyHandler = new
OOXMLTikaBodyPartHandler(xhtml);
- OOXMLWordAndPowerPointTextHandler textHandler =
- new OOXMLWordAndPowerPointTextHandler(bodyHandler,
linkedRelationships);
- DefaultHandler handler = usePlaceholderSkipper
- ? new PlaceHolderSkipper(textHandler) : textHandler;
- try {
- handleGeneralTextContainingPart(contentType, xhtmlClassLabel,
parentPart,
- metadata, handler);
- } finally {
- bodyHandler.closeInlineElements();
- }
- }
-
/**
* In PowerPoint files, slides have things embedded in them,
* and slide drawings which have the images
@@ -374,7 +271,7 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
for (int i = 0; i < slidePRC.size(); i++) {
PackagePart slidePart = null;
try {
- slidePart =
mainDocument.getRelatedPart(slidePRC.getRelationship(i));
+ slidePart = safeGetRelatedPart(mainDocument,
slidePRC.getRelationship(i));
} catch (InvalidFormatException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
@@ -397,7 +294,7 @@ public class SXSLFPowerPointExtractorDecorator extends
AbstractOOXMLExtractor {
for (int i = 0; i < prc.size(); i++) {
PackagePart pp = null;
try {
- pp =
mainDocument.getRelatedPart(prc.getRelationship(i));
+ pp = safeGetRelatedPart(mainDocument,
prc.getRelationship(i));
} catch (InvalidFormatException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
index 393370662b..88c7aaa0f7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/SXWPFWordExtractorDecorator.java
@@ -45,6 +45,7 @@ import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.microsoft.EMFParser;
+import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFFeatureExtractor;
import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFNumberingShim;
@@ -126,16 +127,21 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
//handle glossary document
- pps =
opcPackage.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
- if (pps != null) {
- if (pps.size() > 0) {
- xhtml.startElement("div", "class", "glossary");
-
- for (PackagePart pp : pps) {
- //likely only one, but why not...
- handleDocumentPart(pp, xhtml);
+ OfficeParserConfig officeParserConfig =
context.get(OfficeParserConfig.class,
+ new OfficeParserConfig());
+ if (officeParserConfig.isIncludeGlossary()) {
+ pps = opcPackage.getPartsByContentType(
+ XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
+ if (pps != null) {
+ if (pps.size() > 0) {
+ xhtml.startElement("div", "class", "glossary");
+
+ for (PackagePart pp : pps) {
+ //likely only one, but why not...
+ handleDocumentPart(pp, xhtml);
+ }
+ xhtml.endElement("div");
}
- xhtml.endElement("div");
}
}
@@ -223,23 +229,8 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
}
}
}
- } catch (InvalidFormatException | IOException | TikaException |
SAXException |
- IllegalArgumentException e) {
- // swallow -- POI throws IllegalArgumentException when
- // a relationship references a part missing from the package
- }
- }
-
- /**
- * Safely resolves a related part from a relationship. Returns {@code
null}
- * instead of throwing {@link IllegalArgumentException} when the target
- * part is missing from the package (e.g. truncated / salvaged zips).
- */
- private static PackagePart safeGetRelatedPart(PackagePart source,
PackageRelationship rel) {
- try {
- return source.getRelatedPart(rel);
- } catch (InvalidFormatException | IllegalArgumentException e) {
- return null;
+ } catch (InvalidFormatException | IOException | TikaException |
SAXException e) {
+ // swallow
}
}
@@ -286,10 +277,11 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
for (int i = 0; i < headersPRC.size(); i++) {
PackagePart header =
safeGetRelatedPart(documentPart,
headersPRC.getRelationship(i));
- if (header != null) {
- handlePart(header, styles, listManager, xhtml,
- OOXMLInlineBodyPartMap.EMPTY);
+ if (header == null) {
+ continue;
}
+ handlePart(header, styles, listManager, xhtml,
+ OOXMLInlineBodyPartMap.EMPTY);
}
}
} catch (InvalidFormatException | ZipException e) {
@@ -327,10 +319,11 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
for (int i = 0; i < prc.size(); i++) {
PackagePart packagePart =
safeGetRelatedPart(documentPart,
prc.getRelationship(i));
- if (packagePart != null) {
- handlePart(packagePart, styles, listManager, xhtml,
- OOXMLInlineBodyPartMap.EMPTY);
+ if (packagePart == null) {
+ continue;
}
+ handlePart(packagePart, styles, listManager, xhtml,
+ OOXMLInlineBodyPartMap.EMPTY);
}
}
} catch (InvalidFormatException | ZipException e) {
@@ -391,11 +384,10 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
linkedRelationships,
config.isIncludeShapeBasedContent(),
config.isConcatenatePhoneticRuns(),
config.isPreferAlternateContentChoice())),
context);
- } catch (TikaException | IOException | SAXException e) {
+ } catch (TikaException | IOException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
}
- bodyHandler.closeInlineElements();
Map<String, EmbeddedPartMetadata> partMetadata =
bodyHandler.getEmbeddedPartMetadataMap();
resolveEmfNames(packagePart, partMetadata);
embeddedPartMetadataMap.putAll(partMetadata);
@@ -410,7 +402,7 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
continue;
}
try {
- PackagePart emfPart = documentPart.getRelatedPart(
+ PackagePart emfPart = safeGetRelatedPart(documentPart,
documentPart.getRelationship(emfRId));
if (emfPart == null || emfPart.getContentType() == null) {
continue;
@@ -471,23 +463,20 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
OOXMLPartContentCollector collector =
new OOXMLPartContentCollector(wrapperElements, skipIds);
for (int i = 0; i < prc.size(); i++) {
- try {
- PackagePart part =
documentPart.getRelatedPart(prc.getRelationship(i));
- // collect the part's linked relationships (for picture
resolution)
- Map<String, String> partRels =
- loadLinkedRelationships(part, true, metadata);
- allRelationships.putAll(partRels);
- try (InputStream stream = part.getInputStream()) {
- XMLReaderUtils.parseSAX(stream, collector, context);
- }
- } catch (InvalidFormatException | IOException | TikaException |
- SAXException e) {
-
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
- ExceptionUtils.getStackTrace(e));
+ PackagePart part = safeGetRelatedPart(documentPart,
prc.getRelationship(i));
+ if (part == null) {
+ continue;
+ }
+ // collect the part's linked relationships (for picture
resolution)
+ Map<String, String> partRels =
+ loadLinkedRelationships(part, true, metadata);
+ allRelationships.putAll(partRels);
+ try (InputStream stream = part.getInputStream()) {
+ XMLReaderUtils.parseSAX(stream, collector, context);
}
}
return collector.getContentMap();
- } catch (InvalidFormatException e) {
+ } catch (InvalidFormatException | IOException | TikaException |
SAXException e) {
metadata.add(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING,
ExceptionUtils.getStackTrace(e));
return Collections.emptyMap();
@@ -530,7 +519,7 @@ public class SXWPFWordExtractorDecorator extends
AbstractOOXMLExtractor {
}
return new XWPFNumberingShim(numberingPart, context);
}
- } catch (Exception e) {
+ } catch (IOException | InvalidFormatException | TikaException |
SAXException e) {
//swallow
}
return null;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
index dbd6996886..a9eb400e98 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/XWPFBodyContentsHandler.java
@@ -48,17 +48,6 @@ public interface XWPFBodyContentsHandler {
void startParagraph(ParagraphProperties paragraphProperties) throws
SAXException;
- /**
- * Updates the properties (style, numbering) for the current pending
paragraph.
- * Called when {@code </pPr>} is encountered, after {@link
#startParagraph} but
- * before any content. The body handler defers opening the XHTML {@code
<p>}
- * tag until the first content arrives, so this style info will be
available.
- */
- default void setParagraphProperties(ParagraphProperties
paragraphProperties)
- throws SAXException {
- // Default no-op for backward compatibility
- }
-
void endParagraph() throws SAXException;
void startTable() throws SAXException;
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
index 5a4676d631..74976ea34e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/XWPFEventBasedWordExtractor.java
@@ -42,6 +42,7 @@ import org.apache.tika.exception.RuntimeSAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.microsoft.ooxml.AbstractOOXMLExtractor;
import org.apache.tika.parser.microsoft.ooxml.EditType;
import
org.apache.tika.parser.microsoft.ooxml.OOXMLWordAndPowerPointTextHandler;
import org.apache.tika.parser.microsoft.ooxml.ParagraphProperties;
@@ -61,6 +62,7 @@ public class XWPFEventBasedWordExtractor implements
POIXMLTextExtractor {
private OPCPackage container;
private POIXMLProperties properties;
+ private boolean includeGlossary = true;
public XWPFEventBasedWordExtractor(OPCPackage container)
throws OpenXML4JException, IOException {
@@ -84,6 +86,10 @@ public class XWPFEventBasedWordExtractor implements
POIXMLTextExtractor {
return this.container;
}
+ public void setIncludeGlossary(boolean includeGlossary) {
+ this.includeGlossary = includeGlossary;
+ }
+
public POIXMLProperties.CoreProperties getCoreProperties() {
POIXMLProperties props = getOrCreateProperties();
return props != null ? props.getCoreProperties() : null;
@@ -130,23 +136,26 @@ public class XWPFEventBasedWordExtractor implements
POIXMLTextExtractor {
}
}
//handle glossary document
- pps = container.getPartsByContentType(XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
-
- if (pps != null) {
- for (PackagePart pp : pps) {
- //likely only one, but why not...
- try {
- handleDocumentPart(pp, sb);
- } catch (IOException e) {
- LOG.warn("IOException handling glossary document part", e);
- } catch (SAXException e) {
- if (WriteLimitReachedException.isWriteLimitReached(e)) {
- throw new RuntimeSAXException(e);
+ if (includeGlossary) {
+ pps = container.getPartsByContentType(
+ XWPFRelation.GLOSSARY_DOCUMENT.getContentType());
+
+ if (pps != null) {
+ for (PackagePart pp : pps) {
+ //likely only one, but why not...
+ try {
+ handleDocumentPart(pp, sb);
+ } catch (IOException e) {
+ LOG.warn("IOException handling glossary document part", e);
+ } catch (SAXException e) {
+ if (WriteLimitReachedException.isWriteLimitReached(e)) {
+ throw new RuntimeSAXException(e);
+ }
+ //swallow this because we don't actually call it
+ LOG.warn("SAXException handling glossary document part", e);
+ } catch (TikaException e) {
+ LOG.warn("ParseException handling document part", e);
}
- //swallow this because we don't actually call it
- LOG.warn("SAXException handling glossary document part", e);
- } catch (TikaException e) {
- LOG.warn("ParseException handling document part", e);
}
}
}
@@ -184,7 +193,11 @@ public class XWPFEventBasedWordExtractor implements
POIXMLTextExtractor {
documentPart.getRelationshipsByType(XWPFRelation.HEADER.getRelation());
if (headersPRC != null) {
for (int i = 0; i < headersPRC.size(); i++) {
- PackagePart header =
documentPart.getRelatedPart(headersPRC.getRelationship(i));
+ PackagePart header =
AbstractOOXMLExtractor.safeGetRelatedPart(
+ documentPart, headersPRC.getRelationship(i));
+ if (header == null) {
+ continue;
+ }
handlePart(header, xwpfListManager, sb);
}
}
@@ -204,7 +217,11 @@ public class XWPFEventBasedWordExtractor implements
POIXMLTextExtractor {
if (prc != null) {
for (int i = 0; i < prc.size(); i++) {
PackagePart packagePart =
-
documentPart.getRelatedPart(prc.getRelationship(i));
+ AbstractOOXMLExtractor.safeGetRelatedPart(
+ documentPart, prc.getRelationship(i));
+ if (packagePart == null) {
+ continue;
+ }
handlePart(packagePart, xwpfListManager, sb);
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
index 38f0cb080b..ac533400e0 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLDocxSAXTest.java
@@ -30,6 +30,7 @@ import org.junit.jupiter.api.Test;
import org.apache.tika.config.loader.TikaLoader;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
import org.apache.tika.metadata.OfficeOpenXMLCore;
@@ -116,7 +117,7 @@ public class OOXMLDocxSAXTest extends AbstractOOXMLDocxTest
{
content);
assertContains("<td>Embedded table r1c1", content);
- assertContainsCount("This is text within a shape", content, 1);
+ assertContainsCount("<p>This is text within a shape", content, 1);
assertContains("<p>Rich text content control", content);
assertContains("<p>Simple text content control", content);
assertContains("Repeating content", content);
@@ -363,4 +364,32 @@ public class OOXMLDocxSAXTest extends
AbstractOOXMLDocxTest {
Metadata m = metadataList.get(0);
assertEquals("true", m.get(Office.HAS_FRAMESETS));
}
+
+ /**
+ * Parses an external DOCX file known to trigger "prefix not bound" warnings
+ * from missing namespace declarations in footnote/endnote fragments.
+ * Skipped unless system property "tika.test.docx.namespace" names a file.
+ */
+ @Test
+ public void testNamespaceInFragments() throws Exception {
+ String filePath = System.getProperty("tika.test.docx.namespace");
+ // abort as "skipped" (not a vacuous pass) when no sample file is configured
+ org.junit.jupiter.api.Assumptions.assumeTrue(filePath != null,
+ "system property tika.test.docx.namespace is not set");
+ java.io.File f = new java.io.File(filePath);
+ org.junit.jupiter.api.Assumptions.assumeTrue(f.isFile(),
+ "not a regular file: " + filePath);
+ // parse the whole file; only the recorded warnings are inspected below
+ AutoDetectParser parser = new AutoDetectParser();
+ Metadata metadata = new Metadata();
+ org.xml.sax.ContentHandler handler =
+ new org.apache.tika.sax.BodyContentHandler(-1);
+ try (TikaInputStream tis = TikaInputStream.get(f.toPath())) {
+ parser.parse(tis, handler, metadata, getParseContext());
+ }
+ String[] warnings = metadata.getValues(TikaCoreProperties.TIKA_META_EXCEPTION_WARNING);
+ for (String w : warnings) {
+ assertNotContained("not bound", w);
+ }
+ }
}