This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4219-branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/TIKA-4219-branch_2x by this push:
     new cba8e58b2 TIKA-4219 -- avoid namespace conflicts
cba8e58b2 is described below

commit cba8e58b225ede9950b94739b1bac6304bca2d39
Author: tallison <talli...@apache.org>
AuthorDate: Mon Mar 25 09:37:21 2024 -0400

    TIKA-4219 -- avoid namespace conflicts
---
 .../org/apache/tika/parser/epub/EpubParser.java | 42 +++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index ab51729fc..a572ad2cc 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -43,6 +43,7 @@ import org.apache.commons.lang3.StringUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.config.Field;
@@ -62,6 +63,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.xml.DcXMLParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.ParserUtils;
@@ -122,7 +124,8 @@ public class EpubParser extends AbstractParser {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
         IOException caughtException = null;
-        ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml));
+        ContentHandler childHandler = new EmbeddedContentHandler(
+                new EpubNormalizingHandler(new BodyContentHandler(xhtml)));
         Set<String> encryptedItems = Collections.EMPTY_SET;
         if (streaming) {
             try {
@@ -602,4 +605,41 @@ public class EpubParser extends AbstractParser {
     private static class EpubZipException extends IOException {
 
     }
+
+    //for now, this simply converts all names to local names to avoid
+    //namespace conflicts in the content handler. This also removes namespaces
+    //from attributes
+    private class EpubNormalizingHandler extends ContentHandlerDecorator {
+        public EpubNormalizingHandler(ContentHandler contentHandler) {
+            super(contentHandler);
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String name, Attributes atts)
+                throws SAXException {
+            //some atts may have namespaces that were not included in the header
+            boolean needToRewrite = false;
+            for (int i = 0; i < atts.getLength(); i++) {
+                if (atts.getQName(i) != null && ! atts.getQName(i).equals(atts.getLocalName(i))) {
+                    needToRewrite = true;
+                    break;
+                }
+            }
+            if (needToRewrite) {
+                AttributesImpl simplifiedAtts = new AttributesImpl();
+                for (int i = 0; i < atts.getLength(); i++) {
+                    simplifiedAtts.addAttribute("", atts.getLocalName(i), atts.getLocalName(i),
+                            atts.getType(i), atts.getValue(i));
+                }
+                super.startElement(uri, localName, localName, simplifiedAtts);
+            } else {
+                super.startElement(uri, localName, localName, atts);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String name) throws SAXException {
+            super.endElement(uri, localName, localName);
+        }
+    }
 }