This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4219-branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/TIKA-4219-branch_2x by this 
push:
     new cba8e58b2 TIKA-4219 -- avoid namespace conflicts
cba8e58b2 is described below

commit cba8e58b225ede9950b94739b1bac6304bca2d39
Author: tallison <talli...@apache.org>
AuthorDate: Mon Mar 25 09:37:21 2024 -0400

    TIKA-4219 -- avoid namespace conflicts
---
 .../org/apache/tika/parser/epub/EpubParser.java    | 42 +++++++++++++++++++++-
 1 file changed, 41 insertions(+), 1 deletion(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index ab51729fc..a572ad2cc 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -43,6 +43,7 @@ import org.apache.commons.lang3.StringUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.config.Field;
@@ -62,6 +63,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.xml.DcXMLParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.ParserUtils;
@@ -122,7 +124,8 @@ public class EpubParser extends AbstractParser {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
         IOException caughtException = null;
-        ContentHandler childHandler = new EmbeddedContentHandler(new 
BodyContentHandler(xhtml));
+        ContentHandler childHandler = new EmbeddedContentHandler(
+                new EpubNormalizingHandler(new BodyContentHandler(xhtml)));
         Set<String> encryptedItems = Collections.EMPTY_SET;
         if (streaming) {
             try {
@@ -602,4 +605,41 @@ public class EpubParser extends AbstractParser {
     private static class EpubZipException extends IOException {
 
     }
+
+    //for now, this simply converts all names to local names to avoid
+    //namespace conflicts in the content handler. This also removes namespaces
+    //from attributes
+    private class EpubNormalizingHandler extends ContentHandlerDecorator {
+        public EpubNormalizingHandler(ContentHandler contentHandler) {
+            super(contentHandler);
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String name, 
Attributes atts)
+                throws SAXException {
+            //some atts may have namespaces that were not included in the 
header
+            boolean needToRewrite = false;
+            for (int i = 0; i < atts.getLength(); i++) {
+                if (atts.getQName(i) != null && ! 
atts.getQName(i).equals(atts.getLocalName(i))) {
+                    needToRewrite = true;
+                    break;
+                }
+            }
+            if (needToRewrite) {
+                AttributesImpl simplifiedAtts = new AttributesImpl();
+                for (int i = 0; i < atts.getLength(); i++) {
+                    simplifiedAtts.addAttribute("", atts.getLocalName(i), 
atts.getLocalName(i),
+                            atts.getType(i), atts.getValue(i));
+                }
+                super.startElement(uri, localName, localName, simplifiedAtts);
+            } else {
+                super.startElement(uri, localName, localName, atts);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String name) 
throws SAXException {
+            super.endElement(uri, localName, localName);
+        }
+    }
 }

Reply via email to