This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_2x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_2x by this push:
     new fa58cd418 TIKA-4219 -- improve epub handling of encrypted 
non-text-containing items (#1684)
fa58cd418 is described below

commit fa58cd418eb922fefa5931d0cf803e551baba31e
Author: Tim Allison <talli...@apache.org>
AuthorDate: Mon Mar 25 12:24:49 2024 -0400

    TIKA-4219 -- improve epub handling of encrypted non-text-containing items 
(#1684)
    
    * TIKA-4219 -- improve epub handling of encrypted non-text-containing items
---
 .../apache/tika/parser/epub/EncryptionParser.java  |  88 ----------
 .../org/apache/tika/parser/epub/EpubParser.java    | 187 ++++++++++++++++-----
 2 files changed, 147 insertions(+), 128 deletions(-)

diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
deleted file mode 100644
index 26aae7574..000000000
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.tika.parser.epub;
-
-import java.io.IOException;
-import java.io.InputStream;
-import java.util.Collections;
-import java.util.HashSet;
-import java.util.Set;
-
-import org.xml.sax.Attributes;
-import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
-
-import org.apache.tika.exception.EncryptedDocumentException;
-import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.ParseContext;
-import org.apache.tika.parser.Parser;
-import org.apache.tika.utils.XMLReaderUtils;
-
-public class EncryptionParser implements Parser {
-
-    @Override
-    public Set<MediaType> getSupportedTypes(ParseContext context) {
-        return Collections.EMPTY_SET;
-    }
-
-    @Override
-    public void parse(InputStream stream, ContentHandler handler, Metadata 
metadata,
-                      ParseContext context) throws IOException, SAXException, 
TikaException {
-
-        try {
-            XMLReaderUtils.parseSAX(stream, new EncryptionHandler(), context);
-        } catch (SAXException e) {
-            if (e.getCause() instanceof EncryptedDocumentException) {
-                throw (EncryptedDocumentException)e.getCause();
-            }
-        }
-    }
-
-    private class EncryptionHandler extends DefaultHandler {
-        Set<String> encryptedItems = new HashSet<>();
-        @Override
-        public void startElement(String uri, String localName, String qName, 
Attributes attributes) {
-            if ("CipherReference".equals(localName)) {
-                String encryptedUri = XMLReaderUtils.getAttrValue("URI", 
attributes);
-                encryptedItems.add(encryptedUri);
-            }
-        }
-
-        @Override
-        public void endDocument() throws SAXException {
-            if (encryptedItems.size() > 0) {
-                StringBuilder sb = new StringBuilder();
-                sb.append("EPUB contains encrypted items: ");
-                int added = 0;
-                for (String u : encryptedItems) {
-                    if (sb.length() > 500) {
-                        sb.append(" and others...");
-                        break;
-                    }
-                    if (added++ > 0) {
-                        sb.append(", ");
-                    }
-                    sb.append(u);
-                }
-                throw new SAXException(new 
EncryptedDocumentException(sb.toString()));
-            }
-        }
-    }
-}
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index 1bdd95750..a572ad2cc 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -43,6 +43,7 @@ import org.apache.commons.lang3.StringUtils;
 import org.xml.sax.Attributes;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
+import org.xml.sax.helpers.AttributesImpl;
 import org.xml.sax.helpers.DefaultHandler;
 
 import org.apache.tika.config.Field;
@@ -55,12 +56,14 @@ import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.TemporaryResources;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.xml.DcXMLParser;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ContentHandlerDecorator;
 import org.apache.tika.sax.EmbeddedContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.utils.ParserUtils;
@@ -121,7 +124,9 @@ public class EpubParser extends AbstractParser {
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
         IOException caughtException = null;
-        ContentHandler childHandler = new EmbeddedContentHandler(new 
BodyContentHandler(xhtml));
+        ContentHandler childHandler = new EmbeddedContentHandler(
+                new EpubNormalizingHandler(new BodyContentHandler(xhtml)));
+        Set<String> encryptedItems = Collections.EMPTY_SET;
         if (streaming) {
             try {
                 streamingParse(stream, childHandler, metadata, context);
@@ -130,7 +135,7 @@ public class EpubParser extends AbstractParser {
             }
         } else {
             try {
-                bufferedParse(stream, childHandler, xhtml, metadata, context);
+                encryptedItems = bufferedParse(stream, childHandler, xhtml, 
metadata, context);
             } catch (IOException e) {
                 caughtException = e;
             }
@@ -140,9 +145,11 @@ public class EpubParser extends AbstractParser {
         if (caughtException != null) {
             throw caughtException;
         }
+        maybeThrowEncryptedException(encryptedItems);
     }
 
-    private void streamingParse(InputStream stream, ContentHandler 
bodyHandler, Metadata metadata,
+    private Set<String> streamingParse(InputStream stream, ContentHandler 
bodyHandler,
+                                   Metadata metadata,
                                 ParseContext context)
             throws IOException, TikaException, SAXException {
         ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", 
false, true, false);
@@ -153,7 +160,8 @@ public class EpubParser extends AbstractParser {
             if (entry.getName().equals("mimetype")) {
                 updateMimeType(zip, metadata);
             } else if (entry.getName().equals(META_INF_ENCRYPTION)) {
-                checkForDRM(zip);
+                //when streaming, throw an encryption exception if anything is 
encrypted
+                checkForDRM(zip, context);
             } else if (entry.getName().equals("metadata.xml")) {
                 meta.parse(zip, new DefaultHandler(), metadata, context);
             } else if (entry.getName().endsWith(".opf")) {
@@ -176,6 +184,9 @@ public class EpubParser extends AbstractParser {
         if (sax != null) {
             throw sax;
         }
+        //always empty -- we throw an encryption exception
+        //as soon as checkForDRM hits an encrypted item
+        return Collections.EMPTY_SET;
     }
 
     private void updateMimeType(InputStream is, Metadata metadata) throws 
IOException {
@@ -188,7 +199,7 @@ public class EpubParser extends AbstractParser {
 
     }
 
-    private void bufferedParse(InputStream stream, ContentHandler bodyHandler,
+    private Set<String> bufferedParse(InputStream stream, ContentHandler 
bodyHandler,
                                XHTMLContentHandler xhtml, Metadata metadata, 
ParseContext context)
             throws IOException, TikaException, SAXException {
         TikaInputStream tis;
@@ -196,9 +207,8 @@ public class EpubParser extends AbstractParser {
         if (TikaInputStream.isTikaInputStream(stream)) {
             tis = TikaInputStream.cast(stream);
             if (tis.getOpenContainer() instanceof ZipFile) {
-                bufferedParseZipFile((ZipFile) tis.getOpenContainer(), 
bodyHandler, xhtml, metadata,
-                        context, true);
-                return;
+                return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), 
bodyHandler, xhtml,
+                        metadata, context, true);
             }
         } else {
             temporaryResources = new TemporaryResources();
@@ -209,8 +219,7 @@ public class EpubParser extends AbstractParser {
             zipFile = new ZipFile(tis.getPath().toFile());
         } catch (IOException e) {
             ParserUtils.recordParserFailure(this, e, metadata);
-            trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context);
-            return;
+            return trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, 
context);
         } finally {
             //if we had to wrap tis
             if (temporaryResources != null) {
@@ -218,44 +227,42 @@ public class EpubParser extends AbstractParser {
             }
         }
         try {
-            bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, 
context, true);
+            return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, 
context, true);
         } finally {
             zipFile.close();
         }
     }
 
-    private void trySalvage(Path brokenZip, ContentHandler bodyHandler, 
XHTMLContentHandler xhtml,
+    private Set<String> trySalvage(Path brokenZip, ContentHandler bodyHandler,
+                               XHTMLContentHandler xhtml,
                             Metadata metadata, ParseContext context)
             throws IOException, TikaException, SAXException {
         try (TemporaryResources resources = new TemporaryResources()) {
             Path salvaged =
                     
resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString()));
             ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile());
-            boolean success = false;
             try (ZipFile zipFile = new ZipFile(salvaged.toFile())) {
-                success =
-                        bufferedParseZipFile(zipFile, bodyHandler, xhtml, 
metadata, context, false);
-            }
-            if (!success) {
+                return bufferedParseZipFile(zipFile, bodyHandler, xhtml, 
metadata, context, false);
+            } catch (EpubZipException e) {
                 try (InputStream is = TikaInputStream.get(salvaged)) {
-                    streamingParse(is, xhtml, metadata, context);
+                    return streamingParse(is, xhtml, metadata, context);
                 }
             }
         }
     }
 
-    private boolean bufferedParseZipFile(ZipFile zipFile, ContentHandler 
bodyHandler,
+    private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler 
bodyHandler,
                                          XHTMLContentHandler xhtml, Metadata 
metadata,
                                          ParseContext context, boolean 
isStrict)
-            throws IOException, TikaException, SAXException {
+            throws IOException, TikaException, SAXException, EpubZipException {
 
         String rootOPF = getRoot(zipFile, context);
         if (rootOPF == null) {
-            return false;
+            throw new EpubZipException();
         }
         ZipArchiveEntry zae = zipFile.getEntry(rootOPF);
         if (zae == null || !zipFile.canReadEntryData(zae)) {
-            return false;
+            throw new EpubZipException();
         }
         opf.parse(zipFile.getInputStream(zae), new DefaultHandler(), metadata, 
context);
 
@@ -265,7 +272,7 @@ public class EpubParser extends AbstractParser {
         }
         //if no content items, false
         if (contentOrderScraper.contentItems.size() == 0) {
-            return false;
+            throw new EpubZipException();
         }
         String relativePath = "";
         if (rootOPF.lastIndexOf("/") > -1) {
@@ -286,13 +293,14 @@ public class EpubParser extends AbstractParser {
             //if not perfect match btwn items and readable items
             //return false
             if (found != contentOrderScraper.contentItems.size()) {
-                return false;
+                throw new EpubZipException();
             }
         }
 
         extractMetadata(zipFile, metadata, context);
-        checkForDRM(zipFile);
+        Set<String> encryptedItems = checkForDRM(zipFile);
         Set<String> processed = new HashSet<>();
+        Set<SAXException> saxExceptions = new HashSet<>();
         for (String id : contentOrderScraper.contentItems) {
             HRefMediaPair hRefMediaPair = 
contentOrderScraper.locationMap.get(id);
             if (hRefMediaPair != null && hRefMediaPair.href != null) {
@@ -309,10 +317,21 @@ public class EpubParser extends AbstractParser {
                     shouldParse = true;
                 }
                 if (shouldParse) {
+                    String path = relativePath + hRefMediaPair.href;
+                    //if content is encrypted, do not parse it, throw an 
exception now
+                    if (encryptedItems.contains(path)) {
+                        maybeThrowEncryptedException(encryptedItems);
+                    }
                     zae = zipFile.getEntry(relativePath + hRefMediaPair.href);
                     if (zae != null) {
                         try (InputStream is = zipFile.getInputStream(zae)) {
                             content.parse(is, bodyHandler, metadata, context);
+                        } catch (SAXException e) {
+                            if 
(WriteLimitReachedException.isWriteLimitReached(e)) {
+                                throw e;
+                            }
+                            saxExceptions.add(e);
+                        } finally {
                             processed.add(id);
                         }
                     }
@@ -326,37 +345,59 @@ public class EpubParser extends AbstractParser {
         for (String id : contentOrderScraper.locationMap.keySet()) {
             if (!processed.contains(id)) {
                 HRefMediaPair hRefMediaPair = 
contentOrderScraper.locationMap.get(id);
+                String fullPath = relativePath + hRefMediaPair.href;
+                if (encryptedItems.contains(fullPath)) {
+                    continue;
+                }
                 if (shouldHandleEmbedded(hRefMediaPair.media)) {
                     handleEmbedded(zipFile, relativePath, hRefMediaPair, 
embeddedDocumentExtractor,
                             xhtml, metadata);
                 }
             }
         }
-        return true;
+        //throw SAXException if any from the parse of the body contents
+        for (SAXException e : saxExceptions) {
+            throw e;
+        }
+        return encryptedItems;
     }
 
-    private void checkForDRM(ZipFile zipFile) throws IOException, 
EncryptedDocumentException {
+    private Set<String> checkForDRM(ZipFile zipFile) throws IOException, 
TikaException,
+            SAXException {
         ZipArchiveEntry zae = zipFile.getEntry(META_INF_ENCRYPTION);
         if (zae == null) {
-            return;
+            return Collections.EMPTY_SET;
         }
         try (InputStream is = zipFile.getInputStream(zae)) {
-            new EncryptionParser().parse(is, new DefaultHandler(), new 
Metadata(), new ParseContext());
-        } catch (EncryptedDocumentException e) {
-            throw e;
-        } catch (TikaException | SAXException e) {
-            //swallow ?!
+            return EncryptionHandler.parse(is, new ParseContext());
         }
     }
 
-    private void checkForDRM(InputStream is) throws IOException, 
EncryptedDocumentException {
-        try {
-            new EncryptionParser().parse(is, new DefaultHandler(), new 
Metadata(), new ParseContext());
-        } catch (EncryptedDocumentException e) {
-            throw e;
-        } catch (TikaException | SAXException e) {
-            //swallow ?!
+    private void checkForDRM(InputStream is, ParseContext parseContext)
+            throws IOException, TikaException, SAXException {
+        Set<String> encryptedItems = EncryptionHandler.parse(is, parseContext);
+        maybeThrowEncryptedException(encryptedItems);
+    }
+
+    private void maybeThrowEncryptedException(Set<String> encryptedItems)
+            throws EncryptedDocumentException {
+        if (encryptedItems.size() == 0) {
+            return;
         }
+        StringBuilder sb = new StringBuilder();
+        sb.append("EPUB contains encrypted items: ");
+        int added = 0;
+        for (String u : encryptedItems) {
+            if (sb.length() > 500) {
+                sb.append(" and others...");
+                break;
+            }
+            if (added++ > 0) {
+                sb.append(", ");
+            }
+            sb.append(u);
+        }
+        throw new EncryptedDocumentException(sb.toString());
     }
 
     private boolean shouldHandleEmbedded(String media) {
@@ -395,6 +436,7 @@ public class EpubParser extends AbstractParser {
         if (!StringUtils.isBlank(hRefMediaPair.media)) {
             embeddedMetadata.set(Metadata.CONTENT_TYPE, hRefMediaPair.media);
         }
+        embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fullPath);
         if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             return;
         }
@@ -535,4 +577,69 @@ public class EpubParser extends AbstractParser {
             return "HRefMediaPair{" + "href='" + href + '\'' + ", media='" + 
media + '\'' + '}';
         }
     }
+
+
+    private static class EncryptionHandler extends DefaultHandler {
+        private static Set<String> parse(InputStream is, ParseContext 
parseContext)
+                throws TikaException, IOException, SAXException {
+            EncryptionHandler handler = new EncryptionHandler();
+            XMLReaderUtils.parseSAX(is, handler, parseContext);
+            return handler.getEncryptedItems();
+        }
+
+        Set<String> encryptedItems = new HashSet<>();
+        @Override
+        public void startElement(String uri, String localName, String qName, 
Attributes attributes) {
+            if ("CipherReference".equals(localName)) {
+                String encryptedUri = XMLReaderUtils.getAttrValue("URI", 
attributes);
+                encryptedItems.add(encryptedUri);
+            }
+        }
+        public Set<String> getEncryptedItems() {
+            return encryptedItems;
+        }
+    }
+
+    //any problem with parsing an epub file when it is
+    //a zip file
+    private static class EpubZipException extends IOException {
+
+    }
+
+    //for now, this simply converts all names to local names to avoid
+    //namespace conflicts in the content handler. This also removes namespaces
+    //from attributes
+    private class EpubNormalizingHandler extends ContentHandlerDecorator {
+        public EpubNormalizingHandler(ContentHandler contentHandler) {
+            super(contentHandler);
+        }
+
+        @Override
+        public void startElement(String uri, String localName, String name, 
Attributes atts)
+                throws SAXException {
+            //some atts may have namespaces that were not included in the 
header
+            boolean needToRewrite = false;
+            for (int i = 0; i < atts.getLength(); i++) {
+                if (atts.getQName(i) != null && ! 
atts.getQName(i).equals(atts.getLocalName(i))) {
+                    needToRewrite = true;
+                    break;
+                }
+            }
+            if (needToRewrite) {
+                AttributesImpl simplifiedAtts = new AttributesImpl();
+                for (int i = 0; i < atts.getLength(); i++) {
+                    simplifiedAtts.addAttribute("", atts.getLocalName(i), 
atts.getLocalName(i),
+                            atts.getType(i), atts.getValue(i));
+                }
+                super.startElement(uri, localName, localName, simplifiedAtts);
+            } else {
+                super.startElement(uri, localName, localName, atts);
+            }
+        }
+
+        @Override
+        public void endElement(String uri, String localName, String name) 
throws SAXException {
+            super.endElement(uri, localName, localName);
+        }
+    }
 }

Reply via email to