This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_2x in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_2x by this push: new fa58cd418 TIKA-4219 -- improve epub handling of encrypted non-text-containing items (#1684) fa58cd418 is described below commit fa58cd418eb922fefa5931d0cf803e551baba31e Author: Tim Allison <talli...@apache.org> AuthorDate: Mon Mar 25 12:24:49 2024 -0400 TIKA-4219 -- improve epub handling of encrypted non-text-containing items (#1684) * TIKA-4219 -- improve epub handling of encrypted non-text-containing items --- .../apache/tika/parser/epub/EncryptionParser.java | 88 ---------- .../org/apache/tika/parser/epub/EpubParser.java | 187 ++++++++++++++++----- 2 files changed, 147 insertions(+), 128 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java deleted file mode 100644 index 26aae7574..000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.epub; - -import java.io.IOException; -import java.io.InputStream; -import java.util.Collections; -import java.util.HashSet; -import java.util.Set; - -import org.xml.sax.Attributes; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; -import org.xml.sax.helpers.DefaultHandler; - -import org.apache.tika.exception.EncryptedDocumentException; -import org.apache.tika.exception.TikaException; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.utils.XMLReaderUtils; - -public class EncryptionParser implements Parser { - - @Override - public Set<MediaType> getSupportedTypes(ParseContext context) { - return Collections.EMPTY_SET; - } - - @Override - public void parse(InputStream stream, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - - try { - XMLReaderUtils.parseSAX(stream, new EncryptionHandler(), context); - } catch (SAXException e) { - if (e.getCause() instanceof EncryptedDocumentException) { - throw (EncryptedDocumentException)e.getCause(); - } - } - } - - private class EncryptionHandler extends DefaultHandler { - Set<String> encryptedItems = new HashSet<>(); - @Override - public void startElement(String uri, String localName, String qName, Attributes attributes) { - if ("CipherReference".equals(localName)) { - String encryptedUri = XMLReaderUtils.getAttrValue("URI", attributes); - encryptedItems.add(encryptedUri); - } - } - - @Override - public void endDocument() throws SAXException { - if (encryptedItems.size() > 0) { - StringBuilder sb = new StringBuilder(); - sb.append("EPUB contains encrypted items: "); - int added = 0; - for (String u : encryptedItems) { - if (sb.length() > 500) { - sb.append(" and others..."); - break; - } - if (added++ > 0) { - sb.append(", "); - } - sb.append(u); - } - throw new SAXException(new EncryptedDocumentException(sb.toString())); - } - } - } -} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java index 1bdd95750..a572ad2cc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -43,6 +43,7 @@ import org.apache.commons.lang3.StringUtils; import org.xml.sax.Attributes; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.xml.sax.helpers.AttributesImpl; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.config.Field; @@ -55,12 +56,14 @@ import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.xml.DcXMLParser; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.sax.ContentHandlerDecorator; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.ParserUtils; @@ -121,7 +124,9 @@ public class EpubParser extends AbstractParser { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); IOException caughtException = null; - ContentHandler childHandler = new EmbeddedContentHandler(new BodyContentHandler(xhtml)); + ContentHandler childHandler = new EmbeddedContentHandler( + new EpubNormalizingHandler(new BodyContentHandler(xhtml))); + Set<String> encryptedItems = Collections.EMPTY_SET; if (streaming) { try { streamingParse(stream, childHandler, metadata, context); @@ -130,7 +135,7 @@ public class EpubParser extends AbstractParser { } } else { try { - bufferedParse(stream, childHandler, xhtml, metadata, context); + encryptedItems = bufferedParse(stream, childHandler, xhtml, metadata, context); } catch (IOException e) { caughtException = e; } @@ -140,9 +145,11 @@ public class EpubParser extends AbstractParser { if (caughtException != null) { throw caughtException; } + maybeThrowEncryptedException(encryptedItems); } - private void streamingParse(InputStream stream, ContentHandler bodyHandler, Metadata metadata, + private Set<String> streamingParse(InputStream stream, ContentHandler bodyHandler, + Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false); @@ -153,7 +160,8 @@ public class EpubParser extends AbstractParser { if (entry.getName().equals("mimetype")) { updateMimeType(zip, metadata); } else if (entry.getName().equals(META_INF_ENCRYPTION)) { - checkForDRM(zip); + //when streaming, throw an encryption exception if anything is encrypted + checkForDRM(zip, context); } else if (entry.getName().equals("metadata.xml")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".opf")) { @@ -176,6 +184,9 @@ public class EpubParser extends AbstractParser { if (sax != null) { throw sax; } + //always empty -- we throw an encryption exception + //as soon as checkForDRM hits an encrypted item + return Collections.EMPTY_SET; } private void updateMimeType(InputStream is, Metadata metadata) throws IOException { @@ -188,7 +199,7 @@ public class EpubParser extends AbstractParser { } - private void bufferedParse(InputStream stream, ContentHandler bodyHandler, + private Set<String> bufferedParse(InputStream stream, ContentHandler bodyHandler, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { TikaInputStream tis; @@ -196,9 +207,8 @@ public class EpubParser extends AbstractParser { if (TikaInputStream.isTikaInputStream(stream)) { tis = TikaInputStream.cast(stream); if (tis.getOpenContainer() instanceof ZipFile) { - bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml, metadata, - context, true); - return; + return bufferedParseZipFile((ZipFile) tis.getOpenContainer(), bodyHandler, xhtml, + metadata, context, true); } } else { temporaryResources = new TemporaryResources(); @@ -209,8 +219,7 @@ public class EpubParser extends AbstractParser { zipFile = new ZipFile(tis.getPath().toFile()); } catch (IOException e) { ParserUtils.recordParserFailure(this, e, metadata); - trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context); - return; + return trySalvage(tis.getPath(), bodyHandler, xhtml, metadata, context); } finally { //if we had to wrap tis if (temporaryResources != null) { @@ -218,44 +227,42 @@ public class EpubParser extends AbstractParser { } } try { - bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true); + return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, true); } finally { zipFile.close(); } } - private void trySalvage(Path brokenZip, ContentHandler bodyHandler, XHTMLContentHandler xhtml, + private Set<String> trySalvage(Path brokenZip, ContentHandler bodyHandler, + XHTMLContentHandler xhtml, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { try (TemporaryResources resources = new TemporaryResources()) { Path salvaged = resources.createTempFile(FilenameUtils.getSuffixFromPath(brokenZip.getFileName().toString())); ZipSalvager.salvageCopy(brokenZip.toFile(), salvaged.toFile()); - boolean success = false; try (ZipFile zipFile = new ZipFile(salvaged.toFile())) { - success = - bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false); - } - if (!success) { + return bufferedParseZipFile(zipFile, bodyHandler, xhtml, metadata, context, false); + } catch (EpubZipException e) { try (InputStream is = TikaInputStream.get(salvaged)) { - streamingParse(is, xhtml, metadata, context); + return streamingParse(is, xhtml, metadata, context); } } } } - private boolean bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler, + private Set<String> bufferedParseZipFile(ZipFile zipFile, ContentHandler bodyHandler, XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean isStrict) - throws IOException, TikaException, SAXException { + throws IOException, TikaException, SAXException, EpubZipException { String rootOPF = getRoot(zipFile, context); if (rootOPF == null) { - return false; + throw new EpubZipException(); } ZipArchiveEntry zae = zipFile.getEntry(rootOPF); if (zae == null || !zipFile.canReadEntryData(zae)) { - return false; + throw new EpubZipException(); } opf.parse(zipFile.getInputStream(zae), new DefaultHandler(), metadata, context); @@ -265,7 +272,7 @@ public class EpubParser extends AbstractParser { } //if no content items, false if (contentOrderScraper.contentItems.size() == 0) { - return false; + throw new EpubZipException(); } String relativePath = ""; if (rootOPF.lastIndexOf("/") > -1) { @@ -286,13 +293,14 @@ public class EpubParser extends AbstractParser { //if not perfect match btwn items and readable items //return false if (found != contentOrderScraper.contentItems.size()) { - return false; + throw new EpubZipException(); } } extractMetadata(zipFile, metadata, context); - checkForDRM(zipFile); + Set<String> encryptedItems = checkForDRM(zipFile); Set<String> processed = new HashSet<>(); + Set<SAXException> saxExceptions = new HashSet<>(); for (String id : contentOrderScraper.contentItems) { HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id); if (hRefMediaPair != null && hRefMediaPair.href != null) { @@ -309,10 +317,21 @@ public class EpubParser extends AbstractParser { shouldParse = true; } if (shouldParse) { + String path = relativePath + hRefMediaPair.href; + //if content is encrypted, do not parse it, throw an exception now + if (encryptedItems.contains(path)) { + maybeThrowEncryptedException(encryptedItems); + } zae = zipFile.getEntry(relativePath + hRefMediaPair.href); if (zae != null) { try (InputStream is = zipFile.getInputStream(zae)) { content.parse(is, bodyHandler, metadata, context); + } catch (SAXException e) { + if (WriteLimitReachedException.isWriteLimitReached(e)) { + throw e; + } + saxExceptions.add(e); + } finally { processed.add(id); } } @@ -326,37 +345,59 @@ public class EpubParser extends AbstractParser { for (String id : contentOrderScraper.locationMap.keySet()) { if (!processed.contains(id)) { HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id); + String fullPath = relativePath + hRefMediaPair.href; + if (encryptedItems.contains(fullPath)) { + continue; + } if (shouldHandleEmbedded(hRefMediaPair.media)) { handleEmbedded(zipFile, relativePath, hRefMediaPair, embeddedDocumentExtractor, xhtml, metadata); } } } - return true; + //throw SAXException if any from the parse of the body contents + for (SAXException e : saxExceptions) { + throw e; + } + return encryptedItems; } - private void checkForDRM(ZipFile zipFile) throws IOException, EncryptedDocumentException { + private Set<String> checkForDRM(ZipFile zipFile) throws IOException, TikaException, + SAXException { ZipArchiveEntry zae = zipFile.getEntry(META_INF_ENCRYPTION); if (zae == null) { - return; + return Collections.EMPTY_SET; } try (InputStream is = zipFile.getInputStream(zae)) { - new EncryptionParser().parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); - } catch (EncryptedDocumentException e) { - throw e; - } catch (TikaException | SAXException e) { - //swallow ?! + return EncryptionHandler.parse(is, new ParseContext()); } } - private void checkForDRM(InputStream is) throws IOException, EncryptedDocumentException { - try { - new EncryptionParser().parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); - } catch (EncryptedDocumentException e) { - throw e; - } catch (TikaException | SAXException e) { - //swallow ?! + private void checkForDRM(InputStream is, ParseContext parseContext) + throws IOException, TikaException, SAXException { + Set<String> encryptedItems = EncryptionHandler.parse(is, parseContext); + maybeThrowEncryptedException(encryptedItems); + } + + private void maybeThrowEncryptedException(Set<String> encryptedItems) + throws EncryptedDocumentException { + if (encryptedItems.size() == 0) { + return; } + StringBuilder sb = new StringBuilder(); + sb.append("EPUB contains encrypted items: "); + int added = 0; + for (String u : encryptedItems) { + if (sb.length() > 500) { + sb.append(" and others..."); + break; + } + if (added++ > 0) { + sb.append(", "); + } + sb.append(u); + } + throw new EncryptedDocumentException(sb.toString()); } private boolean shouldHandleEmbedded(String media) { @@ -395,6 +436,7 @@ public class EpubParser extends AbstractParser { if (!StringUtils.isBlank(hRefMediaPair.media)) { embeddedMetadata.set(Metadata.CONTENT_TYPE, hRefMediaPair.media); } + embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, fullPath); if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { return; } @@ -535,4 +577,69 @@ public class EpubParser extends AbstractParser { return "HRefMediaPair{" + "href='" + href + '\'' + ", media='" + media + '\'' + '}'; } } + + + private static class EncryptionHandler extends DefaultHandler { + private static Set<String> parse(InputStream is, ParseContext parseContext) + throws TikaException, IOException, SAXException { + EncryptionHandler handler = new EncryptionHandler(); + XMLReaderUtils.parseSAX(is, handler, parseContext); + return handler.getEncryptedItems(); + } + + Set<String> encryptedItems = new HashSet<>(); + @Override + public void startElement(String uri, String localName, String qName, Attributes attributes) { + if ("CipherReference".equals(localName)) { + String encryptedUri = XMLReaderUtils.getAttrValue("URI", attributes); + encryptedItems.add(encryptedUri); + } + } + public Set<String> getEncryptedItems() { + return encryptedItems; + } + } + + //any problem with parsing an epub file when it is + //a zip file + private static class EpubZipException extends IOException { + + } + + //for now, this simply converts all names to local names to avoid + //namespace conflicts in the content handler. This also removes namespaces + //from attributes + private class EpubNormalizingHandler extends ContentHandlerDecorator { + public EpubNormalizingHandler(ContentHandler contentHandler) { + super(contentHandler); + } + + @Override + public void startElement(String uri, String localName, String name, Attributes atts) + throws SAXException { + //some atts may have namespaces that were not included in the header + boolean needToRewrite = false; + for (int i = 0; i < atts.getLength(); i++) { + if (atts.getQName(i) != null && ! atts.getQName(i).equals(atts.getLocalName(i))) { + needToRewrite = true; + break; + } + } + if (needToRewrite) { + AttributesImpl simplifiedAtts = new AttributesImpl(); + for (int i = 0; i < atts.getLength(); i++) { + simplifiedAtts.addAttribute("", atts.getLocalName(i), atts.getLocalName(i), + atts.getType(i), atts.getValue(i)); + } + super.startElement(uri, localName, localName, simplifiedAtts); + } else { + super.startElement(uri, localName, localName, atts); + } + } + + @Override + public void endElement(String uri, String localName, String name) throws SAXException { + super.endElement(uri, localName, localName); + } + } }