This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4176 in repository https://gitbox.apache.org/repos/asf/tika.git
commit f083609b8c09c7a4e0ee67fdc1468bf73578eb1b
Author: tallison <talli...@apache.org>
AuthorDate: Tue Jan 2 10:36:36 2024 -0500

    TIKA-4176 -- throw EncryptedDocumentException for DRM protected epubs
---
 .../apache/tika/parser/epub/EncryptionParser.java  | 128 ++++++++++++++++++++++
 .../org/apache/tika/parser/epub/EpubParser.java    |  54 ++++++++-
 2 files changed, 180 insertions(+), 2 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
new file mode 100644
index 000000000..26aae7574
--- /dev/null
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EncryptionParser.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.parser.epub;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import org.apache.tika.exception.EncryptedDocumentException;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.utils.XMLReaderUtils;
+
+/**
+ * Checks an EPUB's {@code encryption.xml} for encrypted content.
+ * <p>
+ * The stream is SAX-parsed and the {@code URI} attribute of every
+ * {@code CipherReference} element is collected. If at least one such
+ * element was seen, an {@link EncryptedDocumentException} listing the
+ * collected values is thrown when the end of the document is reached.
+ * <p>
+ * {@link #getSupportedTypes(ParseContext)} returns an empty set; EpubParser
+ * instantiates this class and calls {@code parse} directly.
+ */
+public class EncryptionParser implements Parser {
+
+    /**
+     * Once the exception message is longer than this many characters, the
+     * remaining items are summarized instead of being listed.
+     */
+    private static final int MAX_MESSAGE_LENGTH = 500;
+
+    /**
+     * @return an empty set
+     */
+    @Override
+    public Set<MediaType> getSupportedTypes(ParseContext context) {
+        return Collections.emptySet();
+    }
+
+    /**
+     * Scans {@code stream} for {@code CipherReference} elements.
+     *
+     * @param stream   the XML to scan
+     * @param handler  not used
+     * @param metadata not used
+     * @param context  handed to {@link XMLReaderUtils} for the SAX parse
+     * @throws EncryptedDocumentException if a {@code CipherReference} was found
+     * @throws SAXException if the SAX parse fails for any other reason
+     */
+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
+                      ParseContext context) throws IOException, SAXException, TikaException {
+        try {
+            XMLReaderUtils.parseSAX(stream, new EncryptionHandler(), context);
+        } catch (SAXException e) {
+            //the handler can only throw SAXException, so the
+            //EncryptedDocumentException arrives wrapped; unwrap it
+            if (e.getCause() instanceof EncryptedDocumentException) {
+                throw (EncryptedDocumentException) e.getCause();
+            }
+            //any other SAXException is a real parse failure; do not hide it
+            throw e;
+        }
+    }
+
+    /**
+     * Collects {@code CipherReference} URIs and fails at the end of the
+     * document if any were collected.
+     */
+    private static class EncryptionHandler extends DefaultHandler {
+        //insertion-ordered so that the message follows document order
+        private final Set<String> encryptedItems = new LinkedHashSet<>();
+
+        @Override
+        public void startElement(String uri, String localName, String qName,
+                                 Attributes attributes) {
+            if ("CipherReference".equals(localName)) {
+                encryptedItems.add(XMLReaderUtils.getAttrValue("URI", attributes));
+            }
+        }
+
+        @Override
+        public void endDocument() throws SAXException {
+            if (encryptedItems.isEmpty()) {
+                return;
+            }
+            StringBuilder sb = new StringBuilder("EPUB contains encrypted items: ");
+            int added = 0;
+            for (String u : encryptedItems) {
+                if (sb.length() > MAX_MESSAGE_LENGTH) {
+                    sb.append(" and others...");
+                    break;
+                }
+                if (added++ > 0) {
+                    sb.append(", ");
+                }
+                sb.append(u);
+            }
+            //wrapped because endDocument() may only throw SAXException
+            throw new SAXException(new EncryptedDocumentException(sb.toString()));
+        }
+    }
+}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java index 00b42d77f..97b27f27f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java @@ -46,7 +46,9 @@ import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.config.Field; +import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; +import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.FilenameUtils; @@ -77,6 +79,8 @@ public class EpubParser implements Parser { private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet( new HashSet<>(Arrays.asList(MediaType.application("epub+zip"), MediaType.application("x-ibooks+zip")))); + + private static final String META_INF_ENCRYPTION = "META-INF/encryption.xml"; @Field boolean streaming = false; private Parser meta = new DcXMLParser(); @@ -100,6 +104,11 @@ public class EpubParser implements Parser { this.content = content; } + @Field + public void setStreaming(boolean streaming) { + this.streaming = streaming; + } + public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @@ -135,22 +144,37 @@ public class EpubParser implements Parser { private void streamingParse(InputStream 
stream, ContentHandler bodyHandler, Metadata metadata, ParseContext context) throws IOException, TikaException, SAXException { - ZipArchiveInputStream zip = new ZipArchiveInputStream(stream); + ZipArchiveInputStream zip = new ZipArchiveInputStream(stream, "UTF-8", false, true, false); ZipArchiveEntry entry = zip.getNextZipEntry(); + SAXException sax = null; while (entry != null) { if (entry.getName().equals("mimetype")) { updateMimeType(zip, metadata); + } else if (entry.getName().equals(META_INF_ENCRYPTION)) { + checkForDRM(zip); } else if (entry.getName().equals("metadata.xml")) { meta.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".opf")) { opf.parse(zip, new DefaultHandler(), metadata, context); } else if (entry.getName().endsWith(".htm") || entry.getName().endsWith(".html") || entry.getName().endsWith(".xhtml") || entry.getName().endsWith(".xml")) { - content.parse(zip, bodyHandler, metadata, context); + try { + content.parse(zip, bodyHandler, metadata, context); + } catch (SAXException e) { + if (WriteLimitReachedException.isWriteLimitReached(e)) { + throw e; + } + if (sax == null) { + sax = e; + } + } } entry = zip.getNextZipEntry(); } + if (sax != null) { + throw sax; + } } private void updateMimeType(InputStream is, Metadata metadata) throws IOException { @@ -223,6 +247,7 @@ public class EpubParser implements Parser { XHTMLContentHandler xhtml, Metadata metadata, ParseContext context, boolean isStrict) throws IOException, TikaException, SAXException { + String rootOPF = getRoot(zipFile, context); if (rootOPF == null) { return false; @@ -265,6 +290,7 @@ public class EpubParser implements Parser { } extractMetadata(zipFile, metadata, context); + checkForDRM(zipFile); Set<String> processed = new HashSet<>(); for (String id : contentOrderScraper.contentItems) { HRefMediaPair hRefMediaPair = contentOrderScraper.locationMap.get(id); @@ -308,6 +334,30 @@ public class EpubParser implements Parser { return true; } + 
private void checkForDRM(ZipFile zipFile) throws IOException, EncryptedDocumentException { + ZipArchiveEntry zae = zipFile.getEntry(META_INF_ENCRYPTION); + if (zae == null) { + return; + } + try (InputStream is = zipFile.getInputStream(zae)) { + new EncryptionParser().parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); + } catch (EncryptedDocumentException e) { + throw e; + } catch (TikaException | SAXException e) { + //swallow ?! + } + } + + private void checkForDRM(InputStream is) throws IOException, EncryptedDocumentException { + try { + new EncryptionParser().parse(is, new DefaultHandler(), new Metadata(), new ParseContext()); + } catch (EncryptedDocumentException e) { + throw e; + } catch (TikaException | SAXException e) { + //swallow ?! + } + } + private boolean shouldHandleEmbedded(String media) { if (media == null) { return true;