This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 118734a TIKA-3644 -- Improve consistency in reporting package-entry
divs across all parsers for embedded files
118734a is described below
commit 118734a1317fa13ad66959fdc28969ca50a49643
Author: tallison <[email protected]>
AuthorDate: Thu Jan 13 14:57:29 2022 -0500
TIKA-3644 -- Improve consistency in reporting package-entry divs across all
parsers for embedded files
---
CHANGES.txt | 3 +++
.../tika/parser/apple/AppleSingleFileParser.java | 2 +-
.../org/apache/tika/parser/apple/PListParser.java | 2 +-
.../org/apache/tika/parser/crypto/TSDParser.java | 2 +-
.../org/apache/tika/parser/html/HtmlHandler.java | 6 ++---
.../apache/tika/parser/microsoft/EMFParser.java | 4 +--
.../apache/tika/parser/microsoft/TNEFParser.java | 2 +-
.../microsoft/onenote/OneNoteTreeWalker.java | 2 +-
.../microsoft/ooxml/AbstractOOXMLExtractor.java | 30 +++++++++++-----------
.../microsoft/ooxml/OOXMLExtractorFactory.java | 2 +-
.../ooxml/xwpf/ml2006/BinaryDataHandler.java | 2 +-
.../parser/microsoft/rtf/RTFEmbObjHandler.java | 2 +-
.../tika/parser/microsoft/xml/WordMLParser.java | 2 +-
.../org/apache/tika/parser/epub/EpubParser.java | 2 +-
.../parser/odf/FlatOpenDocumentMacroHandler.java | 2 +-
.../tika/parser/odf/OpenDocumentBodyHandler.java | 2 +-
.../apache/tika/parser/odf/OpenDocumentParser.java | 6 +++--
.../org/apache/tika/parser/odf/ODFParserTest.java | 2 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 6 ++---
.../tika/parser/pdf/ImageGraphicsEngine.java | 4 +--
.../tika/parser/RecursiveParserWrapperTest.java | 2 +-
.../tika/parser/microsoft/XML2003ParserTest.java | 3 ++-
22 files changed, 48 insertions(+), 42 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index 9fbe8ed..3a92b82 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.2.2 - ???
+ * Improve consistency in reporting package-entry divs across
+ all parsers for embedded files (TIKA-3644).
+
* Improve configuration of maps as params for parsers in
TikaConfig (TIKA-3645).
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index afe7b69..cf1fa8c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -100,7 +100,7 @@ public class AppleSingleFileParser extends AbstractParser {
// stream to ensure that not more than contentFieldInfo.length
bytes
// are read
ex.parseEmbedded(new CloseShieldInputStream(stream), xhtml,
embeddedMetadata,
- false);
+ true);
}
}
xhtml.endDocument();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
index 8f0537d..8b15839 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
@@ -199,7 +199,7 @@ public class PListParser extends AbstractParser {
try (TikaInputStream tis = TikaInputStream.get(value.bytes())) {
state.embeddedDocumentExtractor
- .parseEmbedded(tis, state.xhtml, embeddedMetadata, false);
+ .parseEmbedded(tis, state.xhtml, embeddedMetadata, true);
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
index 5d0b05e..4c38b11 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
@@ -172,7 +172,7 @@ public class TSDParser extends AbstractParser {
cmsTimeStampedDataParser = new
CMSTimeStampedDataParser(stream);
try (InputStream is =
TikaInputStream.get(cmsTimeStampedDataParser.getContent())) {
- edx.parseEmbedded(is, handler, metadata, false);
+ edx.parseEmbedded(is, handler, metadata, true);
}
} catch (SecurityException e) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index 4749c43..4a208d8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -323,7 +323,7 @@ class HtmlHandler extends TextContentHandler {
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
try (InputStream stream = dataURIScheme.getInputStream()) {
- embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m,
false);
+ embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m,
true);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
metadata);
}
@@ -357,7 +357,7 @@ class HtmlHandler extends TextContentHandler {
if
(embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) {
try (InputStream dataURISchemeInputStream =
dataURIScheme.getInputStream()) {
embeddedDocumentExtractor
- .parseEmbedded(dataURISchemeInputStream, xhtml,
dataUriMetadata, false);
+ .parseEmbedded(dataURISchemeInputStream, xhtml,
dataUriMetadata, true);
} catch (IOException e) {
//swallow
}
@@ -366,7 +366,7 @@ class HtmlHandler extends TextContentHandler {
try (InputStream stream = new ByteArrayInputStream(
script.toString().getBytes(StandardCharsets.UTF_8))) {
- embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, false);
+ embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, true);
} catch (IOException e) {
//shouldn't ever happen
} finally {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
index 204b99f..f699756 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
@@ -67,7 +67,7 @@ public class EMFParser extends AbstractParser {
if
(embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentExtractor
.parseEmbedded(is, new
EmbeddedContentHandler(handler), embeddedMetadata,
- false);
+ true);
}
} catch (IOException e) {
//swallow
@@ -155,7 +155,7 @@ public class EMFParser extends AbstractParser {
try (InputStream is = TikaInputStream.get(bytes)) {
embeddedDocumentExtractor
.parseEmbedded(is, new
EmbeddedContentHandler(contentHandler),
- embeddedMetadata, false);
+ embeddedMetadata, true);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
index aeec6ef..1fba5e3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
@@ -117,7 +117,7 @@ public class TNEFParser extends AbstractParser {
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
embeddedExtractor.parseEmbedded(TikaInputStream.get(contents),
- new EmbeddedContentHandler(handler), metadata, false);
+ new EmbeddedContentHandler(handler), metadata, true);
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
index ee6938a..3dbe8e7 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -347,7 +347,7 @@ class OneNoteTreeWalker {
try {
stream = TikaInputStream.get(buf.array());
embeddedDocumentExtractor.parseEmbedded(stream, new
EmbeddedContentHandler(xhtml),
- embeddedMetadata, false);
+ embeddedMetadata, true);
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
xhtml.startElement("div", attributes);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 2cdf4ff..f965dd4 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -185,7 +185,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream),
- new EmbeddedContentHandler(handler),
thumbnailMetadata, false);
+ new EmbeddedContentHandler(handler),
thumbnailMetadata, true);
}
tStream.close();
@@ -200,7 +200,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
}
}
- private void handleEmbeddedParts(ContentHandler handler, Metadata metadata)
+ private void handleEmbeddedParts(XHTMLContentHandler xhtml, Metadata
metadata)
throws TikaException, IOException, SAXException {
//keep track of media items that have been handled
//there can be multiple relationships pointing to the
@@ -215,7 +215,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
}
for (PackageRelationship rel : source.getRelationships()) {
try {
- handleEmbeddedPart(source, rel, handler, metadata,
handledTarget);
+ handleEmbeddedPart(source, rel, xhtml, metadata,
handledTarget);
} catch (SAXException | SecurityException e) {
throw e;
} catch (Exception e) {
@@ -229,7 +229,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
}
private void handleEmbeddedPart(PackagePart source, PackageRelationship
rel,
- ContentHandler handler, Metadata
parentMetadata,
+ XHTMLContentHandler xhtml, Metadata
parentMetadata,
Set<String> handledTarget)
throws IOException, SAXException, TikaException,
InvalidFormatException {
URI targetURI = rel.getTargetURI();
@@ -265,7 +265,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
String type = rel.getRelationshipType();
if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type) &&
TYPE_OLE_OBJECT.equals(target.getContentType())) {
- handleEmbeddedOLE(target, handler, sourceDesc + rel.getId(),
parentMetadata);
+ handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(),
parentMetadata);
if (targetURI != null) {
handledTarget.add(targetURI.toString());
}
@@ -273,12 +273,12 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
RELATION_AUDIO.equals(type) ||
PackageRelationshipTypes.IMAGE_PART.equals(type) ||
POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type) ||
POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
- handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
+ handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId());
if (targetURI != null) {
handledTarget.add(targetURI.toString());
}
} else if (XSSFRelation.VBA_MACROS.getRelation().equals(type)) {
- handleMacros(target, handler);
+ handleMacros(target, xhtml);
if (targetURI != null) {
handledTarget.add(targetURI.toString());
}
@@ -289,7 +289,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
/**
* Handles an embedded OLE object in the document
*/
- private void handleEmbeddedOLE(PackagePart part, ContentHandler handler,
String rel,
+ private void handleEmbeddedOLE(PackagePart part, XHTMLContentHandler
xhtml, String rel,
Metadata parentMetadata) throws
IOException, SAXException {
// A POIFSFileSystem needs to be at least 3 blocks big to be valid
if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
@@ -323,8 +323,8 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
stream =
TikaInputStream.get(fs.createDocumentInputStream(packageEntryName));
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
embeddedExtractor
- .parseEmbedded(stream, new
EmbeddedContentHandler(handler), metadata,
- false);
+ .parseEmbedded(stream, xhtml, metadata,
+ true);
}
} else if (POIFSDocumentType.OLE10_NATIVE == type) {
// TIKA-704: OLE 1.0 embedded document
@@ -345,11 +345,11 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
if (stream != null &&
embeddedExtractor.shouldParseEmbedded(metadata)) {
embeddedExtractor
- .parseEmbedded(stream, new
EmbeddedContentHandler(handler), metadata,
- false);
+ .parseEmbedded(stream, xhtml, metadata,
+ true);
}
} else {
- handleEmbeddedFile(part, handler, rel);
+ handleEmbeddedFile(part, xhtml, rel);
}
} catch (FileNotFoundException e) {
// There was no CONTENTS entry, so skip this part
@@ -387,7 +387,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
/**
* Handles an embedded file in the document
*/
- protected void handleEmbeddedFile(PackagePart part, ContentHandler
handler, String rel)
+ protected void handleEmbeddedFile(PackagePart part, XHTMLContentHandler
xhtml, String rel)
throws SAXException, IOException {
Metadata metadata = new Metadata();
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel);
@@ -404,7 +404,7 @@ public abstract class AbstractOOXMLExtractor implements
OOXMLExtractor {
if (embeddedExtractor.shouldParseEmbedded(metadata)) {
try (TikaInputStream tis =
TikaInputStream.get(part.getInputStream())) {
embeddedExtractor
- .parseEmbedded(tis, new
EmbeddedContentHandler(handler), metadata, false);
+ .parseEmbedded(tis, xhtml, metadata, true);
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 24e824e..33c92fe 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -249,7 +249,7 @@ public class OOXMLExtractorFactory {
} catch (OpenXML4JException | XmlException e) {
throw new TikaException("Error creating OOXML extractor", e);
} catch (RuntimeSAXException e) {
- throw(SAXException) e.getCause();
+ throw (SAXException) e.getCause();
} finally {
if (tmpRepairedCopy != null) {
if (pkg != null) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
index 75c7d46..e0898fe 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
@@ -67,7 +67,7 @@ class BinaryDataHandler extends AbstractPartHandler {
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
Metadata embeddedMetadata = new Metadata();
try (TikaInputStream stream =
TikaInputStream.get(getInputStream())) {
- embeddedDocumentExtractor.parseEmbedded(stream, handler,
embeddedMetadata, false);
+ embeddedDocumentExtractor.parseEmbedded(stream, handler,
embeddedMetadata, true);
} catch (IOException e) {
throw new TikaException("error in finishing part", e);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
index 02f67ef..a927f5d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
@@ -222,7 +222,7 @@ class RTFEmbObjHandler {
try {
embeddedDocumentUtil
.parseEmbedded(stream, new
EmbeddedContentHandler(handler), metadata,
- false);
+ true);
} catch (IOException e) {
EmbeddedDocumentUtil.recordEmbeddedStreamException(e,
metadata);
} finally {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 5ed1399..6b4851b 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -285,7 +285,7 @@ public class WordMLParser extends AbstractXML2003Parser {
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
}
if
(embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
- embeddedDocumentExtractor.parseEmbedded(is, handler,
metadata, false);
+ embeddedDocumentExtractor.parseEmbedded(is, handler,
metadata, true);
}
} catch (IOException e) {
//log
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index dc96a4c..31fc80f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -359,7 +359,7 @@ public class EpubParser extends AbstractParser {
try {
embeddedDocumentExtractor
.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
embeddedMetadata,
- false);
+ true);
} finally {
IOUtils.closeQuietly(stream);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
index 7848f39..3bd4b92 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
@@ -110,7 +110,7 @@ class FlatOpenDocumentMacroHandler extends
ContentHandlerDecorator {
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
try (InputStream is = TikaInputStream.get(bytes)) {
embeddedDocumentExtractor
- .parseEmbedded(is, contentHandler, embeddedMetadata,
false);
+ .parseEmbedded(is, contentHandler, embeddedMetadata,
true);
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 792bcaa..94b1d86 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -530,7 +530,7 @@ class OpenDocumentBodyHandler extends
ElementMappingContentHandler {
Metadata embeddedMetadata = new Metadata();
if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
try (InputStream is = TikaInputStream.get(bytes)) {
- embeddedDocumentExtractor.parseEmbedded(is, handler,
embeddedMetadata, false);
+ embeddedDocumentExtractor.parseEmbedded(is, handler,
embeddedMetadata, true);
}
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index fb64fbd..395eb2e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -256,7 +256,9 @@ public class OpenDocumentParser extends AbstractParser {
EmbeddedDocumentUtil embeddedDocumentUtil)
throws IOException, SAXException, TikaException {
-
+ if (entry.isDirectory()) {
+ return;
+ }
if (entry.getName().contains("manifest.xml")) {
checkForEncryption(zip, context);
} else if (entry.getName().equals("mimetype")) {
@@ -308,7 +310,7 @@ public class OpenDocumentParser extends AbstractParser {
if
(embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
embeddedDocumentUtil.parseEmbedded(stream, new
EmbeddedContentHandler(handler),
- embeddedMetadata, false);
+ embeddedMetadata, true);
}
} else if (extractMacros && embeddedName.contains("Basic/")) {
//process all files under Basic/; let maybeHandleMacro figure
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 27c3848..fd9bf6f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -176,7 +176,7 @@ public class ODFParserTest extends TikaTest {
// Note - contents of maths files not currently supported
String content = handler.toString().trim();
- assertEquals("", content);
+ assertEquals("Thumbnails/thumbnail.png", content.trim());
}
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 41567b5..ae8c6bb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -289,7 +289,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try {
embeddedDocumentExtractor
.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
embeddedMetadata,
- false);
+ true);
} catch (IOException e) {
handleCatchableIOE(e);
}
@@ -424,7 +424,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
try {
embeddedDocumentExtractor
.parseEmbedded(stream, new EmbeddedContentHandler(xhtml),
embeddedMetadata,
- false);
+ true);
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", fileName);
@@ -746,7 +746,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
js = (js == null) ? "" : js;
if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
try (InputStream is =
TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
- embeddedDocumentExtractor.parseEmbedded(is, xhtml, m,
false);
+ embeddedDocumentExtractor.parseEmbedded(is, xhtml, m,
true);
}
}
addNonNullAttribute("class", "javascript", attributes);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
index 63c3825..8515fb3 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -408,7 +408,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
try (InputStream embeddedIs =
TikaInputStream.get(buffer.toByteArray())) {
embeddedDocumentExtractor
.parseEmbedded(embeddedIs, new
EmbeddedContentHandler(xhtml), metadata,
- false);
+ true);
}
}
@@ -429,7 +429,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
embeddedDocumentExtractor.parseEmbedded(TikaInputStream.get(new
byte[0]),
- new EmbeddedContentHandler(xhtml), metadata, false);
+ new EmbeddedContentHandler(xhtml), metadata, true);
} finally {
//replace whatever was there before
parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
before);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 9cd9346..e90ea73 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -95,7 +95,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
RecursiveParserWrapper wrapper = new
RecursiveParserWrapper(AUTO_DETECT_PARSER);
InputStream stream =
getResourceAsStream("/test-documents/test_recursive_embedded.docx");
RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
- new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
+ new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 70));
wrapper.parse(stream, handler, metadata, context);
List<Metadata> list = handler.getMetadataList();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
index 2765a2a..ec5d230 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
@@ -42,7 +42,8 @@ public class XML2003ParserTest extends TikaTest {
assertContains("<meta name=\"meta:character-count-with-spaces\"
content=\"256\"", xml);
//do not allow nested <p> elements
assertContains(
- "<p /> <img href=\"02000003.jpg\" /><p /> <p><img
href=\"02000004.jpg\" /></p>",
+ "<img href=\"02000003.jpg\" /><div
class=\"package-entry\"><h1>02000003.jpg</h1> " +
+ "</div> <p /> <p><img href=\"02000004.jpg\" />",
xml);
assertContains("<table><tbody>", xml);
assertContains("</tbody></table>", xml);