This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 118734a  TIKA-3644 -- Improve consistency in reporting package-entry divs across all parsers for embedded files
118734a is described below

commit 118734a1317fa13ad66959fdc28969ca50a49643
Author: tallison <[email protected]>
AuthorDate: Thu Jan 13 14:57:29 2022 -0500

    TIKA-3644 -- Improve consistency in reporting package-entry divs across all parsers for embedded files
---
 CHANGES.txt                                        |  3 +++
 .../tika/parser/apple/AppleSingleFileParser.java   |  2 +-
 .../org/apache/tika/parser/apple/PListParser.java  |  2 +-
 .../org/apache/tika/parser/crypto/TSDParser.java   |  2 +-
 .../org/apache/tika/parser/html/HtmlHandler.java   |  6 ++---
 .../apache/tika/parser/microsoft/EMFParser.java    |  4 +--
 .../apache/tika/parser/microsoft/TNEFParser.java   |  2 +-
 .../microsoft/onenote/OneNoteTreeWalker.java       |  2 +-
 .../microsoft/ooxml/AbstractOOXMLExtractor.java    | 30 +++++++++++-----------
 .../microsoft/ooxml/OOXMLExtractorFactory.java     |  2 +-
 .../ooxml/xwpf/ml2006/BinaryDataHandler.java       |  2 +-
 .../parser/microsoft/rtf/RTFEmbObjHandler.java     |  2 +-
 .../tika/parser/microsoft/xml/WordMLParser.java    |  2 +-
 .../org/apache/tika/parser/epub/EpubParser.java    |  2 +-
 .../parser/odf/FlatOpenDocumentMacroHandler.java   |  2 +-
 .../tika/parser/odf/OpenDocumentBodyHandler.java   |  2 +-
 .../apache/tika/parser/odf/OpenDocumentParser.java |  6 +++--
 .../org/apache/tika/parser/odf/ODFParserTest.java  |  2 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |  6 ++---
 .../tika/parser/pdf/ImageGraphicsEngine.java       |  4 +--
 .../tika/parser/RecursiveParserWrapperTest.java    |  2 +-
 .../tika/parser/microsoft/XML2003ParserTest.java   |  3 ++-
 22 files changed, 48 insertions(+), 42 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 9fbe8ed..3a92b82 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.2.2 - ???
 
+   * Improve consistency in reporting package-entry divs across
+     all parsers for embedded files (TIKA-3644).
+
    * Improve configuration of maps as params for parsers in
      TikaConfig (TIKA-3645).
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
index afe7b69..cf1fa8c 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/AppleSingleFileParser.java
@@ -100,7 +100,7 @@ public class AppleSingleFileParser extends AbstractParser {
                 // stream to ensure that not more than contentFieldInfo.length 
bytes
                 // are read
                 ex.parseEmbedded(new CloseShieldInputStream(stream), xhtml, 
embeddedMetadata,
-                        false);
+                        true);
             }
         }
         xhtml.endDocument();
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
index 8f0537d..8b15839 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-apple-module/src/main/java/org/apache/tika/parser/apple/PListParser.java
@@ -199,7 +199,7 @@ public class PListParser extends AbstractParser {
 
         try (TikaInputStream tis = TikaInputStream.get(value.bytes())) {
             state.embeddedDocumentExtractor
-                    .parseEmbedded(tis, state.xhtml, embeddedMetadata, false);
+                    .parseEmbedded(tis, state.xhtml, embeddedMetadata, true);
         }
     }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
index 5d0b05e..4c38b11 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-crypto-module/src/main/java/org/apache/tika/parser/crypto/TSDParser.java
@@ -172,7 +172,7 @@ public class TSDParser extends AbstractParser {
                 cmsTimeStampedDataParser = new 
CMSTimeStampedDataParser(stream);
 
                 try (InputStream is = 
TikaInputStream.get(cmsTimeStampedDataParser.getContent())) {
-                    edx.parseEmbedded(is, handler, metadata, false);
+                    edx.parseEmbedded(is, handler, metadata, true);
                 }
 
             } catch (SecurityException e) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index 4749c43..4a208d8 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -323,7 +323,7 @@ class HtmlHandler extends TextContentHandler {
                 EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context);
         if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
             try (InputStream stream = dataURIScheme.getInputStream()) {
-                embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, 
false);
+                embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, 
true);
             } catch (IOException e) {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, 
metadata);
             }
@@ -357,7 +357,7 @@ class HtmlHandler extends TextContentHandler {
             if 
(embeddedDocumentExtractor.shouldParseEmbedded(dataUriMetadata)) {
                 try (InputStream dataURISchemeInputStream = 
dataURIScheme.getInputStream()) {
                     embeddedDocumentExtractor
-                            .parseEmbedded(dataURISchemeInputStream, xhtml, 
dataUriMetadata, false);
+                            .parseEmbedded(dataURISchemeInputStream, xhtml, 
dataUriMetadata, true);
                 } catch (IOException e) {
                     //swallow
                 }
@@ -366,7 +366,7 @@ class HtmlHandler extends TextContentHandler {
 
         try (InputStream stream = new ByteArrayInputStream(
                 script.toString().getBytes(StandardCharsets.UTF_8))) {
-            embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, false);
+            embeddedDocumentExtractor.parseEmbedded(stream, xhtml, m, true);
         } catch (IOException e) {
             //shouldn't ever happen
         } finally {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
index 204b99f..f699756 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/EMFParser.java
@@ -67,7 +67,7 @@ public class EMFParser extends AbstractParser {
             if 
(embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
                 embeddedDocumentExtractor
                         .parseEmbedded(is, new 
EmbeddedContentHandler(handler), embeddedMetadata,
-                                false);
+                                true);
             }
         } catch (IOException e) {
             //swallow
@@ -155,7 +155,7 @@ public class EMFParser extends AbstractParser {
             try (InputStream is = TikaInputStream.get(bytes)) {
                 embeddedDocumentExtractor
                         .parseEmbedded(is, new 
EmbeddedContentHandler(contentHandler),
-                                embeddedMetadata, false);
+                                embeddedMetadata, true);
 
             }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
index aeec6ef..1fba5e3 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/TNEFParser.java
@@ -117,7 +117,7 @@ public class TNEFParser extends AbstractParser {
 
         if (embeddedExtractor.shouldParseEmbedded(metadata)) {
             embeddedExtractor.parseEmbedded(TikaInputStream.get(contents),
-                    new EmbeddedContentHandler(handler), metadata, false);
+                    new EmbeddedContentHandler(handler), metadata, true);
         }
     }
 }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
index ee6938a..3dbe8e7 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNoteTreeWalker.java
@@ -347,7 +347,7 @@ class OneNoteTreeWalker {
         try {
             stream = TikaInputStream.get(buf.array());
             embeddedDocumentExtractor.parseEmbedded(stream, new 
EmbeddedContentHandler(xhtml),
-                    embeddedMetadata, false);
+                    embeddedMetadata, true);
             AttributesImpl attributes = new AttributesImpl();
             attributes.addAttribute("", "class", "class", "CDATA", "embedded");
             xhtml.startElement("div", attributes);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
index 2cdf4ff..f965dd4 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java
@@ -185,7 +185,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
 
                 if (embeddedExtractor.shouldParseEmbedded(thumbnailMetadata)) {
                     
embeddedExtractor.parseEmbedded(TikaInputStream.get(tStream),
-                            new EmbeddedContentHandler(handler), 
thumbnailMetadata, false);
+                            new EmbeddedContentHandler(handler), 
thumbnailMetadata, true);
                 }
 
                 tStream.close();
@@ -200,7 +200,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
         }
     }
 
-    private void handleEmbeddedParts(ContentHandler handler, Metadata metadata)
+    private void handleEmbeddedParts(XHTMLContentHandler xhtml, Metadata 
metadata)
             throws TikaException, IOException, SAXException {
         //keep track of media items that have been handled
         //there can be multiple relationships pointing to the
@@ -215,7 +215,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
                 }
                 for (PackageRelationship rel : source.getRelationships()) {
                     try {
-                        handleEmbeddedPart(source, rel, handler, metadata, 
handledTarget);
+                        handleEmbeddedPart(source, rel, xhtml, metadata, 
handledTarget);
                     } catch (SAXException | SecurityException e) {
                         throw e;
                     } catch (Exception e) {
@@ -229,7 +229,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
     }
 
     private void handleEmbeddedPart(PackagePart source, PackageRelationship 
rel,
-                                    ContentHandler handler, Metadata 
parentMetadata,
+                                    XHTMLContentHandler xhtml, Metadata 
parentMetadata,
                                     Set<String> handledTarget)
             throws IOException, SAXException, TikaException, 
InvalidFormatException {
         URI targetURI = rel.getTargetURI();
@@ -265,7 +265,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
         String type = rel.getRelationshipType();
         if (POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type) &&
                 TYPE_OLE_OBJECT.equals(target.getContentType())) {
-            handleEmbeddedOLE(target, handler, sourceDesc + rel.getId(), 
parentMetadata);
+            handleEmbeddedOLE(target, xhtml, sourceDesc + rel.getId(), 
parentMetadata);
             if (targetURI != null) {
                 handledTarget.add(targetURI.toString());
             }
@@ -273,12 +273,12 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
                 RELATION_AUDIO.equals(type) || 
PackageRelationshipTypes.IMAGE_PART.equals(type) ||
                 POIXMLDocument.PACK_OBJECT_REL_TYPE.equals(type) ||
                 POIXMLDocument.OLE_OBJECT_REL_TYPE.equals(type)) {
-            handleEmbeddedFile(target, handler, sourceDesc + rel.getId());
+            handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId());
             if (targetURI != null) {
                 handledTarget.add(targetURI.toString());
             }
         } else if (XSSFRelation.VBA_MACROS.getRelation().equals(type)) {
-            handleMacros(target, handler);
+            handleMacros(target, xhtml);
             if (targetURI != null) {
                 handledTarget.add(targetURI.toString());
             }
@@ -289,7 +289,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
     /**
      * Handles an embedded OLE object in the document
      */
-    private void handleEmbeddedOLE(PackagePart part, ContentHandler handler, 
String rel,
+    private void handleEmbeddedOLE(PackagePart part, XHTMLContentHandler 
xhtml, String rel,
                                    Metadata parentMetadata) throws 
IOException, SAXException {
         // A POIFSFileSystem needs to be at least 3 blocks big to be valid
         if (part.getSize() >= 0 && part.getSize() < 512 * 3) {
@@ -323,8 +323,8 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
                 stream = 
TikaInputStream.get(fs.createDocumentInputStream(packageEntryName));
                 if (embeddedExtractor.shouldParseEmbedded(metadata)) {
                     embeddedExtractor
-                            .parseEmbedded(stream, new 
EmbeddedContentHandler(handler), metadata,
-                                    false);
+                            .parseEmbedded(stream, xhtml, metadata,
+                                    true);
                 }
             } else if (POIFSDocumentType.OLE10_NATIVE == type) {
                 // TIKA-704: OLE 1.0 embedded document
@@ -345,11 +345,11 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
 
                 if (stream != null && 
embeddedExtractor.shouldParseEmbedded(metadata)) {
                     embeddedExtractor
-                            .parseEmbedded(stream, new 
EmbeddedContentHandler(handler), metadata,
-                                    false);
+                            .parseEmbedded(stream, xhtml, metadata,
+                                    true);
                 }
             } else {
-                handleEmbeddedFile(part, handler, rel);
+                handleEmbeddedFile(part, xhtml, rel);
             }
         } catch (FileNotFoundException e) {
             // There was no CONTENTS entry, so skip this part
@@ -387,7 +387,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
     /**
      * Handles an embedded file in the document
      */
-    protected void handleEmbeddedFile(PackagePart part, ContentHandler 
handler, String rel)
+    protected void handleEmbeddedFile(PackagePart part, XHTMLContentHandler 
xhtml, String rel)
             throws SAXException, IOException {
         Metadata metadata = new Metadata();
         metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel);
@@ -404,7 +404,7 @@ public abstract class AbstractOOXMLExtractor implements 
OOXMLExtractor {
         if (embeddedExtractor.shouldParseEmbedded(metadata)) {
             try (TikaInputStream tis = 
TikaInputStream.get(part.getInputStream())) {
                 embeddedExtractor
-                        .parseEmbedded(tis, new 
EmbeddedContentHandler(handler), metadata, false);
+                        .parseEmbedded(tis, xhtml, metadata, true);
             }
         }
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
index 24e824e..33c92fe 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java
@@ -249,7 +249,7 @@ public class OOXMLExtractorFactory {
         } catch (OpenXML4JException | XmlException e) {
             throw new TikaException("Error creating OOXML extractor", e);
         } catch (RuntimeSAXException e) {
-            throw(SAXException) e.getCause();
+            throw (SAXException) e.getCause();
         } finally {
             if (tmpRepairedCopy != null) {
                 if (pkg != null) {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
index 75c7d46..e0898fe 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/xwpf/ml2006/BinaryDataHandler.java
@@ -67,7 +67,7 @@ class BinaryDataHandler extends AbstractPartHandler {
                     
EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
             Metadata embeddedMetadata = new Metadata();
             try (TikaInputStream stream = 
TikaInputStream.get(getInputStream())) {
-                embeddedDocumentExtractor.parseEmbedded(stream, handler, 
embeddedMetadata, false);
+                embeddedDocumentExtractor.parseEmbedded(stream, handler, 
embeddedMetadata, true);
             } catch (IOException e) {
                 throw new TikaException("error in finishing part", e);
             }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
index 02f67ef..a927f5d 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFEmbObjHandler.java
@@ -222,7 +222,7 @@ class RTFEmbObjHandler {
             try {
                 embeddedDocumentUtil
                         .parseEmbedded(stream, new 
EmbeddedContentHandler(handler), metadata,
-                                false);
+                                true);
             } catch (IOException e) {
                 EmbeddedDocumentUtil.recordEmbeddedStreamException(e, 
metadata);
             } finally {
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
index 5ed1399..6b4851b 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/xml/WordMLParser.java
@@ -285,7 +285,7 @@ public class WordMLParser extends AbstractXML2003Parser {
                         
metadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, pictSource);
                     }
                     if 
(embeddedDocumentExtractor.shouldParseEmbedded(metadata)) {
-                        embeddedDocumentExtractor.parseEmbedded(is, handler, 
metadata, false);
+                        embeddedDocumentExtractor.parseEmbedded(is, handler, 
metadata, true);
                     }
                 } catch (IOException e) {
                     //log
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
index dc96a4c..31fc80f 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/epub/EpubParser.java
@@ -359,7 +359,7 @@ public class EpubParser extends AbstractParser {
         try {
             embeddedDocumentExtractor
                     .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), 
embeddedMetadata,
-                            false);
+                            true);
 
         } finally {
             IOUtils.closeQuietly(stream);
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
index 7848f39..3bd4b92 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/FlatOpenDocumentMacroHandler.java
@@ -110,7 +110,7 @@ class FlatOpenDocumentMacroHandler extends 
ContentHandlerDecorator {
         if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             try (InputStream is = TikaInputStream.get(bytes)) {
                 embeddedDocumentExtractor
-                        .parseEmbedded(is, contentHandler, embeddedMetadata, 
false);
+                        .parseEmbedded(is, contentHandler, embeddedMetadata, 
true);
             }
         }
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
index 792bcaa..94b1d86 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java
@@ -530,7 +530,7 @@ class OpenDocumentBodyHandler extends 
ElementMappingContentHandler {
         Metadata embeddedMetadata = new Metadata();
         if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
             try (InputStream is = TikaInputStream.get(bytes)) {
-                embeddedDocumentExtractor.parseEmbedded(is, handler, 
embeddedMetadata, false);
+                embeddedDocumentExtractor.parseEmbedded(is, handler, 
embeddedMetadata, true);
             }
         }
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index fb64fbd..395eb2e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -256,7 +256,9 @@ public class OpenDocumentParser extends AbstractParser {
                                 EmbeddedDocumentUtil embeddedDocumentUtil)
             throws IOException, SAXException, TikaException {
 
-
+        if (entry.isDirectory()) {
+            return;
+        }
         if (entry.getName().contains("manifest.xml")) {
             checkForEncryption(zip, context);
         } else if (entry.getName().equals("mimetype")) {
@@ -308,7 +310,7 @@ public class OpenDocumentParser extends AbstractParser {
 
                 if 
(embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) {
                     embeddedDocumentUtil.parseEmbedded(stream, new 
EmbeddedContentHandler(handler),
-                            embeddedMetadata, false);
+                            embeddedMetadata, true);
                 }
             } else if (extractMacros && embeddedName.contains("Basic/")) {
                 //process all files under Basic/; let maybeHandleMacro figure
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index 27c3848..fd9bf6f 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -176,7 +176,7 @@ public class ODFParserTest extends TikaTest {
 
             // Note - contents of maths files not currently supported
             String content = handler.toString().trim();
-            assertEquals("", content);
+            assertEquals("Thumbnails/thumbnail.png", content.trim());
         }
     }
 
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 41567b5..ae8c6bb 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -289,7 +289,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         try {
             embeddedDocumentExtractor
                     .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), 
embeddedMetadata,
-                            false);
+                            true);
         } catch (IOException e) {
             handleCatchableIOE(e);
         }
@@ -424,7 +424,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         try {
             embeddedDocumentExtractor
                     .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), 
embeddedMetadata,
-                            false);
+                            true);
 
             attributes.addAttribute("", "class", "class", "CDATA", "embedded");
             attributes.addAttribute("", "id", "id", "CDATA", fileName);
@@ -746,7 +746,7 @@ class AbstractPDF2XHTML extends PDFTextStripper {
             js = (js == null) ? "" : js;
             if (embeddedDocumentExtractor.shouldParseEmbedded(m)) {
                 try (InputStream is = TikaInputStream.get(js.getBytes(StandardCharsets.UTF_8))) {
-                    embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, false);
+                    embeddedDocumentExtractor.parseEmbedded(is, xhtml, m, true);
                 }
             }
             addNonNullAttribute("class", "javascript", attributes);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
index 63c3825..8515fb3 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/ImageGraphicsEngine.java
@@ -408,7 +408,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
             try (InputStream embeddedIs = TikaInputStream.get(buffer.toByteArray())) {
                 embeddedDocumentExtractor
                         .parseEmbedded(embeddedIs, new EmbeddedContentHandler(xhtml), metadata,
-                                false);
+                                true);
             }
         }
 
@@ -429,7 +429,7 @@ class ImageGraphicsEngine extends PDFGraphicsStreamEngine {
             parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class,
                     ZeroByteFileException.IGNORE_ZERO_BYTE_FILE_EXCEPTION);
             embeddedDocumentExtractor.parseEmbedded(TikaInputStream.get(new byte[0]),
-                    new EmbeddedContentHandler(xhtml), metadata, false);
+                    new EmbeddedContentHandler(xhtml), metadata, true);
         } finally {
             //replace whatever was there before
             parseContext.set(ZeroByteFileException.IgnoreZeroByteFileException.class, before);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 9cd9346..e90ea73 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -95,7 +95,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
         RecursiveParserWrapper wrapper = new RecursiveParserWrapper(AUTO_DETECT_PARSER);
         InputStream stream = getResourceAsStream("/test-documents/test_recursive_embedded.docx");
         RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(
-                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 60));
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, 70));
         wrapper.parse(stream, handler, metadata, context);
         List<Metadata> list = handler.getMetadataList();
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
index 2765a2a..ec5d230 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/XML2003ParserTest.java
@@ -42,7 +42,8 @@ public class XML2003ParserTest extends TikaTest {
         assertContains("<meta name=\"meta:character-count-with-spaces\" content=\"256\"", xml);
         //do not allow nested <p> elements
         assertContains(
-                "<p /> <img href=\"02000003.jpg\" /><p /> <p><img href=\"02000004.jpg\" /></p>",
+                "<img href=\"02000003.jpg\" /><div class=\"package-entry\"><h1>02000003.jpg</h1> " +
+                        "</div> <p /> <p><img href=\"02000004.jpg\" />",
                 xml);
         assertContains("<table><tbody>", xml);
         assertContains("</tbody></table>", xml);

Reply via email to