This is an automated email from the ASF dual-hosted git repository. dmeikle pushed a commit to branch branch_1x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 38d226801725ce3742bbc29ca62400cee115927a Author: David Meikle <[email protected]> AuthorDate: Sun Nov 8 23:23:06 2020 +0000 TIKA-3156: Added ability to read hyperlinked images from ODT files --- .../tika/parser/odf/OpenDocumentBodyHandler.java | 13 ++++++++++++ .../apache/tika/parser/odf/OpenDocumentParser.java | 23 ++++++++++++++------- .../org/apache/tika/parser/odf/ODFParserTest.java | 8 +++++++ .../test-documents/testODTEmbeddedImageLink.odt | Bin 0 -> 32873 bytes 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java index 0349c7d..104f510 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentBodyHandler.java @@ -160,6 +160,9 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { MAPPINGS.put( new QName(TEXT_NS, "a"), new TargetElement(XHTML, "a", aAttsMapping)); + MAPPINGS.put( + new QName(DRAW_NS, "a"), + new TargetElement(XHTML, "a", aAttsMapping)); // create HTML tables from table:-tags MAPPINGS.put( @@ -432,6 +435,16 @@ class OpenDocumentBodyHandler extends ElementMappingContentHandler { String namespaceURI, String localName, String qName, Attributes attrs) throws SAXException { + if (DRAW_NS.equals(namespaceURI) && "image".equals(localName)) { + String link = attrs.getValue(XLINK_NS, "href"); + AttributesImpl attr = new AttributesImpl(); + if (!StringUtils.isEmpty(link)) { + attr.addAttribute("", "src", "src", "CDATA", "embedded:" + link); + } + handler.startElement(XHTMLContentHandler.XHTML, "img", "img", attr); + handler.endElement(XHTMLContentHandler.XHTML, "img", "img"); + } + if (BINARY_DATA.equals(localName)) { inBinaryData = true; return; diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java index b408ccf..851d3b6 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java @@ -32,9 +32,7 @@ import java.util.zip.ZipInputStream; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; import org.apache.tika.config.Field; -import org.apache.tika.detect.XmlRootExtractor; import org.apache.tika.exception.TikaException; -import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; @@ -101,6 +99,8 @@ public class OpenDocumentParser extends AbstractParser { private static final String META_NAME = "meta.xml"; + private EmbeddedDocumentUtil embeddedDocumentUtil; + private Parser meta = new OpenDocumentMetaParser(); private Parser content = new OpenDocumentContentParser(); @@ -132,6 +132,8 @@ public class OpenDocumentParser extends AbstractParser { Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { + embeddedDocumentUtil = new EmbeddedDocumentUtil(context); + // Open the Zip stream // Use a File if we can, and an already open zip is even better ZipFile zipFile = null; @@ -245,21 +247,28 @@ public class OpenDocumentParser extends AbstractParser { if (embeddedName.contains("Thumbnails/") || embeddedName.contains("Pictures/")) { - EmbeddedDocumentExtractor embeddedDocumentExtractor = - EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(context); Metadata embeddedMetadata = new Metadata(); - embeddedMetadata.set(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, entry.getName()); + TikaInputStream stream = TikaInputStream.get(zip); + + embeddedMetadata.set(Metadata.RESOURCE_NAME_KEY, entry.getName()); if (embeddedName.startsWith("Thumbnails/")) { embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.THUMBNAIL.toString()); } + if (embeddedName.contains("Pictures/")) { embeddedMetadata.set(TikaMetadataKeys.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); + + MediaType embeddedMimeType = embeddedDocumentUtil.getDetector().detect(stream, embeddedMetadata); + if (embeddedMimeType != null) { + embeddedMetadata.set(Metadata.CONTENT_TYPE, embeddedMimeType.toString()); + } + stream.reset(); } - if (embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) { - embeddedDocumentExtractor.parseEmbedded(zip, + if (embeddedDocumentUtil.shouldParseEmbedded(embeddedMetadata)) { + embeddedDocumentUtil.parseEmbedded(stream, new EmbeddedContentHandler(handler), embeddedMetadata, false); } } else if (extractMacros && embeddedName.contains("Basic/")) { diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java index 719aae5..0b0e2ad 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java @@ -421,6 +421,14 @@ public class ODFParserTest extends TikaTest { assertEquals(3, metadataList.size()); } + @Test + public void testEmbeddedImageAndLink() throws Exception { + String xml = getXML("testODTEmbeddedImageLink.odt").xml; + assertContains("<a href=\"https://tika.apache.org/\">" + + "<img src=\"embedded:Pictures/10000201000001240000006457F5B1D1243E0671.png\" />" + + "<span>Visit Tika</span></a>", xml); + } + @Test(expected = IOException.class) public void testInvalidFromStream() throws Exception { try (InputStream is = this.getClass().getResource( diff --git a/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt b/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt new file mode 100644 index 0000000..88970f7 Binary files /dev/null and b/tika-parsers/src/test/resources/test-documents/testODTEmbeddedImageLink.odt differ
