This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push: new f6290858b TIKA-4124 -- extract alternate format chunk from ooxml (#1317) f6290858b is described below commit f6290858bae72ed1c561ce75812c577e6b736a32 Author: Tim Allison <talli...@apache.org> AuthorDate: Tue Sep 5 16:29:00 2023 -0400 TIKA-4124 -- extract alternate format chunk from ooxml (#1317) --- .../java/org/apache/tika/metadata/TikaCoreProperties.java | 3 ++- .../tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java | 12 ++++++++++++ .../parser/microsoft/ooxml/OOXMLContainerExtractionTest.java | 11 +++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index bbf3cd61a..a75eb8acf 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -365,6 +365,7 @@ public interface TikaCoreProperties { FONT,//embedded font files THUMBNAIL, //TODO: set this in parsers that handle thumbnails RENDERING, //if a file has been rendered - VERSION //an earlier version of a file + VERSION, //an earlier version of a file + ALTERNATE_FORMAT_CHUNK //OOXML inline alternate format chunk } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index 55d3893e6..1475b7838 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -89,6 +89,10 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { "http://schemas.openxmlformats.org/officeDocument/2006/relationships/video"; static final String RELATION_DIAGRAM_DATA = "http://schemas.openxmlformats.org/officeDocument/2006/relationships/diagramData"; + + static final String RELATION_ALTERNATE_FORMAT_CHUNK = + "http://schemas.openxmlformats.org/officeDocument/2006/relationships/aFChunk"; + protected static final String[] EMBEDDED_RELATIONSHIPS = new String[]{RELATION_AUDIO, PackageRelationshipTypes.IMAGE_PART, POIXMLDocument.PACK_OBJECT_REL_TYPE, PackageRelationshipTypes.CORE_DOCUMENT, @@ -301,6 +305,14 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { if (targetURI != null) { handledTarget.add(targetURI.toString()); } + } else if (RELATION_ALTERNATE_FORMAT_CHUNK.equals(type)) { + //TODO check for targetMode=INTERNAL? 
+ handleEmbeddedFile(target, xhtml, sourceDesc + rel.getId(), + embeddedPartMetadata, + TikaCoreProperties.EmbeddedResourceType.ALTERNATE_FORMAT_CHUNK); + if (targetURI != null) { + handledTarget.add(targetURI.toString()); + } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java index dfe86f204..83641751f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLContainerExtractionTest.java @@ -23,6 +23,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.List; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.apache.tika.Tika; @@ -321,4 +322,14 @@ public class OOXMLContainerExtractionTest extends AbstractPOIContainerExtraction assertTrue(found, "didn't find chart in " + suffix); } } + + @Test + @Disabled("until we can add test file to repo") + public void testAltFileChunk() throws Exception { + //not included test file from: https://github.com/jgm/pandoc/files/1290782/Sample_DOCX_using_MHT_container.docx + //Tika is not correctly identifying rfc822, but rather, treating it as html. :( + List<Metadata> metadataList = getRecursiveMetadata("testAltChunkMHT.docx"); + assertEquals(2, metadataList.size()); + assertContains("Example of a table", metadataList.get(1).get(TikaCoreProperties.TIKA_CONTENT)); + } }