Repository: tika Updated Branches: refs/heads/master 6c0b7906e -> 559557aa3
TIKA-1854: add handling for embeddedStorageClassId in MSOffice docs (patch from Daniel Bonniot de Ruisselet) Project: http://git-wip-us.apache.org/repos/asf/tika/repo Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/559557aa Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/559557aa Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/559557aa Branch: refs/heads/master Commit: 559557aa311c6dd81f26d8142a6df6f6fc55513f Parents: 6c0b790 Author: tballison <[email protected]> Authored: Fri Feb 5 08:20:46 2016 -0500 Committer: tballison <[email protected]> Committed: Fri Feb 5 08:20:46 2016 -0500 ---------------------------------------------------------------------- .../apache/tika/metadata/TikaMetadataKeys.java | 2 ++ .../microsoft/AbstractPOIFSExtractor.java | 16 ++++++++- .../microsoft/POIContainerExtractionTest.java | 36 ++++++++++++++++++++ 3 files changed, 53 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/tika/blob/559557aa/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java ---------------------------------------------------------------------- diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java index 0c18beb..ce40a11 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java @@ -27,6 +27,8 @@ public interface TikaMetadataKeys { String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId"; + String EMBEDDED_STORAGE_CLASS_ID = "embeddedStorageClassId"; + String EMBEDDED_RESOURCE_TYPE = "embeddedResourceType"; } http://git-wip-us.apache.org/repos/asf/tika/blob/559557aa/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java index 5526c99..cf9d250 100644 --- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java +++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java @@ -28,6 +28,7 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream; import org.apache.poi.poifs.filesystem.Entry; import org.apache.poi.poifs.filesystem.Ole10Native; import org.apache.poi.poifs.filesystem.Ole10NativeException; +import org.apache.poi.hpsf.ClassID; import org.apache.tika.config.TikaConfig; import org.apache.tika.detect.Detector; import org.apache.tika.exception.TikaException; @@ -112,6 +113,13 @@ abstract class AbstractPOIFSExtractor { String relationshipID, String mediaType, XHTMLContentHandler xhtml, boolean outputHtml) throws IOException, SAXException, TikaException { + handleEmbeddedResource(resource, filename, relationshipID, null, mediaType, xhtml, outputHtml); + } + + protected void handleEmbeddedResource(TikaInputStream resource, String filename, + String relationshipID, ClassID storageClassID, String mediaType, XHTMLContentHandler xhtml, + boolean outputHtml) + throws IOException, SAXException, TikaException { try { Metadata metadata = new Metadata(); if (filename != null) { @@ -121,6 +129,9 @@ abstract class AbstractPOIFSExtractor { if (relationshipID != null) { metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID); } + if (storageClassID != null) { + metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, storageClassID.toString()); + } if (mediaType != null) { metadata.set(Metadata.CONTENT_TYPE, mediaType); } @@ -150,7 +161,7 @@ abstract class AbstractPOIFSExtractor { new DocumentInputStream((DocumentEntry) ooxml))) { ZipContainerDetector detector = new ZipContainerDetector(); MediaType type = detector.detect(stream, new Metadata()); - handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true); + handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true); return; } } @@ -160,6 +171,9 @@ abstract class AbstractPOIFSExtractor { // What kind of document is it? Metadata metadata = new Metadata(); metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName()); + if (dir.getStorageClsid() != null) { + metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString()); + } POIFSDocumentType type = POIFSDocumentType.detectType(dir); TikaInputStream embedded = null; http://git-wip-us.apache.org/repos/asf/tika/blob/559557aa/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java ---------------------------------------------------------------------- diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java index 33f0802..ceb8181 100644 --- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java +++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java @@ -20,11 +20,22 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; +import java.io.InputStream; +import java.util.List; + import org.apache.tika.TikaTest.TrackingHandler; import org.apache.tika.extractor.ContainerExtractor; import org.apache.tika.extractor.ParserContainerExtractor; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.metadata.TikaMetadataKeys; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.RecursiveParserWrapper; +import org.apache.tika.sax.BasicContentHandlerFactory; import org.junit.Test; +import org.xml.sax.helpers.DefaultHandler; /** * Tests that the various POI powered parsers are @@ -342,4 +353,29 @@ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTe assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg"))); assertTrue(handler.mediaTypes.contains(new MediaType("image", "png"))); } + + @Test + public void testEmbeddedStorageId() throws Exception { + + Parser p = new AutoDetectParser(); + + RecursiveParserWrapper w = new RecursiveParserWrapper(p, + new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + try (InputStream is = getTestFile("testWORD_embeded.doc")) { + Metadata meta = new Metadata(); + ParseContext c = new ParseContext(); + w.parse(is, new DefaultHandler(), meta, c); + } + List<Metadata> list = w.getMetadata(); + //.docx + assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}", + list.get(10).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID)); + //_1345471035.ppt + assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}", + list.get(14).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID)); + //_1345470949.xls + assertEquals("{00020820-0000-0000-C000-000000000046}", + list.get(16).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID)); + + } }
