Repository: tika
Updated Branches:
  refs/heads/master 6c0b7906e -> 559557aa3


TIKA-1854: add handling for embeddedStorageClassId in MSOffice docs (patch from 
Daniel Bonniot de Ruisselet)


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/559557aa
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/559557aa
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/559557aa

Branch: refs/heads/master
Commit: 559557aa311c6dd81f26d8142a6df6f6fc55513f
Parents: 6c0b790
Author: tballison <[email protected]>
Authored: Fri Feb 5 08:20:46 2016 -0500
Committer: tballison <[email protected]>
Committed: Fri Feb 5 08:20:46 2016 -0500

----------------------------------------------------------------------
 .../apache/tika/metadata/TikaMetadataKeys.java  |  2 ++
 .../microsoft/AbstractPOIFSExtractor.java       | 16 ++++++++-
 .../microsoft/POIContainerExtractionTest.java   | 36 ++++++++++++++++++++
 3 files changed, 53 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/559557aa/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
index 0c18beb..ce40a11 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaMetadataKeys.java
@@ -27,6 +27,8 @@ public interface TikaMetadataKeys {
 
     String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
 
+    String EMBEDDED_STORAGE_CLASS_ID = "embeddedStorageClassId";
+
     String EMBEDDED_RESOURCE_TYPE = "embeddedResourceType";
 
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/559557aa/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 5526c99..cf9d250 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -28,6 +28,7 @@ import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.hpsf.ClassID;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
@@ -112,6 +113,13 @@ abstract class AbstractPOIFSExtractor {
                                           String relationshipID, String mediaType, XHTMLContentHandler xhtml,
                                           boolean outputHtml)
             throws IOException, SAXException, TikaException {
+        handleEmbeddedResource(resource, filename, relationshipID, null, mediaType, xhtml, outputHtml);
+    }
+
+    protected void handleEmbeddedResource(TikaInputStream resource, String filename,
+                                          String relationshipID, ClassID storageClassID, String mediaType, XHTMLContentHandler xhtml,
+                                          boolean outputHtml)
+            throws IOException, SAXException, TikaException {
         try {
             Metadata metadata = new Metadata();
             if (filename != null) {
@@ -121,6 +129,9 @@ abstract class AbstractPOIFSExtractor {
             if (relationshipID != null) {
                 metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, relationshipID);
             }
+            if (storageClassID != null) {
+                metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, storageClassID.toString());
+            }
             if (mediaType != null) {
                 metadata.set(Metadata.CONTENT_TYPE, mediaType);
             }
@@ -150,7 +161,7 @@ abstract class AbstractPOIFSExtractor {
                     new DocumentInputStream((DocumentEntry) ooxml))) {
                 ZipContainerDetector detector = new ZipContainerDetector();
                 MediaType type = detector.detect(stream, new Metadata());
-                handleEmbeddedResource(stream, null, dir.getName(), type.toString(), xhtml, true);
+                handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
                 return;
             }
         }
@@ -160,6 +171,9 @@ abstract class AbstractPOIFSExtractor {
         // What kind of document is it?
         Metadata metadata = new Metadata();
         metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
+        if (dir.getStorageClsid() != null) {
+            metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
+        }
         POIFSDocumentType type = POIFSDocumentType.detectType(dir);
         TikaInputStream embedded = null;
 

http://git-wip-us.apache.org/repos/asf/tika/blob/559557aa/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
----------------------------------------------------------------------
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
index 33f0802..ceb8181 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java
@@ -20,11 +20,22 @@ import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
 
+import java.io.InputStream;
+import java.util.List;
+
 import org.apache.tika.TikaTest.TrackingHandler;
 import org.apache.tika.extractor.ContainerExtractor;
 import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.TikaMetadataKeys;
 import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.RecursiveParserWrapper;
+import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.junit.Test;
+import org.xml.sax.helpers.DefaultHandler;
 
 /**
  * Tests that the various POI powered parsers are
@@ -342,4 +353,29 @@ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTe
         assertTrue(handler.mediaTypes.contains(new MediaType("image", "jpeg")));
         assertTrue(handler.mediaTypes.contains(new MediaType("image", "png")));
     }
+
+    @Test
+    public void testEmbeddedStorageId() throws Exception {
+
+        Parser p = new AutoDetectParser();
+
+        RecursiveParserWrapper w = new RecursiveParserWrapper(p,
+                new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+        try (InputStream is = getTestFile("testWORD_embeded.doc")) {
+            Metadata meta = new Metadata();
+            ParseContext c = new ParseContext();
+            w.parse(is, new DefaultHandler(), meta, c);
+        }
+        List<Metadata> list = w.getMetadata();
+        //.docx
+        assertEquals("{F4754C9B-64F5-4B40-8AF4-679732AC0607}",
+                list.get(10).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+        //_1345471035.ppt
+        assertEquals("{64818D10-4F9B-11CF-86EA-00AA00B929E8}",
+                list.get(14).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+        //_1345470949.xls
+        assertEquals("{00020820-0000-0000-C000-000000000046}",
+                list.get(16).get(TikaMetadataKeys.EMBEDDED_STORAGE_CLASS_ID));
+
+    }
 }

Reply via email to