tika-parser...

tallison Wed, 16 Apr 2014 11:05:27 -0700

Author: tallison
Date: Wed Apr 16 18:04:20 2014
New Revision: 1588005

URL: http://svn.apache.org/r1588005
Log:
TIKA-1010 extract embedded documents from RTF


Added:
    tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFEmbeddedFiles.rtf
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFEmbeddedLink.rtf
    
tika/trunk/tika-parsers/src/test/resources/test-documents/testRTFRegularImages.rtf
Modified:
    tika/trunk/CHANGES.txt
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java

Modified: tika/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/tika/trunk/CHANGES.txt?rev=1588005&r1=1588004&r2=1588005&view=diff
==============================================================================
--- tika/trunk/CHANGES.txt (original)
+++ tika/trunk/CHANGES.txt Wed Apr 16 18:04:20 2014
@@ -1,5 +1,7 @@
 Release 1.6 - ??/??/2014
 
+  * Extract attachments from RTF files (TIKA-1010)
+
   * Support Outlook Personal Folders File Format *.pst (TIKA-623)
   
   * Added mime entries for additional Ogg based formats (TIKA-1259)

Modified: 
tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java?rev=1588005&r1=1588004&r2=1588005&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java 
(original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java 
Wed Apr 16 18:04:20 2014
@@ -21,6 +21,7 @@ import java.util.HashSet;
 
 public class FilenameUtils {
 
+
     /**
      * Reserved characters
      */
@@ -34,12 +35,14 @@ public class FilenameUtils {
 
     private final static HashSet<Character> RESERVED = new 
HashSet<Character>(38);
 
+
     static {
         for (int i=0; i<RESERVED_FILENAME_CHARACTERS.length; ++i) {
             RESERVED.add(RESERVED_FILENAME_CHARACTERS[i]);
         }
     }
 
+
     /**
      * Scans the given file name for reserved characters on different OSs and
      * file systems and returns a sanitized version of the name with the
@@ -70,4 +73,40 @@ public class FilenameUtils {
 
         return sb.toString();
     }
+
+    /**
+     * Extracts the last segment (the file name) from a path that may use
+     * Unix ("/"), Windows ("\") or classic Macintosh (":") separators.
+     * <p>
+     * This duplicates the algorithm in commons io FilenameUtils because
+     * Java's File does not split Windows file paths correctly on linux.
+     * <p>
+     * The package parsers and some other embedded doc extractors could
+     * put anything into Metadata.RESOURCE_NAME_KEY.  A client that writes
+     * embedded files using that value as a plain file name could be
+     * tricked by input such as "../../../my_ppt.ppt".
+     * <p>
+     * Consider using this in combination with {@link #normalize(String)}.
+     *
+     * @param path path to strip; may be null
+     * @return the text after the last separator, or an empty string when
+     *         path is null or empty or the remainder is "." or ".."; never null
+     */
+    public static String getName(final String path) {
+        if (path == null || path.length() == 0) {
+            return "";
+        }
+        //index of the right-most separator of any flavour, -1 if there is none.
+        //':' covers old macintosh paths and drive-relative names like C:somefilename
+        int lastSeparator = -1;
+        for (char separator : new char[]{'/', '\\', ':'}) {
+            lastSeparator = Math.max(lastSeparator, path.lastIndexOf(separator));
+        }
+        final String name = path.substring(lastSeparator + 1);
+        //never hand back a relative directory reference as a file name
+        return (".".equals(name) || "..".equals(name)) ? "" : name;
+    }
 }

Added: 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java?rev=1588005&view=auto
==============================================================================
--- 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java 
(added)
+++ 
tika/trunk/tika-core/src/main/java/org/apache/tika/metadata/RTFMetadata.java 
Wed Apr 16 18:04:20 2014
@@ -0,0 +1,46 @@
+package org.apache.tika.metadata;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.Property;
+
+/**
+ * Metadata keys that describe objects and pictures embedded in RTF documents.
+ * <p>
+ * Every {@link Property} below is named by joining {@link #PREFIX_RTF_META},
+ * {@link Metadata#NAMESPACE_PREFIX_DELIMITER} and a short local name.
+ */
+public interface RTFMetadata {
+    /** First part of the name of every property declared in this interface. */
+    public static final String PREFIX_RTF_META = "rtf_meta";
+
+
+    /**
+     * Prefix for the names of the name/value pairs found around a pict
+     * (the {sp {sn} {sv}} data).
+     */
+    public static final String RTF_PICT_META_PREFIX = "rtf_pict:";
+
+    /**
+     * if set to true, this means that an image file is probably a "thumbnail"
+     * any time a pict/emf/wmf is in an object
+     */
+    Property THUMBNAIL = Property.internalBoolean(PREFIX_RTF_META+
+            Metadata.NAMESPACE_PREFIX_DELIMITER+"thumbnail");
+
+    /**
+     * if an application and version is given as part of the
+     * embedded object, this is the literal string
+     */
+    Property EMB_APP_VERSION = Property.internalText(PREFIX_RTF_META+
+            Metadata.NAMESPACE_PREFIX_DELIMITER+"emb_app_version");
+
+    /** Class name string read from the embedded object data, when not empty. */
+    Property EMB_CLASS = Property.internalText(PREFIX_RTF_META+
+            Metadata.NAMESPACE_PREFIX_DELIMITER+"emb_class");
+
+    /** Topic name string read from the embedded object data, when not empty. */
+    Property EMB_TOPIC = Property.internalText(PREFIX_RTF_META+
+            Metadata.NAMESPACE_PREFIX_DELIMITER+"emb_topic");
+
+    /** Item name string read from the embedded object data, when not empty. */
+    Property EMB_ITEM = Property.internalText(PREFIX_RTF_META+
+            Metadata.NAMESPACE_PREFIX_DELIMITER+"emb_item");
+
+}

Modified: 
tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java?rev=1588005&r1=1588004&r2=1588005&view=diff
==============================================================================
--- 
tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java 
(original)
+++ 
tika/trunk/tika-core/src/test/java/org/apache/tika/io/FilenameUtilsTest.java 
Wed Apr 16 18:04:20 2014
@@ -94,5 +94,24 @@ public class FilenameUtilsTest {
         assertEquals(EXPECTED_NAME, FilenameUtils.normalize(TEST_NAME));
     }
 
+    /**
+     * Checks FilenameUtils.getName against paths that mix Unix, Windows and
+     * colon separators, trailing separators and relative references.
+     */
+    @Test
+    public void testGetName() throws Exception {
+        //each row is {expected file name, input path}
+        final String[][] cases = {
+            {"quick.ppt", "C:\\the\\quick.ppt"},
+            {"quick.ppt", "/the/quick.ppt"},
+            {"", "/the/quick/"},
+            {"", "~/the/quick////\\\\//"},
+            {"~~quick", "~~quick"},
+            {"quick.ppt", "quick.ppt"},
+            {"", "////"},
+            {"", "C:////"},
+            {"", ".."},
+            {"quick", "C:////../the/D:/quick"},
+            {"file.ppt", "path:to:file.ppt"},
+            {"HW.txt", "_1457338542/HW.txt"},
+        };
+        for (String[] row : cases) {
+            testFilenameEquality(row[0], row[1]);
+        }
+    }
+
+    /** Asserts that FilenameUtils.getName(path) yields the expected file name. */
+    private void testFilenameEquality(String expected, String path) {
+        final String actual = FilenameUtils.getName(path);
+        assertEquals(expected, actual);
+    }
 
 }

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java?rev=1588005&r1=1588004&r2=1588005&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/GroupState.java
 Wed Apr 16 18:04:20 2014
@@ -33,6 +33,18 @@ class GroupState {
     public int list;
     public int listLevel;
     public Charset fontCharset;
+    //in objdata
+    public boolean objdata;
+    //depth in pict, 1 = at pict level
+    public int pictDepth;
+    //in picprop key/value pair
+    public boolean sp;
+    //in picprop's name 
+    public boolean sn;
+    //in picprop's value
+    public boolean sv;
+    //in embedded object or not
+    public boolean object;
 
     // Create default (root) GroupState
     public GroupState() {
@@ -48,5 +60,8 @@ class GroupState {
         listLevel = other.listLevel;
         fontCharset = other.fontCharset;
         depth = 1+other.depth;
+        pictDepth = other.pictDepth > 0 ? other.pictDepth + 1 : 0;
+        //do not inherit object, sn, sv or sp
+
     }
 }

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java?rev=1588005&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
 Wed Apr 16 18:04:20 2014
@@ -0,0 +1,276 @@
+package org.apache.tika.parser.rtf; /*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ import java.io.ByteArrayOutputStream; import java.io.IOException; import 
java.io.InputStream; import 
+java.util.concurrent.atomic.AtomicInteger; import 
org.apache.tika.config.TikaConfig; import 
+org.apache.tika.detect.Detector; import 
org.apache.tika.exception.TikaException; import 
+org.apache.tika.extractor.EmbeddedDocumentExtractor; import 
+org.apache.tika.extractor.ParsingEmbeddedDocumentExtractor; import 
org.apache.tika.io.FilenameUtils; import 
+org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; 
import 
+org.apache.tika.metadata.RTFMetadata; import org.apache.tika.mime.MediaType; 
import 
+org.apache.tika.mime.MimeType; import org.apache.tika.mime.MimeTypeException; 
import 
+org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.ParseContext; 
import 
+org.apache.tika.sax.EmbeddedContentHandler; import org.xml.sax.ContentHandler; 
import 
+org.xml.sax.SAXException; /**
+ * This class buffers data from embedded objects and pictures.
+ *
+ * <p/>
+ *
+ * When the parser has finished an object or picture and called
+ * {@link #handleCompletedObject()}, this will write the object
+ * to the {@link #handler}.
+ *
+ * <p/>
+ *
+ * This (in combination with TextExtractor) expects basically a flat parse.  
It will pull out
+ * all pict whether they are tied to objdata or are intended
+ * to be standalone.
+ *
+ * <p/>
+ * This tries to pull metadata around a pict that is encoded
+ * with {sp {sn} {sv}} types of data.  This information
+ * sometimes contains the name and even full file path of the original file.
+ *
+ */ class RTFEmbObjHandler {
+    
+    private static final String EMPTY_STRING = "";
+    
+    private enum EMB_STATE {
+      PICT, //recording pict data
+      OBJDATA, //recording objdata
+      NADA
+    };
+    
+    //high hex cached for writing hexpair chars (data)
+    private int hi = -1;
+    
+    private int thumbCount = 0;
+    //don't need atomic, do need mutable
+    private AtomicInteger unknownFilenameCount = new AtomicInteger();
+    
+    private boolean inObject = false;
+    
+    private String sv = EMPTY_STRING;
+    private String sn = EMPTY_STRING;
+    
+    private StringBuilder sb = new StringBuilder();
+    
+    private final ContentHandler handler;
+    private Metadata metadata;
+    private final ParseContext context;
+    
+    private final ByteArrayOutputStream os;
+    private EMB_STATE state = EMB_STATE.NADA;
+    
+    /**
+     * Creates a handler that buffers the data of embedded objects and pictures.
+     *
+     * @param handler  handler that receives the content of embedded documents
+     * @param metadata metadata of the containing document; it is not stored,
+     *                 every embedded object is described by its own Metadata
+     * @param context  parse context used to look up the embedded document
+     *                 extractor and the TikaConfig
+     */
+    protected RTFEmbObjHandler(ContentHandler handler, Metadata metadata, ParseContext context) {
+        this.handler = handler;
+        this.context = context;
+        //start with the same empty per-object metadata that reset() installs, so
+        //endSP() cannot hit a null field if it runs before startPict()/startObjData()
+        this.metadata = new Metadata();
+        os = new ByteArrayOutputStream();
+    }
+    /**
+     * Switches to recording pict data and starts a fresh Metadata
+     * object for the picture.
+     */
+    protected void startPict() {
+        state = EMB_STATE.PICT;
+        metadata = new Metadata();
+    }
+    
+    /**
+     * Switches to recording objdata and starts a fresh Metadata
+     * object for the embedded object.
+     */
+    protected void startObjData() {
+        state = EMB_STATE.OBJDATA;
+        metadata = new Metadata();
+    }
+    
+    /**
+     * Begins a property name: clears the text buffer and seeds it with
+     * RTFMetadata.RTF_PICT_META_PREFIX so the finished name carries that prefix.
+     */
+    protected void startSN() {
+        sb.setLength(0);
+        sb.append(RTFMetadata.RTF_PICT_META_PREFIX);
+    }
+    
+    /** Ends a property name: stores the buffered text as the current name. */
+    protected void endSN() {
+        sn = sb.toString();
+    }
+    
+    /** Begins a property value: clears the text buffer. */
+    protected void startSV() {
+        sb.setLength(0);
+    }
+    
+    /** Ends a property value: stores the buffered text as the current value. */
+    protected void endSV() {
+        sv = sb.toString();
+    }
+    
+    /**
+     * Ends a metadata pair: adds the most recently completed name (sn) and
+     * value (sv) to the metadata of the object currently being recorded.
+     */
+    protected void endSP() {
+        metadata.add(sn, sv);
+    }
+    
+    /**
+     * Records whether the parser is currently inside an embedded object.
+     * A pict completed while this is true is flagged as a thumbnail.
+     *
+     * @param v true while inside an embedded object
+     */
+    protected void setInObject(boolean v) {
+        inObject = v;
+    }
+    
+    /**
+     * @return the value last passed to {@link #setInObject(boolean)},
+     *         false if it was never called
+     */
+    protected boolean getInObject() {
+        return inObject;
+    }
+    
+    /**
+     * Appends one character to the text buffer that collects the
+     * property name or value currently being read.
+     *
+     * @param c character to append
+     */
+    protected void writeMetadataChar(char c) {
+        sb.append(c);
+    }
+    
+    /**
+     * Consumes one character of hex-pair encoded data.  The first hex digit
+     * of a pair is cached; the second completes the byte, which is appended
+     * to the object buffer.
+     *
+     * @param b character read from the stream, or -1 at end of stream
+     * @throws IOException if the combined value is out of range
+     * @throws TikaException if b is -1 and is not reported as a hex char
+     */
+    protected void writeHexChar(int b) throws IOException, TikaException {
+        if (!TextExtractor.isHexChar(b)) {
+            if (b == -1) {
+                throw new TikaException("hit end of stream before finishing byte pair");
+            }
+            //anything else that is not a hex digit is ignored; white space is common
+            return;
+        }
+
+        if (hi == -1) {
+            //first digit of the pair: remember it as the high nibble
+            hi = 16*TextExtractor.hexValue(b);
+            return;
+        }
+
+        //second digit: combine with the cached high nibble and emit the byte
+        long sum = hi+TextExtractor.hexValue(b);
+        if (sum > Integer.MAX_VALUE || sum < 0) {
+            throw new IOException("hex char to byte overflow");
+        }
+        os.write((int)sum);
+        hi = -1;
+    }
+    
+    
+    /**
+     * Copies exactly len raw bytes from the stream into the object buffer.
+     *
+     * @param is  stream positioned at the first byte of the binary data
+     * @param len number of bytes to copy
+     * @throws IOException if len is negative or larger than
+     *         RTFParser.getMaxBytesForEmbeddedObject(), or if reading fails
+     * @throws TikaException if the stream ends before len bytes were read
+     */
+    protected void writeBytes(InputStream is, int len) throws IOException, TikaException {
+        if (len < 0 || len > RTFParser.getMaxBytesForEmbeddedObject()) {
+            throw new IOException("length of bytes to read out of bounds: " + len);
+        }
+
+        byte[] bytes = new byte[len];
+        //a single read() may return fewer bytes than requested without being
+        //at end of stream, so keep reading until the array is full or EOF
+        int total = 0;
+        while (total < len) {
+            int n = is.read(bytes, total, len - total);
+            if (n < 0) {
+                break;
+            }
+            total += n;
+        }
+        if (total < len) {
+            throw new TikaException("unexpected end of file: need " + len +
+                   " bytes of binary data, found " + total);
+        }
+        os.write(bytes);
+    }
+    
+    /**
+     * Call this when the objdata/pict has completed.
+     * <p>
+     * Buffered objdata is unwrapped with RTFObjDataParser; buffered pict data
+     * is passed on as is, named after its "wzDescription" property when there
+     * is one.  The result goes to the EmbeddedDocumentExtractor found in the
+     * parse context, or to a ParsingEmbeddedDocumentExtractor if none is set.
+     * The handler state is always reset afterwards, also when extraction fails.
+     *
+     * @throws IOException if handling a pict fails with an IOException
+     *         (an IOException while handling objdata is ignored)
+     * @throws SAXException if the embedded extractor throws it
+     * @throws TikaException if the embedded extractor throws it
+     */
+    protected void handleCompletedObject() throws IOException, SAXException, TikaException {
+        try {
+            EmbeddedDocumentExtractor embeddedExtractor =
+                    context.get(EmbeddedDocumentExtractor.class);
+
+            if (embeddedExtractor == null) {
+                embeddedExtractor = new ParsingEmbeddedDocumentExtractor(context);
+            }
+
+            byte[] bytes = os.toByteArray();
+            if (state == EMB_STATE.OBJDATA) {
+                RTFObjDataParser objParser = new RTFObjDataParser();
+                try {
+                    byte[] objBytes = objParser.parse(bytes, metadata, unknownFilenameCount);
+                    extractObj(objBytes, handler, embeddedExtractor, metadata);
+                } catch (IOException e) {
+                    //swallow.  If anything goes wrong, ignore.
+                }
+            } else if (state == EMB_STATE.PICT) {
+                String filePath = metadata.get(RTFMetadata.RTF_PICT_META_PREFIX+"wzDescription");
+                if (filePath != null && filePath.length() > 0) {
+                    metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, filePath);
+                    metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath));
+                }
+                metadata.set(RTFMetadata.THUMBNAIL, Boolean.toString(inObject));
+                extractObj(bytes, handler, embeddedExtractor, metadata);
+            }
+            //EMB_STATE.NADA: no start for pict or embed was seen; nothing to extract
+        } finally {
+            //always drop the buffered bytes and per-object state, so a failed
+            //extraction cannot leak into the next object
+            reset();
+        }
+    }
+    
+    /**
+     * Hands the bytes of one embedded object or picture to the extractor.
+     * <p>
+     * If the metadata carries no resource name yet, one is generated:
+     * "thumbnail_N" for a pict inside an object, "file_N" otherwise, with an
+     * extension taken from getExtension.
+     *
+     * @param bytes contents of the object; nothing happens when null
+     * @param handler handler that receives the embedded content
+     * @param embeddedExtractor extractor that decides on and performs the parse
+     * @param metadata metadata describing the object; updated in place
+     */
+    private void extractObj(byte[] bytes, ContentHandler handler,
+            EmbeddedDocumentExtractor embeddedExtractor, Metadata metadata)
+                    throws SAXException, IOException, TikaException {
+
+        if (bytes == null) {
+            return;
+        }
+
+        metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(bytes.length));
+
+        if (!embeddedExtractor.shouldParseEmbedded(metadata)) {
+            return;
+        }
+
+        TikaInputStream stream = TikaInputStream.get(bytes);
+        //everything that touches the stream sits inside the try, so the stream
+        //is closed even if type detection or naming fails
+        try {
+            if (metadata.get(Metadata.RESOURCE_NAME_KEY) == null) {
+                String extension = getExtension(stream, metadata);
+                stream.reset();
+                if (inObject && state == EMB_STATE.PICT) {
+                    metadata.set(Metadata.RESOURCE_NAME_KEY,
+                            "thumbnail_"+thumbCount++ + extension);
+                    metadata.set(RTFMetadata.THUMBNAIL, "true");
+                } else {
+                    metadata.set(Metadata.RESOURCE_NAME_KEY,
+                            "file_"+unknownFilenameCount.getAndIncrement() + extension);
+                }
+            }
+            embeddedExtractor.parseEmbedded(
+                    stream,
+                    new EmbeddedContentHandler(handler),
+                    metadata, false);
+        } finally {
+            stream.close();
+        }
+    }
+    
+    /**
+     * Picks a file extension for an object that has no name.
+     * <p>
+     * When the metadata has no content type, the configured Detector is run
+     * on the stream, the detected type is recorded in the metadata and the
+     * extension registered for it is returned.  In every other case,
+     * including detection failures, the result is ".bin".
+     *
+     * @param is stream over the object's bytes
+     * @param metadata metadata of the object; CONTENT_TYPE may be set
+     * @return extension to append to the generated file name
+     */
+    private String getExtension(TikaInputStream is, Metadata metadata) {
+        final String fallback = ".bin";
+        String declaredType = metadata.get(Metadata.CONTENT_TYPE);
+        TikaConfig tikaConfig = getConfig();
+        if (declaredType != null) {
+            return fallback;
+        }
+
+        try {
+            MediaType detected = tikaConfig.getDetector().detect(is, metadata);
+            MimeType mime = tikaConfig.getMimeRepository().forName(detected.toString());
+            //NOTE(review): only the subtype is stored as the content type here,
+            //not the full type/subtype string -- confirm this is intended
+            metadata.set(Metadata.CONTENT_TYPE, detected.getSubtype());
+            return mime.getExtension();
+        } catch (IOException e) {
+            //swallow
+        } catch (MimeTypeException e) {
+            //swallow
+        }
+        return fallback;
+    }
+    
+    /**
+     * @return the TikaConfig stored in the parse context, or the default
+     *         config when the context has none
+     */
+    private TikaConfig getConfig() {
+        TikaConfig fromContext = context.get(TikaConfig.class);
+        return fromContext != null ? fromContext : TikaConfig.getDefaultConfig();
+    }
+    
+    /**
+     * Returns the handler to its idle state after each object.
+     * The counters used to number unnamed files and thumbnails are
+     * deliberately left alone.
+     */
+    protected void reset() {
+        //recording state
+        state = EMB_STATE.NADA;
+        hi = -1;
+        //buffered payload and its metadata
+        os.reset();
+        metadata = new Metadata();
+        //pending name/value pair
+        sn = EMPTY_STRING;
+        sv = EMPTY_STRING;
+        sb.setLength(0);
+    }
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java?rev=1588005&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
 Wed Apr 16 18:04:20 2014
@@ -0,0 +1,312 @@
+package org.apache.tika.parser.rtf;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.apache.poi.poifs.filesystem.DirectoryNode;
+import org.apache.poi.poifs.filesystem.DocumentEntry;
+import org.apache.poi.poifs.filesystem.DocumentInputStream;
+import org.apache.poi.poifs.filesystem.Entry;
+import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
+import org.apache.poi.poifs.filesystem.Ole10Native;
+import org.apache.poi.poifs.filesystem.Ole10NativeException;
+import org.apache.poi.util.IOUtils;
+import org.apache.tika.io.FilenameUtils;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.metadata.RTFMetadata;
+import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
+
+/**
+ * Many thanks to Simon Mourier for:
+ * 
http://stackoverflow.com/questions/14779647/extract-embedded-image-object-in-rtf
 
+ * and for granting permission to use his code in Tika.
+ * 
+ */
+class RTFObjDataParser {
+
+    private final static int[] INT_LE_POWS = new int[]{
+        1, 256, 65536, 16777216
+    };
+
+    private final static String WIN_ASCII = "WINDOWS-1252";
+
+    /**
+     * Parses the decoded bytes of an objdata block and returns the bytes of
+     * the object stored inside it.
+     * <p>
+     * The header is read as: version, format id, three length-prefixed
+     * strings (class, topic and item name) and the size of the data that
+     * follows.  Version and non-empty names are added to the metadata.
+     * A "package" class is unwrapped with handlePackage, "pbrush" data is
+     * returned as is, and anything with a POIFS header is unwrapped with
+     * handleEmbeddedPOIFS.
+     *
+     * @param bytes actual bytes (already converted from the hex pair string
+     *  stored in the embedded object data, or read as raw binary bytes)
+     * @param metadata incoming metadata; updated in place
+     * @param unknownFilenameCount counter used to number objects without a name
+     * @return byte[] for contents of obj data; null when the format id does
+     *  not mark an embedded object or the type-specific handler returns null
+     * @throws IOException if the data ends early or a stored length is out of bounds
+     */
+    protected byte[] parse(byte[] bytes, Metadata metadata, AtomicInteger unknownFilenameCount)
+            throws IOException {
+        ByteArrayInputStream is = new ByteArrayInputStream(bytes);
+        long version = readUInt(is);
+        metadata.add(RTFMetadata.EMB_APP_VERSION, Long.toString(version));
+
+        long formatId = readUInt(is);
+        //2 is an embedded object. 1 is a link.
+        if (formatId != 2L) {
+            return null;
+        }
+        String className = readLengthPrefixedAnsiString(is).trim();
+        String topicName = readLengthPrefixedAnsiString(is).trim();
+        String itemName = readLengthPrefixedAnsiString(is).trim();
+
+        //trim() never returns null, so only emptiness needs checking
+        if (className.length() > 0) {
+            metadata.add(RTFMetadata.EMB_CLASS, className);
+        }
+        if (topicName.length() > 0) {
+            metadata.add(RTFMetadata.EMB_TOPIC, topicName);
+        }
+        if (itemName.length() > 0) {
+            metadata.add(RTFMetadata.EMB_ITEM, itemName);
+        }
+
+        long dataSz = readUInt(is);
+
+        //readBytes tests for reading too many bytes
+        byte[] embObjBytes = readBytes(is, dataSz);
+
+        //equalsIgnoreCase does not depend on the default locale
+        if ("package".equalsIgnoreCase(className)) {
+            return handlePackage(embObjBytes, metadata);
+        }
+        if ("pbrush".equalsIgnoreCase(className)) {
+            //simple bitmap bytes
+            return embObjBytes;
+        }
+
+        ByteArrayInputStream embIs = new ByteArrayInputStream(embObjBytes);
+        if (NPOIFSFileSystem.hasPOIFSHeader(embIs)) {
+            try {
+                return handleEmbeddedPOIFS(embIs, metadata, unknownFilenameCount);
+            } catch (IOException e) {
+                //swallow; fall back to the raw bytes below
+            }
+        }
+        return embObjBytes;
+    }
+
+
+    /**
+     * Unwraps an object stored as a POIFS container.
+     * <p>
+     * A "Package" entry is returned as is.  Otherwise the detected document
+     * type decides: an OLE10_NATIVE record yields its data buffer, COMP_OBJ
+     * yields the "CONTENTS" (or "Contents") entry, and any other type yields
+     * the whole container, with a generated file name and the detected
+     * content type set on the metadata.
+     *
+     * @param is stream over the container; must support reset()
+     * @param metadata metadata of the object; updated in place
+     * @param unknownFilenameCount counter used to number objects without a name
+     * @return bytes of the object; can be null, e.g. when the file system has
+     *  no root or the OLE10Native record is not valid
+     * @throws IOException if the stream is not actually POIFS or cannot be read
+     */
+    private byte[] handleEmbeddedPOIFS(InputStream is, Metadata metadata,
+            AtomicInteger unknownFilenameCount)
+            throws IOException {
+
+        NPOIFSFileSystem fs = null;
+        byte[] ret = null;
+        try {
+
+            fs = new NPOIFSFileSystem(is);
+
+            DirectoryNode root = fs.getRoot();
+
+            if (root == null) {
+                return ret;
+            }
+
+            if (root.hasEntry("Package")){
+                Entry ooxml = root.getEntry("Package");
+                //read the entry stream directly and always close it
+                DocumentInputStream pkg = new DocumentInputStream((DocumentEntry) ooxml);
+                try {
+                    ByteArrayOutputStream out = new ByteArrayOutputStream();
+                    IOUtils.copy(pkg, out);
+                    ret = out.toByteArray();
+                } finally {
+                    pkg.close();
+                }
+            } else {
+                //try poifs
+                POIFSDocumentType type = POIFSDocumentType.detectType(root);
+                if (type == POIFSDocumentType.OLE10_NATIVE) {
+                    try {
+                        // Try to un-wrap the OLE10Native record:
+                        Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode)root);
+                        ret = ole.getDataBuffer();
+                    } catch (Ole10NativeException ex) {
+                        // Not a valid OLE10Native record, skip it
+                    }
+                } else if (type == POIFSDocumentType.COMP_OBJ) {
+
+                    DocumentEntry contentsEntry;
+                    try {
+                        contentsEntry = (DocumentEntry)root.getEntry("CONTENTS");
+                    } catch (FileNotFoundException ioe) {
+                        contentsEntry = (DocumentEntry)root.getEntry("Contents");
+                    }
+
+                    DocumentInputStream inp = null;
+                    try {
+                        inp = new DocumentInputStream(contentsEntry);
+                        ret = new byte[contentsEntry.getSize()];
+                        inp.readFully(ret);
+                    } finally {
+                        if (inp != null) {
+                            inp.close();
+                        }
+                    }
+                } else {
+                    //some other known document type: hand back the whole container
+                    ByteArrayOutputStream out = new ByteArrayOutputStream();
+                    is.reset();
+                    IOUtils.copy(is, out);
+                    ret = out.toByteArray();
+                    metadata.set(Metadata.RESOURCE_NAME_KEY,
+                            "file_"+unknownFilenameCount.getAndIncrement() + "."+type.getExtension());
+                    metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+                }
+            }
+        } finally {
+            if (fs != null) {
+                fs.close();
+            }
+        }
+        return ret;
+    }
+
+
+
+    /**
+     * Unwraps the data of a "package" object.
+     * <p>
+     * Reads the package header (display name, icon path, icon index, type,
+     * file path), then the embedded bytes, then an optional trailing unicode
+     * file path.  The unicode path, when present, is preferred for naming the
+     * object; otherwise the display name and the ansi file path are used.
+     *
+     * @param pkgBytes bytes of the package
+     * @param metadata metadata of the object; RESOURCE_NAME_KEY and
+     *  EMBEDDED_RELATIONSHIP_ID are set
+     * @return the embedded bytes; can return null if there is a linked object
+     *  instead of an embedded file
+     * @throws IOException if the header or the embedded bytes are cut short,
+     *  or the stored length is out of bounds
+     */
+    private byte[] handlePackage(byte[] pkgBytes, Metadata metadata) throws IOException {
+        //now parse the package header
+        ByteArrayInputStream is = new ByteArrayInputStream(pkgBytes);
+        readUShort(is);
+
+        String displayName = readAnsiString(is);
+
+        //should we add this to the metadata?
+        readAnsiString(is); //iconFilePath
+        readUShort(is); //iconIndex
+        int type = readUShort(is); //type
+
+        //1 is link, 3 is embedded object
+        //this only handles embedded objects
+        if (type != 3) {
+            return null;
+        }
+        //should we really be ignoring this filePathLen?
+        readUInt(is); //filePathLen
+
+        String ansiFilePath = readAnsiString(is); //filePath
+        long bytesLen = readUInt(is);
+        byte[] objBytes = initByteArray(bytesLen);
+        //a ByteArrayInputStream delivers everything it has left in one call, so a
+        //short count means the data is truncated; do not return zero-padded bytes
+        int read = is.read(objBytes);
+        if (objBytes.length > 0 && read != objBytes.length) {
+            throw new IOException("Hit end of stream before reading all bytes");
+        }
+        StringBuilder unicodeFilePath = new StringBuilder();
+
+        try {
+            long unicodeLen = readUInt(is);
+
+            for (int i = 0; i < unicodeLen; i++){
+                int lo = is.read();
+                int hi = is.read();
+                if (hi == -1 || lo == -1){
+                    //stream ran out; empty SB and stop
+                    unicodeFilePath.setLength(0);
+                    break;
+                }
+                //little endian 16 bit code unit
+                unicodeFilePath.append((char)(lo+256*hi));
+            }
+        } catch (IOException e) {
+            //swallow; the unicode file path is optional and might not happen
+            unicodeFilePath.setLength(0);
+        }
+        String fileNameToUse;
+        String pathToUse;
+        if (unicodeFilePath.length() > 0){
+            String p = unicodeFilePath.toString();
+            fileNameToUse = p;
+            pathToUse = p;
+        } else {
+            //readAnsiString never returns null, so these can be used directly
+            fileNameToUse = displayName;
+            pathToUse = ansiFilePath;
+        }
+        metadata.set(Metadata.RESOURCE_NAME_KEY, FilenameUtils.getName(fileNameToUse));
+        metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, pathToUse);
+
+        return objBytes;
+    }
+
+
+    /**
+     * Reads a little endian unsigned 16 bit value.
+     *
+     * @param is stream to read two bytes from
+     * @return value in the range 0..65535
+     * @throws IOException if the stream ends before both bytes were read
+     */
+    private int readUShort(InputStream is) throws IOException {
+        int lo = is.read();
+        int hi = is.read();
+        //test the raw read results: once hi is scaled, -1 can no longer be seen
+        if (lo == -1 || hi == -1) {
+            throw new IOException("Hit end of stream before reading little endian unsigned short.");
+        }
+        return hi*256+lo;
+    }
+
+    /**
+     * Reads a little endian unsigned 32 bit value.
+     *
+     * @param is stream to read four bytes from
+     * @return value in the range 0..4294967295
+     * @throws IOException if the stream ends before four bytes were read
+     */
+    private long readUInt(InputStream is) throws IOException {
+        long value = 0;
+        //least significant byte comes first
+        for (int shift = 0; shift < 32; shift += 8) {
+            int b = is.read();
+            if (b == -1) {
+                throw new IOException("Hit end of stream before finishing little endian unsigned int.");
+            }
+            value += ((long)b) << shift;
+        }
+        return value;
+    }
+
+    /**
+     * Reads a NUL terminated ansi string and decodes it with the same
+     * charset (WIN_ASCII) that is used for length prefixed strings.
+     *
+     * @param is stream positioned at the first byte of the string
+     * @return the decoded string without the terminator; never null
+     * @throws IOException if the stream ends before the terminator
+     */
+    private String readAnsiString(InputStream is) throws IOException {
+        ByteArrayOutputStream raw = new ByteArrayOutputStream();
+        int c = is.read();
+        while (c > 0) {
+            raw.write(c);
+            c = is.read();
+        }
+        if (c == -1) {
+            throw new IOException("Hit end of stream before end of AnsiString");
+        }
+        //decode the collected bytes instead of casting each byte to a char,
+        //which would mis-map the bytes 0x80-0x9F
+        return raw.toString(WIN_ASCII);
+    }
+
+    /**
+     * Reads a string stored as a little endian unsigned int length followed
+     * by that many bytes, and decodes the bytes as WIN_ASCII.
+     *
+     * @param is stream positioned at the length field
+     * @return the decoded string; never null
+     * @throws IOException if the stream ends early or the length is out of bounds
+     */
+    private String readLengthPrefixedAnsiString(InputStream is) throws IOException {
+        long len = readUInt(is);
+        byte[] bytes = readBytes(is, len);
+        try {
+            return new String(bytes, WIN_ASCII);
+        } catch (UnsupportedEncodingException e) {
+            //shouldn't ever happen; keep the cause in case it does
+            throw new IOException("Unsupported encoding", e);
+        }
+    }
+
+
+    /**
+     * Reads exactly len bytes from the stream.
+     *
+     * @param is stream to read from
+     * @param len number of bytes to read; zero yields an empty array
+     * @return array of length len
+     * @throws IOException if len is out of bounds (see initByteArray) or
+     *  the stream ends before len bytes were read
+     */
+    private byte[] readBytes(InputStream is, long len) throws IOException {
+        //initByteArray tests for "reading of too many bytes"
+        byte[] bytes = initByteArray(len);
+        //loop: one read() may deliver fewer bytes than requested, and a zero
+        //length request at end of stream must not count as a failure
+        int total = 0;
+        while (total < bytes.length) {
+            int n = is.read(bytes, total, bytes.length - total);
+            if (n < 0) {
+                throw new IOException("Hit end of stream before reading all bytes");
+            }
+            total += n;
+        }
+
+        return bytes;
+    }
+    
+    /**
+     * Allocates a byte array after checking the requested size.
+     *
+     * @param len requested length
+     * @return new array of that length
+     * @throws IOException if len is negative or larger than
+     *  RTFParser.getMaxBytesForEmbeddedObject()
+     */
+    private byte[] initByteArray(long len) throws IOException {
+        boolean withinBounds = len >= 0 && len <= RTFParser.getMaxBytesForEmbeddedObject();
+        if (!withinBounds) {
+            throw new IOException("Requested length for reading bytes is out of bounds: " + len);
+        }
+        return new byte[(int)len];
+    }
+}
+

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java?rev=1588005&r1=1588004&r2=1588005&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java 
(original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/RTFParser.java 
Wed Apr 16 18:04:20 2014
@@ -46,13 +46,43 @@ public class RTFParser extends AbstractP
         return SUPPORTED_TYPES;
     }
 
+    /**
+     * Upper bound, in bytes, for a single embedded object/pict
+     * (default: 20MB).  The field is static, so one value is shared by
+     * every caller of {@link #getMaxBytesForEmbeddedObject()}.
+     */
+    private static int EMB_OBJ_MAX_BYTES = 20*1024*1024; //20MB
+
+    /**
+     * Sets the maximum number of bytes allowed for one embedded object.
+     * <p>
+     * Bytes for embedded objects are currently cached in memory.  If
+     * something goes wrong while parsing an embedded object, a bogus read
+     * length could otherwise request a huge buffer and exhaust the heap.
+     * <p>
+     * The value is stored as given; no range check is applied here.
+     *
+     * @param max maximum number of bytes to allow for embedded objects.  If
+     * the embedded object has more than this number of bytes, skip it.
+     */
+    public static void setMaxBytesForEmbeddedObject(int max) {
+        EMB_OBJ_MAX_BYTES = max;
+    }
+    
+    /**
+     * See {@link #setMaxBytesForEmbeddedObject(int)}.
+     *
+     * @return maximum number of bytes currently allowed for an embedded
+     * object.
+     */
+    public static int getMaxBytesForEmbeddedObject() {
+        return EMB_OBJ_MAX_BYTES;
+    }
+
     public void parse(
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
         throws IOException, SAXException, TikaException {
         TaggedInputStream tagged = new TaggedInputStream(stream);
         try {
-            final TextExtractor ert = new TextExtractor(new 
XHTMLContentHandler(handler, metadata), metadata);
+            RTFEmbObjHandler embObjHandler = new RTFEmbObjHandler(handler,
+                    metadata, context);
+            final TextExtractor ert = 
+                    new TextExtractor(new XHTMLContentHandler(handler, 
+                    metadata), metadata, embObjHandler);
             ert.extract(stream);
             metadata.add(Metadata.CONTENT_TYPE, "application/rtf");
         } catch (IOException e) {

Modified: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java?rev=1588005&r1=1588004&r2=1588005&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 (original)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
 Wed Apr 16 18:04:20 2014
@@ -130,6 +130,7 @@ final class TextExtractor {
     private static final Charset GB2312 = getCharset("GB2312");
     private static final Charset MS949 = getCharset("ms949");
 
+    private int written = 0;
     // Hold pending bytes (encoded in the current charset)
     // for text output:
     private byte[] pendingBytes = new byte[16];
@@ -205,6 +206,7 @@ final class TextExtractor {
 
     private final XHTMLContentHandler out;
     private final Metadata metadata;
+    private final RTFEmbObjHandler embObjHandler;
 
     // Used when extracting CREATION date:
     private int year, month, day, hour, minute;
@@ -327,9 +329,11 @@ final class TextExtractor {
         ANSICPG_MAP.put(57011, WINDOWS_57011);   // Punjabi
     }
 
-    public TextExtractor(XHTMLContentHandler out, Metadata metadata) {
+    public TextExtractor(XHTMLContentHandler out, Metadata metadata,
+            RTFEmbObjHandler embObjHandler) {
         this.metadata = metadata;
         this.out = out;
+        this.embObjHandler = embObjHandler;
     }
 
     public boolean isIgnoringLists() {
@@ -340,7 +344,7 @@ final class TextExtractor {
         this.ignoreLists = ignore;
     }
 
-    private static boolean isHexChar(int ch) {
+    protected static boolean isHexChar(int ch) {
         return (ch >= '0' && ch <= '9') ||
             (ch >= 'a' && ch <= 'f') ||
             (ch >= 'A' && ch <= 'F');
@@ -355,7 +359,7 @@ final class TextExtractor {
         return ch >= '0' && ch <= '9';
     }
 
-    private static int hexValue(int ch) {
+    protected static int hexValue(int ch) {
         if (ch >= '0' && ch <= '9') {
             return ch - '0';
         } else if (ch >= 'a' && ch <= 'z') {
@@ -384,16 +388,19 @@ final class TextExtractor {
         if (pendingCharCount != 0) {
             pushChars();
         }
-
-        // Save the byte in pending buffer:
-        if (pendingByteCount == pendingBytes.length) {
-            // Gradual but exponential growth:
-            final byte[] newArray = new byte[(int) (pendingBytes.length*1.25)];
-            System.arraycopy(pendingBytes, 0, newArray, 0, 
pendingBytes.length);
-            pendingBytes = newArray;
-            pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
-        }
-        pendingBytes[pendingByteCount++] = (byte) b;
+        if (groupState.pictDepth > 0) {
+            embObjHandler.writeMetadataChar((char)b);
+        } else {
+            // Save the byte in pending buffer:
+            if (pendingByteCount == pendingBytes.length) {
+                // Gradual but exponential growth:
+                final byte[] newArray = new byte[(int) 
(pendingBytes.length*1.25)];
+                System.arraycopy(pendingBytes, 0, newArray, 0, 
pendingBytes.length);
+                pendingBytes = newArray;
+                pendingByteBuffer = ByteBuffer.wrap(pendingBytes);
+            }
+            pendingBytes[pendingByteCount++] = (byte) b;
+       }
     }
 
    // Buffers a byte as part of a control word:
@@ -417,6 +424,8 @@ final class TextExtractor {
 
         if (inHeader || fieldState == 1) {
             pendingBuffer.append(ch);
+        } else if (groupState.sn == true || groupState.sv == true) {
+            embObjHandler.writeMetadataChar(ch);
         } else {
             if (pendingCharCount == pendingChars.length) {
                 // Gradual but exponential growth:
@@ -467,7 +476,12 @@ final class TextExtractor {
                     // parsed document closing brace
                     break;
                 }
-            } else if (b != '\r' && b != '\n' && (!groupState.ignore || 
nextMetaData != null)) {
+            } else if (groupState.objdata == true ||
+                groupState.pictDepth == 1) {
+                embObjHandler.writeHexChar(b);
+            } else if (b != '\r' && b != '\n' 
+                    && (!groupState.ignore || nextMetaData != null ||
+                    groupState.sn == true || groupState.sv == true)) {
                 // Linefeed and carriage return are not
                 // significant
                 if (ansiSkip != 0) {
@@ -924,7 +938,7 @@ final class TextExtractor {
         // in the header can be unicode escaped as well:
         if (equals("u")) {
             // Unicode escape
-            if (!groupState.ignore) {
+            if (!groupState.ignore || groupState.sv || groupState.sn) {
                 final char utf16CodeUnit = (char) (param & 0xffff);
                 addOutputChar(utf16CodeUnit);
             }
@@ -938,14 +952,25 @@ final class TextExtractor {
             groupState.ucSkip = (int) param;
         } else if (equals("bin")) {
             if (param >= 0) {
-                int bytesToRead = param;
-                byte[] tmpArray = new byte[Math.min(1024, bytesToRead)];
-                while (bytesToRead > 0) {
-                    int r = in.read(tmpArray, 0, Math.min(bytesToRead, 
tmpArray.length));
-                    if (r < 0) {
-                        throw new TikaException("unexpected end of file: need 
" + param + " bytes of binary data, found " + (param-bytesToRead));
+                if (groupState.pictDepth == 1) {
+                    try{
+                        embObjHandler.writeBytes(in, param);
+                    } catch (IOException e) {
+                        //param was out of bounds or something went wrong 
during writing.
+                        //skip this obj and move on
+                        //TODO: log.warn
+                        embObjHandler.reset();
+                    }
+                } else {
+                    int bytesToRead = param;
+                    byte[] tmpArray = new byte[Math.min(1024, bytesToRead)];
+                    while (bytesToRead > 0) {
+                        int r = in.read(tmpArray, 0, Math.min(bytesToRead, 
tmpArray.length));
+                        if (r < 0) {
+                            throw new TikaException("unexpected end of file: 
need " + param + " bytes of binary data, found " + (param-bytesToRead));
+                        }
+                        bytesToRead -= r;
                     }
-                    bytesToRead -= r;
                 }
             } else {
                 // log some warning?
@@ -1156,11 +1181,27 @@ final class TextExtractor {
             // TODO: we should produce a table output here?
             //addOutputChar(' ');
             endParagraph(true);
+        } else if (equals("sp")) {
+            groupState.sp = true;
+        } else if (equals("sn")) {
+            embObjHandler.startSN();
+            groupState.sn = true;
+        } else if (equals("sv")) {
+            embObjHandler.startSV();
+            groupState.sv = true;
+        } else if (equals("object")) {
+            pushText();
+            embObjHandler.setInObject(true);
+            groupState.object = true;
+        } else if (equals("objdata")) {
+            groupState.objdata = true;
+            embObjHandler.startObjData();
         } else if (equals("pict")) {
             pushText();
             // TODO: create img tag?  but can that support
             // embedded image data?
-            groupState.ignore = true;
+            groupState.pictDepth = 1;
+            embObjHandler.startPict();
         } else if (equals("line")) {
             if (!ignored) {
                 addOutputChar('\n');
@@ -1309,6 +1350,25 @@ final class TextExtractor {
 
         assert groupState.depth > 0;
         ansiSkip = 0;
+        
+        if (groupState.objdata == true) {
+            embObjHandler.handleCompletedObject();
+            groupState.objdata = false;
+        } else if (groupState.pictDepth > 0) {
+            if (groupState.sn == true) {
+                embObjHandler.endSN();
+            } else if (groupState.sv == true) {
+                embObjHandler.endSV();
+            } else if (groupState.sp == true) {
+                embObjHandler.endSP();
+            } else if (groupState.pictDepth == 1) {
+                embObjHandler.handleCompletedObject();
+            }
+        }
+
+        if (groupState.object == true) {
+            embObjHandler.setInObject(false);
+        }
 
         // Be robust if RTF doc is corrupt (has too many
         // closing }s):

Modified: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java?rev=1588005&r1=1588004&r2=1588005&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
 (original)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/rtf/RTFParserTest.java
 Wed Apr 16 18:04:20 2014
@@ -19,22 +19,41 @@ package org.apache.tika.parser.rtf;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.assertNotNull;
 
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.io.StringWriter;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
 
 import org.apache.tika.Tika;
 import org.apache.tika.TikaTest;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.extractor.ContainerExtractor;
+import org.apache.tika.extractor.ParserContainerExtractor;
+import org.apache.tika.io.FilenameUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
 import org.apache.tika.metadata.OfficeOpenXMLCore;
+import org.apache.tika.metadata.RTFMetadata;
 import org.apache.tika.metadata.TikaCoreProperties;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
 import org.junit.Test;
+import org.xml.sax.ContentHandler;
 
 /**
  * Junit test class for the Tika {@link RTFParser}
@@ -356,7 +375,24 @@ public class RTFParserTest extends TikaT
     // TIKA-782
     @Test
     public void testBinControlWord() throws Exception {
-        
assertTrue(getXML("testBinControlWord.rtf").xml.indexOf("\u00ff\u00ff\u00ff\u00ff")
 == -1);
+        ByteCopyingHandler embHandler = new ByteCopyingHandler();
+        TikaInputStream tis = null;
+        try {
+            ContainerExtractor ex = new ParserContainerExtractor();
+            tis = TikaInputStream.get(getResourceAsStream(
+                    "/test-documents/testBinControlWord.rtf"));
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, embHandler);
+        } finally {
+            //tis is still null if the stream could not be opened; closing
+            //it blindly would replace the real failure with an NPE
+            if (tis != null) {
+                tis.close();
+            }
+        }
+        assertEquals(1, embHandler.bytes.size());
+
+        byte[] bytes = embHandler.bytes.get(0);
+        assertEquals(10, bytes.length);
+        //byte 4 is a literal '}' (ASCII 125)
+        assertEquals(125, (int)bytes[4]);
+        //make sure that at least the last value is correct
+        assertEquals(-1, (int)bytes[9]);
     }
 
     // TIKA-999
@@ -377,6 +413,167 @@ public class RTFParserTest extends TikaT
         assertContains("Body", content);
     }
 
+    //TIKA-1010
+    /**
+     * Extracts the embedded files of testRTFEmbeddedFiles.rtf twice: first
+     * with a tracker constructed with the emf and msmetafile media types,
+     * checking every reported name and media type in order; then with a
+     * default tracker, checking the total count and the last two names.
+     */
+    @Test
+    public void testEmbeddedMonster() throws Exception {
+        Set<MediaType> skipTypes = new HashSet<MediaType>();
+        skipTypes.add(MediaType.parse("application/x-emf"));
+        skipTypes.add(MediaType.parse("application/x-msmetafile"));
+
+        //expected names, in extraction order
+        List<String> trueNames = new ArrayList<String>();
+        trueNames.add("file_0.doc");
+        trueNames.add("Hw.txt");
+        trueNames.add("file_1.xlsx");
+        trueNames.add("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip");
+        trueNames.add("html-within-zip.zip");
+        trueNames.add("text.html");
+        trueNames.add("testHTML_utf8_\u666E\u6797\u65AF\u987F.html");
+        trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
+        trueNames.add("file_2.xls");
+        trueNames.add("testMSG_\u666E\u6797\u65AF\u987F.msg");
+        trueNames.add("file_3.pdf");
+        trueNames.add("file_4.ppt");
+        trueNames.add("file_5.pptx");
+        trueNames.add("thumbnail_0.jpeg");
+        trueNames.add("file_6.doc");
+        trueNames.add("file_7.doc");
+        trueNames.add("file_8.docx");
+        trueNames.add("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
+
+        //expected media types, index-aligned with trueNames
+        List<String> trueTypes = new ArrayList<String>();
+        trueTypes.add("application/msword");
+        trueTypes.add("text/plain");
+        trueTypes.add("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet");
+        trueTypes.add("application/zip");
+        trueTypes.add("application/zip");
+        trueTypes.add("text/html");
+        trueTypes.add("text/html");
+        trueTypes.add("image/jpeg");
+        trueTypes.add("application/vnd.ms-excel");
+        trueTypes.add("application/vnd.ms-outlook");
+        trueTypes.add("application/pdf");
+        trueTypes.add("application/vnd.ms-powerpoint");
+        trueTypes.add("application/vnd.openxmlformats-officedocument.presentationml.presentation");
+        trueTypes.add("image/jpeg");
+        trueTypes.add("application/msword");
+        trueTypes.add("application/msword");
+        trueTypes.add("application/vnd.openxmlformats-officedocument.wordprocessingml.document");
+        trueTypes.add("image/jpeg");
+
+        TrackingHandler tracker = new TrackingHandler(skipTypes);
+        TikaInputStream tis = null;
+        try {
+            ContainerExtractor ex = new ParserContainerExtractor();
+            tis = TikaInputStream.get(getResourceAsStream(
+                    "/test-documents/testRTFEmbeddedFiles.rtf"));
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+
+        } finally {
+            //tis is still null if the stream could not be opened; closing
+            //it blindly would replace the real failure with an NPE
+            if (tis != null) {
+                tis.close();
+            }
+        }
+
+        assertEquals(trueNames.size(), tracker.filenames.size());
+        assertEquals(trueTypes.size(), tracker.mediaTypes.size());
+        for (int i = 0; i < tracker.filenames.size(); i++) {
+            String expectedName = trueNames.get(i);
+            if (expectedName == null) {
+                assertNull(tracker.filenames.get(i));
+            } else {
+                assertNotNull(tracker.filenames.get(i));
+                //necessary to getName() because MSOffice extractor includes
+                //directory: _1457338524/HW.txt
+                assertEquals("filename equals ",
+                        expectedName,
+                        FilenameUtils.getName(tracker.filenames.get(i)));
+            }
+            assertEquals(trueTypes.get(i),
+                    tracker.mediaTypes.get(i).toString());
+        }
+
+        //second pass: default tracker, no skip types passed in
+        tracker = new TrackingHandler();
+        tis = null;
+        try {
+            ContainerExtractor ex = new ParserContainerExtractor();
+            tis = TikaInputStream.get(getResourceAsStream(
+                    "/test-documents/testRTFEmbeddedFiles.rtf"));
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+
+        } finally {
+            if (tis != null) {
+                tis.close();
+            }
+        }
+        assertEquals(47, tracker.filenames.size());
+        assertEquals("thumbnail_26.emf", tracker.filenames.get(45));
+        assertEquals("thumbnail_27.wmf", tracker.filenames.get(46));
+    }
+    
+    //TIKA-1010 test regular (not "embedded") images/picts
+    //NOTE(review): this method had no @Test annotation, so JUnit 4 never
+    //ran it; the expected metadata counts below were therefore never
+    //exercised -- confirm them on the first run.
+    @Test
+    public void testRegularImages() throws Exception {
+        Parser base = new AutoDetectParser();
+        ParseContext ctx = new ParseContext();
+        RecursiveMetadataParser parser =
+                new RecursiveMetadataParser(base, false);
+        ctx.set(org.apache.tika.parser.Parser.class, parser);
+        TikaInputStream tis = null;
+        ContentHandler handler = new BodyContentHandler();
+        Metadata rootMetadata = new Metadata();
+        rootMetadata.add(Metadata.RESOURCE_NAME_KEY,
+                "testRTFRegularImages.rtf");
+        try {
+            tis = TikaInputStream.get(getResourceAsStream(
+                    "/test-documents/testRTFRegularImages.rtf"));
+            parser.parse(tis, handler, rootMetadata, ctx);
+        } finally {
+            //tis is still null if the stream could not be opened; closing
+            //it blindly would replace the real failure with an NPE
+            if (tis != null) {
+                tis.close();
+            }
+        }
+        List<Metadata> metadatas =  parser.getAllMetadata();
+
+        //("testJPEG_EXIF_\u666E\u6797\u65AF\u987F.jpg");
+        Metadata meta_jpg_exif = metadatas.get(0);
+        //("testJPEG_\u666E\u6797\u65AF\u987F.jpg");
+        Metadata meta_jpg = metadatas.get(2);
+
+        assertNotNull(meta_jpg_exif);
+        assertNotNull(meta_jpg);
+        assertTrue(Arrays.asList(meta_jpg_exif.getValues("dc:subject"))
+                .contains("serbor"));
+        assertTrue(meta_jpg.get("Comments").contains("Licensed to the Apache"));
+        //make sure old metadata doesn't linger between objects
+        assertFalse(Arrays.asList(meta_jpg.getValues("dc:subject"))
+                .contains("serbor"));
+        assertEquals("false", meta_jpg.get(RTFMetadata.THUMBNAIL));
+        assertEquals("false", meta_jpg_exif.get(RTFMetadata.THUMBNAIL));
+
+        assertEquals(40, meta_jpg.names().length);
+        //was asserted against meta_jpg a second time, which could never
+        //hold together with the line above
+        assertEquals(105, meta_jpg_exif.names().length);
+    }
+    
+    //TIKA-1010 test linked embedded doc
+    @Test
+    public void testEmbeddedLinkedDocument() throws Exception {
+        Set<MediaType> skipTypes = new HashSet<MediaType>();
+        skipTypes.add(MediaType.parse("application/x-emf"));
+        skipTypes.add(MediaType.parse("application/x-msmetafile"));
+
+        TrackingHandler tracker = new TrackingHandler(skipTypes);
+        TikaInputStream tis = null;
+        try {
+            ContainerExtractor ex = new ParserContainerExtractor();
+            tis = TikaInputStream.get(getResourceAsStream(
+                    "/test-documents/testRTFEmbeddedLink.rtf"));
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+
+        } finally {
+            //tis is still null if the stream could not be opened; closing
+            //it blindly would replace the real failure with an NPE
+            if (tis != null) {
+                tis.close();
+            }
+        }
+        //should gracefully skip link and not throw NPE, IOEx, etc
+        assertEquals(0, tracker.filenames.size());
+
+        tracker = new TrackingHandler();
+        tis = null;
+        try {
+            ContainerExtractor ex = new ParserContainerExtractor();
+            tis = TikaInputStream.get(getResourceAsStream(
+                    "/test-documents/testRTFEmbeddedLink.rtf"));
+            assertEquals(true, ex.isSupported(tis));
+            ex.extract(tis, ex, tracker);
+        } finally {
+            if (tis != null) {
+                tis.close();
+            }
+        }
+        //with a default tracker two embedded files are reported
+        //(presumably the emf/wmf items passed as skip types above --
+        //TODO confirm); the link itself must still not cause an exception
+        assertEquals(2, tracker.filenames.size());
+    }
+
     private Result getResult(String filename) throws Exception {
         File file = getResourceAsFile("/test-documents/" + filename);

svn commit: r1588005 [1/4] - in /tika/trunk: ./ tika-core/src/main/java/org/apache/tika/io/ tika-core/src/main/java/org/apache/tika/metadata/ tika-core/src/test/java/org/apache/tika/io/ tika-parsers/src/main/java/org/apache/tika/parser/rtf/ tika-parser...

Reply via email to