Author: nick
Date: Fri Jan  8 16:44:08 2010
New Revision: 897258

URL: http://svn.apache.org/viewvc?rev=897258&view=rev
Log:
Add embedded (attachment) support to the Outlook text extractor

Modified:
    poi/trunk/src/documentation/content/xdocs/status.xml
    poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
    
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
    
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java

Modified: poi/trunk/src/documentation/content/xdocs/status.xml
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/documentation/content/xdocs/status.xml?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/documentation/content/xdocs/status.xml (original)
+++ poi/trunk/src/documentation/content/xdocs/status.xml Fri Jan  8 16:44:08 
2010
@@ -34,7 +34,8 @@
 
     <changes>
         <release version="3.7-SNAPSHOT" date="2010-??-??">
-           <action dev="POI-DEVELOPERS" type="fix">Add a text extractor 
(OutlookTextExtractor) to HSMF for simpler extraction of text from .msg 
files</action>
+           <action dev="POI-DEVELOPERS" type="add">Support attachments as 
embeded documents within the new OutlookTextExtractor</action>
+           <action dev="POI-DEVELOPERS" type="add">Add a text extractor 
(OutlookTextExtractor) to HSMF for simpler extraction of text from .msg 
files</action>
            <action dev="POI-DEVELOPERS" type="fix">Some improvements to HSMF 
parsing of .msg files</action>
            <action dev="POI-DEVELOPERS" type="fix">Initialise the link type of 
HSSFHyperLink, so that getType() on it works</action>
            <action dev="POI-DEVELOPERS" type="fix">48425 - improved 
performance of DateUtil.isCellDateFormatted()  </action>

Modified: 
poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java 
(original)
+++ poi/trunk/src/ooxml/java/org/apache/poi/extractor/ExtractorFactory.java Fri 
Jan  8 16:44:08 2010
@@ -16,6 +16,7 @@
 ==================================================================== */
 package org.apache.poi.extractor;
 
+import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
@@ -31,6 +32,8 @@
 import org.apache.poi.POIXMLTextExtractor;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hslf.extractor.PowerPointExtractor;
+import org.apache.poi.hsmf.MAPIMessage;
+import org.apache.poi.hsmf.datatypes.AttachmentChunks;
 import org.apache.poi.hsmf.extractor.OutlookTextExtactor;
 import org.apache.poi.hssf.extractor.ExcelExtractor;
 import org.apache.poi.hwpf.extractor.WordExtractor;
@@ -139,9 +142,14 @@
                        if(entry.getName().equals("VisioDocument")) {
                                return new VisioTextExtractor(poifsDir, fs);
                        }
-                       if(entry.getName().equals("__substg1.0_1000001E") ||
+                       if(
+                             entry.getName().equals("__substg1.0_1000001E") ||
+               entry.getName().equals("__substg1.0_1000001F") ||
                              entry.getName().equals("__substg1.0_0047001E") ||
-                             entry.getName().equals("__substg1.0_0037001E")) {
+               entry.getName().equals("__substg1.0_0047001F") ||
+                             entry.getName().equals("__substg1.0_0037001E") ||
+               entry.getName().equals("__substg1.0_0037001F")
+                       ) {
                           return new OutlookTextExtactor(poifsDir, fs);
                        }
                }
@@ -157,8 +165,12 @@
         *  {...@link POITextExtractor} for each embeded file.
         */
        public static POITextExtractor[] 
getEmbededDocsTextExtractors(POIOLE2TextExtractor ext) throws IOException {
-               // Find all the embeded directories
+          // All the embded directories we spotted
                ArrayList<Entry> dirs = new ArrayList<Entry>();
+               // For anything else not directly held in as a POIFS directory
+               ArrayList<InputStream> nonPOIFS = new ArrayList<InputStream>();
+               
+      // Find all the embeded directories
                POIFSFileSystem fs = ext.getFileSystem();
                if(fs == null) {
                        throw new IllegalStateException("The extractor didn't 
know which POIFS it came from!");
@@ -189,20 +201,44 @@
                } else if(ext instanceof PowerPointExtractor) {
                        // Tricky, not stored directly in poifs
                        // TODO
+               } else if(ext instanceof OutlookTextExtactor) {
+                  // Stored in the Attachment blocks
+                  MAPIMessage msg = 
((OutlookTextExtactor)ext).getMAPIMessage();
+                  for(AttachmentChunks attachment : msg.getAttachmentFiles()) {
+                     if(attachment.attachData != null) {
+                     byte[] data = attachment.attachData.getValue();
+                     nonPOIFS.add( new ByteArrayInputStream(data) );
+                     }
+                  }
                }
                
                // Create the extractors
-               if(dirs == null || dirs.size() == 0) {
+               if(
+                     (dirs == null || dirs.size() == 0) &&
+                     (nonPOIFS == null || nonPOIFS.size() == 0)
+               ){
                        return new POITextExtractor[0];
                }
                
-               POITextExtractor[] te = new POITextExtractor[dirs.size()];
-               for(int i=0; i<te.length; i++) {
-                       te[i] = createExtractor(
+               ArrayList<POITextExtractor> e = new 
ArrayList<POITextExtractor>();
+               for(int i=0; i<dirs.size(); i++) {
+                       e.add( createExtractor(
                                        (DirectoryNode)dirs.get(i), 
ext.getFileSystem()
-                       );
+                       ) );
+               }
+               for(int i=0; i<nonPOIFS.size(); i++) {
+                  try {
+                     e.add( createExtractor(nonPOIFS.get(i)) );
+         } catch(IllegalArgumentException ie) {
+            // Ignore, just means it didn't contain
+            //  a format we support as yet
+                  } catch(XmlException xe) {
+                     throw new IOException(xe.getMessage());
+                  } catch(OpenXML4JException oe) {
+                     throw new IOException(oe.getMessage());
+                  }
                }
-               return te;
+               return e.toArray(new POITextExtractor[e.size()]);
        }
 
        /**

Modified: 
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- 
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
 (original)
+++ 
poi/trunk/src/ooxml/testcases/org/apache/poi/extractor/TestExtractorFactory.java
 Fri Jan  8 16:44:08 2010
@@ -59,6 +59,8 @@
    private File pptx;
 
    private File msg;
+   private File msgEmb;
+   
    private File vsd;
 
    protected void setUp() throws Exception {
@@ -86,6 +88,7 @@
       
       POIDataSamples olTests = POIDataSamples.getHSMFInstance();
       msg = olTests.getFile("quick.msg");
+      msgEmb = olTests.getFile("attachment_test_msg.msg");
    }
 
    public void testFile() throws Exception {
@@ -404,9 +407,25 @@
       assertEquals(1, numPpt);
       assertEquals(2, numXls);
       assertEquals(1, numWord);
+      
+      // Outlook
+      ext = (OutlookTextExtactor)
+      ExtractorFactory.createExtractor(msgEmb);
+      embeds = ExtractorFactory.getEmbededDocsTextExtractors(ext);
+
+      numWord = 0; numXls = 0; numPpt = 0;
+      assertEquals(1, embeds.length);
+      for(int i=0; i<embeds.length; i++) {
+         assertTrue(embeds[i].getText().length() > 20);
+         if(embeds[i] instanceof PowerPointExtractor) numPpt++;
+         else if(embeds[i] instanceof ExcelExtractor) numXls++;
+         else if(embeds[i] instanceof WordExtractor) numWord++;
+      }
+      assertEquals(0, numPpt);
+      assertEquals(0, numXls);
+      assertEquals(1, numWord);
 
       // TODO - PowerPoint
       // TODO - Visio
-      // TODO - Outlook
    }
 }

Modified: 
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
URL: 
http://svn.apache.org/viewvc/poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java?rev=897258&r1=897257&r2=897258&view=diff
==============================================================================
--- 
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
 (original)
+++ 
poi/trunk/src/scratchpad/src/org/apache/poi/hsmf/extractor/OutlookTextExtactor.java
 Fri Jan  8 16:44:08 2010
@@ -45,6 +45,13 @@
    }
 
    /**
+    * Returns the underlying MAPI message
+    */
+   public MAPIMessage getMAPIMessage() {
+      return (MAPIMessage)document;
+   }
+   
+   /**
     * Outputs something a little like a RFC822 email
     */
    public String getText() {



---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to