Author: kwright
Date: Tue Apr 25 21:52:49 2017
New Revision: 1792663

URL: http://svn.apache.org/viewvc?rev=1792663&view=rev
Log:
Fix for CONNECTORS-1417

Modified:
    manifoldcf/trunk/CHANGES.txt
    
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
    
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1792663&r1=1792662&r2=1792663&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Apr 25 21:52:49 2017
@@ -3,6 +3,9 @@ $Id$
 
 ======================= 2.8-dev =====================
 
+CONNECTORS-1417: Accept multipart/alternative email format.
+(Cihad Guzel)
+
 CONNECTORS-1416: Use a 4-character year for email dates.
 (Cihad Guzel)
 

Modified: 
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java?rev=1792663&r1=1792662&r2=1792663&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
 (original)
+++ 
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
 Tue Apr 25 21:52:49 2017
@@ -113,6 +113,8 @@ public class EmailConfig {
   
   public static final String MIMETYPE_TEXT_PLAIN = "text/plain";
   public static final String MIMETYPE_HTML = "text/html";
+  public static final String MIMETYPE_MULTIPART_GENERIC = "multipart/*";
+  public static final String MIMETYPE_MULTIPART_ALTERNATIVE = 
"multipart/alternative";
   
   // Fields
   

Modified: 
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
URL: 
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java?rev=1792663&r1=1792662&r2=1792663&view=diff
==============================================================================
--- 
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
 (original)
+++ 
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
 Tue Apr 25 21:52:49 2017
@@ -30,7 +30,6 @@ import org.apache.manifoldcf.crawler.int
 import org.apache.manifoldcf.crawler.system.Logging;
 
 import javax.mail.*;
-import javax.mail.internet.MimeBodyPart;
 import javax.mail.internet.MimeMessage;
 import javax.mail.search.*;
 import java.io.ByteArrayInputStream;
@@ -684,14 +683,18 @@ public class EmailConnector extends org.
 
               //Content includes both body and attachments,
               //Body will be set as content and attachments will be indexed as 
separate documents.
-              InputStream is = new 
ByteArrayInputStream(extractBodyContent(msg).getBytes(StandardCharsets.UTF_8));
-              try {
-                rd.setBinary(is, fileLength);
-                activities.ingestDocumentWithException(documentIdentifier, 
version, msgURL, rd);
-                errorCode = "OK";
-                fileLengthLong = new Long(fileLength);
-              } finally {
-                is.close();
+              final EmailContent bodyContent = extractBodyContent(msg);
+              if(bodyContent != null) {
+                rd.setMimeType(bodyContent.getMimeType());
+                InputStream is = new 
ByteArrayInputStream(bodyContent.getContent().getBytes(StandardCharsets.UTF_8));
+                try {
+                  rd.setBinary(is, fileLength);
+                  activities.ingestDocumentWithException(documentIdentifier, 
version, msgURL, rd);
+                  errorCode = "OK";
+                  fileLengthLong = new Long(fileLength);
+                } finally {
+                  is.close();
+                }
               }
 
               // If we're supposed to deal with attachments, this is the time 
to queue them up
@@ -921,27 +924,65 @@ public class EmailConnector extends org.
 
   }
 
-  private String extractBodyContent(Message msg) throws MessagingException, 
IOException {
-    String bodyContent = null;
+  private EmailContent getContent(Part part) throws MessagingException, 
IOException {
+    if (part.isMimeType(EmailConfig.MIMETYPE_TEXT_PLAIN)) {
+      return new EmailContent(part.getContent().toString());
+    } else if(part.isMimeType(EmailConfig.MIMETYPE_HTML)) {
+      return new EmailContent(part.getContent().toString(), 
EmailConfig.MIMETYPE_HTML);
+    }
+
+    if (part.isMimeType(EmailConfig.MIMETYPE_MULTIPART_ALTERNATIVE)) {
+      // prefer html text over plain text
+      Multipart mp = (Multipart) part.getContent();
+      EmailContent emailContent = null;
+      for (int i = 0; i < mp.getCount(); i++) {
+        Part bodyPart = mp.getBodyPart(i);
+        if (bodyPart.isMimeType(EmailConfig.MIMETYPE_TEXT_PLAIN)) {
+          if (emailContent == null) {
+            emailContent = getContent(bodyPart);
+          }
+          continue;
+        } else if (bodyPart.isMimeType(EmailConfig.MIMETYPE_HTML)) {
+          emailContent = getContent(bodyPart);
+          if (emailContent != null) {
+            return emailContent;
+          }
+        } else {
+          return getContent(bodyPart);
+        }
+      }
+      return emailContent;
+    } else if (part.isMimeType(EmailConfig.MIMETYPE_MULTIPART_GENERIC)) {
+      Multipart mp = (Multipart) part.getContent();
+      for (int i = 0; i < mp.getCount(); i++) {
+        EmailContent emailContent = getContent(mp.getBodyPart(i));
+        if (emailContent != null) {
+          return emailContent;
+        }
+      }
+    }
+    return null;
+  }
+
+  private EmailContent extractBodyContent(Message msg) throws 
MessagingException, IOException {
+    EmailContent emailContent = null;
     Object o = msg.getContent();
     if (o instanceof Multipart) {
       Multipart mp = (Multipart) msg.getContent();
       for (int k = 0, n = mp.getCount(); k < n; k++) {
         Part part = mp.getBodyPart(k);
         String disposition = part.getDisposition();
-        if ((disposition == null)) {
-          MimeBodyPart mbp = (MimeBodyPart) part;
-          if (mbp.isMimeType(EmailConfig.MIMETYPE_TEXT_PLAIN)) {
-            bodyContent = mbp.getContent().toString();
-          } else if (mbp.isMimeType(EmailConfig.MIMETYPE_HTML)) {
-            bodyContent = mbp.getContent().toString(); //handle html 
accordingly. Returns content with html tags
+        if (disposition == null) {
+          EmailContent content = getContent(part);
+          if (content != null) {
+            emailContent = content;
           }
         }
       }
     } else if (o instanceof String) {
-      bodyContent = (String)o;
+      emailContent = new EmailContent((String)o);
     }
-    return bodyContent;
+    return emailContent;
   }
 
   /**
@@ -2173,4 +2214,27 @@ public class EmailConnector extends org.
     }
   }
 
-}
\ No newline at end of file
+  private static class EmailContent {
+    private final String content;
+    private final String mimeType;
+
+    public EmailContent(final String content) {
+      this.content = content;
+      this.mimeType = EmailConfig.MIMETYPE_TEXT_PLAIN;
+    }
+
+    public EmailContent(final String content, final String mimetype) {
+      this.content = content;
+      this.mimeType = mimetype;
+    }
+
+    public String getContent() {
+      return content;
+    }
+
+    public String getMimeType() {
+      return mimeType;
+    }
+  }
+
+}


Reply via email to