Author: kwright
Date: Tue Apr 25 21:52:49 2017
New Revision: 1792663
URL: http://svn.apache.org/viewvc?rev=1792663&view=rev
Log:
Fix for CONNECTORS-1417
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1792663&r1=1792662&r2=1792663&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Tue Apr 25 21:52:49 2017
@@ -3,6 +3,9 @@ $Id$
======================= 2.8-dev =====================
+CONNECTORS-1417: Accept multipart/alternative email format.
+(Cihad Guzel)
+
CONNECTORS-1416: Use a 4-character year for email dates.
(Cihad Guzel)
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java?rev=1792663&r1=1792662&r2=1792663&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
Tue Apr 25 21:52:49 2017
@@ -113,6 +113,8 @@ public class EmailConfig {
public static final String MIMETYPE_TEXT_PLAIN = "text/plain";
public static final String MIMETYPE_HTML = "text/html";
+ public static final String MIMETYPE_MULTIPART_GENERIC = "multipart/*";
+ public static final String MIMETYPE_MULTIPART_ALTERNATIVE =
"multipart/alternative";
// Fields
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java?rev=1792663&r1=1792662&r2=1792663&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
Tue Apr 25 21:52:49 2017
@@ -30,7 +30,6 @@ import org.apache.manifoldcf.crawler.int
import org.apache.manifoldcf.crawler.system.Logging;
import javax.mail.*;
-import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.search.*;
import java.io.ByteArrayInputStream;
@@ -684,14 +683,18 @@ public class EmailConnector extends org.
//Content includes both body and attachments,
//Body will be set as content and attachments will be indexed as
separate documents.
- InputStream is = new
ByteArrayInputStream(extractBodyContent(msg).getBytes(StandardCharsets.UTF_8));
- try {
- rd.setBinary(is, fileLength);
- activities.ingestDocumentWithException(documentIdentifier,
version, msgURL, rd);
- errorCode = "OK";
- fileLengthLong = new Long(fileLength);
- } finally {
- is.close();
+ final EmailContent bodyContent = extractBodyContent(msg);
+ if(bodyContent != null) {
+ rd.setMimeType(bodyContent.getMimeType());
+ InputStream is = new
ByteArrayInputStream(bodyContent.getContent().getBytes(StandardCharsets.UTF_8));
+ try {
+ rd.setBinary(is, fileLength);
+ activities.ingestDocumentWithException(documentIdentifier,
version, msgURL, rd);
+ errorCode = "OK";
+ fileLengthLong = new Long(fileLength);
+ } finally {
+ is.close();
+ }
}
// If we're supposed to deal with attachments, this is the time
to queue them up
@@ -921,27 +924,65 @@ public class EmailConnector extends org.
}
- private String extractBodyContent(Message msg) throws MessagingException,
IOException {
- String bodyContent = null;
+ private EmailContent getContent(Part part) throws MessagingException,
IOException {
+ if (part.isMimeType(EmailConfig.MIMETYPE_TEXT_PLAIN)) {
+ return new EmailContent(part.getContent().toString());
+ } else if(part.isMimeType(EmailConfig.MIMETYPE_HTML)) {
+ return new EmailContent(part.getContent().toString(),
EmailConfig.MIMETYPE_HTML);
+ }
+
+ if (part.isMimeType(EmailConfig.MIMETYPE_MULTIPART_ALTERNATIVE)) {
+ // prefer html text over plain text
+ Multipart mp = (Multipart) part.getContent();
+ EmailContent emailContent = null;
+ for (int i = 0; i < mp.getCount(); i++) {
+ Part bodyPart = mp.getBodyPart(i);
+ if (bodyPart.isMimeType(EmailConfig.MIMETYPE_TEXT_PLAIN)) {
+ if (emailContent == null) {
+ emailContent = getContent(bodyPart);
+ }
+ continue;
+ } else if (bodyPart.isMimeType(EmailConfig.MIMETYPE_HTML)) {
+ emailContent = getContent(bodyPart);
+ if (emailContent != null) {
+ return emailContent;
+ }
+ } else {
+ return getContent(bodyPart);
+ }
+ }
+ return emailContent;
+ } else if (part.isMimeType(EmailConfig.MIMETYPE_MULTIPART_GENERIC)) {
+ Multipart mp = (Multipart) part.getContent();
+ for (int i = 0; i < mp.getCount(); i++) {
+ EmailContent emailContent = getContent(mp.getBodyPart(i));
+ if (emailContent != null) {
+ return emailContent;
+ }
+ }
+ }
+ return null;
+ }
+
+ private EmailContent extractBodyContent(Message msg) throws
MessagingException, IOException {
+ EmailContent emailContent = null;
Object o = msg.getContent();
if (o instanceof Multipart) {
Multipart mp = (Multipart) msg.getContent();
for (int k = 0, n = mp.getCount(); k < n; k++) {
Part part = mp.getBodyPart(k);
String disposition = part.getDisposition();
- if ((disposition == null)) {
- MimeBodyPart mbp = (MimeBodyPart) part;
- if (mbp.isMimeType(EmailConfig.MIMETYPE_TEXT_PLAIN)) {
- bodyContent = mbp.getContent().toString();
- } else if (mbp.isMimeType(EmailConfig.MIMETYPE_HTML)) {
- bodyContent = mbp.getContent().toString(); //handle html
accordingly. Returns content with html tags
+ if (disposition == null) {
+ EmailContent content = getContent(part);
+ if (content != null) {
+ emailContent = content;
}
}
}
} else if (o instanceof String) {
- bodyContent = (String)o;
+ emailContent = new EmailContent((String)o);
}
- return bodyContent;
+ return emailContent;
}
/**
@@ -2173,4 +2214,27 @@ public class EmailConnector extends org.
}
}
-}
\ No newline at end of file
+ private static class EmailContent {
+ private final String content;
+ private final String mimeType;
+
+ public EmailContent(final String content) {
+ this.content = content;
+ this.mimeType = EmailConfig.MIMETYPE_TEXT_PLAIN;
+ }
+
+ public EmailContent(final String content, final String mimetype) {
+ this.content = content;
+ this.mimeType = mimetype;
+ }
+
+ public String getContent() {
+ return content;
+ }
+
+ public String getMimeType() {
+ return mimeType;
+ }
+ }
+
+}