Author: kamaci
Date: Sat Apr 15 21:24:07 2017
New Revision: 1791548
URL: http://svn.apache.org/viewvc?rev=1791548&view=rev
Log:
Fix for CONNECTORS-1410.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1791548&r1=1791547&r2=1791548&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Sat Apr 15 21:24:07 2017
@@ -6,9 +6,12 @@ $Id$
======================= Release 2.7 =====================
+CONNECTORS-1410: Body is used as content at emails.
+(Furkan KAMACI)
+
CONNECTORS-1408: Insure that there's a non-null document name in
the Solr connector, otherwise there will be no multipart post.
-(Cihad Gozel, Karl Wright)
+(Cihad Guzel, Karl Wright)
CONNECTORS-1409: Fix re-processing email bug.
(Furkan KAMACI)
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java?rev=1791548&r1=1791547&r2=1791548&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
Sat Apr 15 21:24:07 2017
@@ -76,7 +76,7 @@ public class EmailConfig {
public static final String PROTOCOL_DEFAULT_VALUE = "IMAP";
public static final String PORT_DEFAULT_VALUE = "";
- public static final String[] BASIC_METADATA =
{"To","From","Subject","Body","Date","Encoding of Attachment",
+ public static final String[] BASIC_METADATA =
{"To","From","Subject","Date","Encoding of Attachment",
"MIME Type of attachment", "File Name of Attachment"};
public static final String BASIC_EXTRACT_EMAIL = "Use E-Mail Extractor";
public static final String[] BASIC_SEARCHABLE_ATTRIBUTES =
{"To","From","Subject","Body","Start Date", "End Date"};
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java?rev=1791548&r1=1791547&r2=1791548&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
Sat Apr 15 21:24:07 2017
@@ -33,9 +33,11 @@ import javax.mail.*;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
import javax.mail.search.*;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InterruptedIOException;
+import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
@@ -593,7 +595,7 @@ public class EmailConnector extends org.
rd.setMimeType(mimeType);
rd.setCreatedDate(sentDate);
rd.setModifiedDate(sentDate);
-
+
for (String metadata : requiredMetadata) {
if
(metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_TO)) {
Address[] to = msg.getRecipients(Message.RecipientType.TO);
@@ -616,25 +618,6 @@ public class EmailConnector extends org.
} else if
(metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_SUBJECT)) {
String subject = msg.getSubject();
rd.addField(EmailConfig.EMAIL_SUBJECT, subject);
- } else if
(metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_BODY)) {
- Object o = msg.getContent();
- if (o instanceof Multipart) {
- Multipart mp = (Multipart) msg.getContent();
- for (int k = 0, n = mp.getCount(); k < n; k++) {
- Part part = mp.getBodyPart(k);
- String disposition = part.getDisposition();
- if ((disposition == null)) {
- MimeBodyPart mbp = (MimeBodyPart) part;
- if (mbp.isMimeType(EmailConfig.MIMETYPE_TEXT_PLAIN)) {
- rd.addField(EmailConfig.EMAIL_BODY,
mbp.getContent().toString());
- } else if (mbp.isMimeType(EmailConfig.MIMETYPE_HTML)) {
- rd.addField(EmailConfig.EMAIL_BODY, mbp.getContent().toString()); //handle html accordingly. Returns content with html tags
- }
- }
- }
- } else if (o instanceof String) {
- rd.addField(EmailConfig.EMAIL_BODY, (String)o);
- }
} else if
(metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_DATE)) {
rd.addField(EmailConfig.EMAIL_DATE, sentDate.toString());
} else if
(metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_ATTACHMENT_ENCODING))
{
@@ -696,8 +679,10 @@ public class EmailConnector extends org.
}
}
}
-
- InputStream is = msg.getInputStream();
+
+ //Content includes both body and attachments,
+ //Body will be set as content and attachments will be indexed as separate documents.
+ InputStream is = new
ByteArrayInputStream(extractBodyContent(msg).getBytes(StandardCharsets.UTF_8));
try {
rd.setBinary(is, fileLength);
activities.ingestDocumentWithException(documentIdentifier,
version, msgURL, rd);
@@ -706,7 +691,7 @@ public class EmailConnector extends org.
} finally {
is.close();
}
-
+
// If we're supposed to deal with attachments, this is the time to queue them up
if (attachmentUrlTemplate != null) {
if (msg.getContent() != null && msg.getContent() instanceof
Multipart) {
@@ -932,16 +917,39 @@ public class EmailConnector extends org.
}
- /**
- * Checks whether a Part is an attachment or not
- * @param part Part to check
- * @return is attachment or not
- */
+ private String extractBodyContent(Message msg) throws MessagingException,
IOException {
+ String bodyContent = null;
+ Object o = msg.getContent();
+ if (o instanceof Multipart) {
+ Multipart mp = (Multipart) msg.getContent();
+ for (int k = 0, n = mp.getCount(); k < n; k++) {
+ Part part = mp.getBodyPart(k);
+ String disposition = part.getDisposition();
+ if ((disposition == null)) {
+ MimeBodyPart mbp = (MimeBodyPart) part;
+ if (mbp.isMimeType(EmailConfig.MIMETYPE_TEXT_PLAIN)) {
+ bodyContent = mbp.getContent().toString();
+ } else if (mbp.isMimeType(EmailConfig.MIMETYPE_HTML)) {
+ bodyContent = mbp.getContent().toString(); //handle html accordingly. Returns content with html tags
+ }
+ }
+ }
+ } else if (o instanceof String) {
+ bodyContent = (String)o;
+ }
+ return bodyContent;
+ }
+
+ /**
+ * Checks whether a Part is an attachment or not
+ * @param part Part to check
+ * @return is attachment or not
+ */
private boolean isAttachment(Part part) throws MessagingException {
- String disposition = part.getDisposition();
- return ((disposition != null)
- && ((disposition.toLowerCase(Locale.ROOT).equals(Part.ATTACHMENT)
- || (disposition.toLowerCase(Locale.ROOT).equals(Part.INLINE)))));
+ String disposition = part.getDisposition();
+ return ((disposition != null)
+ && ((disposition.toLowerCase(Locale.ROOT).equals(Part.ATTACHMENT)
+ || (disposition.toLowerCase(Locale.ROOT).equals(Part.INLINE)))));
}
/**