Author: kamaci
Date: Fri Apr 14 14:50:38 2017
New Revision: 1791374
URL: http://svn.apache.org/viewvc?rev=1791374&view=rev
Log:
Fix for CONNECTORS-1407.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties
manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html
manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html
Modified: manifoldcf/trunk/CHANGES.txt
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Fri Apr 14 14:50:38 2017
@@ -3,6 +3,9 @@ $Id$
======================= 2.7-dev =====================
+CONNECTORS-1407: Extract email addresses from email metadata fields.
+(Furkan KAMACI)
+
CONNECTORS-1406: Fix multiple To and From field bug at e-mail.
(Furkan KAMACI)
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConfig.java
Fri Apr 14 14:50:38 2017
@@ -78,12 +78,14 @@ public class EmailConfig {
public static final String PORT_DEFAULT_VALUE = "";
public static final String[] BASIC_METADATA =
{"To","From","Subject","Body","Date","Encoding of Attachment",
"MIME Type of attachment", "File Name of Attachment"};
+ public static final String BASIC_EXTRACT_EMAIL = "Use E-Mail Extractor";
public static final String[] BASIC_SEARCHABLE_ATTRIBUTES =
{"To","From","Subject","Body","Start Date", "End Date"};
// Specification nodes
public static final String NODE_PROPERTIES = "properties";
public static final String NODE_METADATA = "metadata";
+ public static final String NODE_EXTRACT_EMAIL = "extractemail";
public static final String NODE_FILTER = "filter";
public static final String NODE_FOLDER = "folder";
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/email/EmailConnector.java
Fri Apr 14 14:50:38 2017
@@ -31,6 +31,8 @@ import java.io.*;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.*;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
import javax.mail.*;
import javax.mail.internet.MimeBodyPart;
import javax.mail.internet.MimeMessage;
@@ -482,12 +484,16 @@ public class EmailConnector extends org.
throws ManifoldCFException, ServiceInterruption {
List<String> requiredMetadata = new ArrayList<String>();
+ boolean useEmailExtractor = false;
for (int i = 0; i < spec.getChildCount(); i++) {
SpecificationNode sn = spec.getChild(i);
if (sn.getType().equals(EmailConfig.NODE_METADATA)) {
String metadataAttribute =
sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME);
requiredMetadata.add(metadataAttribute);
}
+ if (sn.getType().equals(EmailConfig.NODE_EXTRACT_EMAIL)) {
+ useEmailExtractor = true;
+ }
}
// Keep a cached set of open folders
@@ -590,7 +596,7 @@ public class EmailConnector extends org.
String[] toStr = new String[to.length];
int j = 0;
for (Address address : to) {
- toStr[j] = address.toString();
+ toStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
j++;
}
rd.addField(EmailConfig.EMAIL_TO, toStr);
@@ -599,11 +605,10 @@ public class EmailConnector extends org.
String[] fromStr = new String[from.length];
int j = 0;
for (Address address : from) {
- fromStr[j] = address.toString();
+ fromStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
j++;
}
rd.addField(EmailConfig.EMAIL_FROM, fromStr);
-
} else if
(metadata.toLowerCase(Locale.ROOT).equals(EmailConfig.EMAIL_SUBJECT)) {
String subject = msg.getSubject();
rd.addField(EmailConfig.EMAIL_SUBJECT, subject);
@@ -850,7 +855,7 @@ public class EmailConnector extends org.
String[] toStr = new String[to.length];
int j = 0;
for (Address address : to) {
- toStr[j] = address.toString();
+ toStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
j++;
}
rd.addField(EmailConfig.EMAIL_TO, toStr);
@@ -859,7 +864,7 @@ public class EmailConnector extends org.
String[] fromStr = new String[from.length];
int j = 0;
for (Address address : from) {
- fromStr[j] = address.toString();
+ fromStr[j] = useEmailExtractor ? extractEmailAddress(address.toString()) : address.toString();
j++;
}
rd.addField(EmailConfig.EMAIL_FROM, fromStr);
@@ -930,6 +935,20 @@ public class EmailConnector extends org.
}
+ /**
+ * Extracts e-mail address within < and > characters if any.
+ * If not, returns passed raw mail address.
+ *
+ * @param rawEmailAddress e-mail address to be extracted
+ * @return Extracted e-mail address
+ */
+ private String extractEmailAddress(String rawEmailAddress) {
+ Pattern pattern = Pattern.compile("<(.+?@.+?)>");
+ Matcher matcher = pattern.matcher(rawEmailAddress);
+
+ return matcher.find() ? matcher.group(1) : rawEmailAddress;
+ }
+
//////////////////////////////End of Repository Connector
Methods///////////////////////////////////
@@ -1215,15 +1234,19 @@ public class EmailConnector extends org.
protected static void fillInMetadataTab(Map<String, Object> paramMap,
Specification ds) {
Set<String> metadataSelections = new HashSet<String>();
+ String extractEmailSelection = null;
int i = 0;
while (i < ds.getChildCount()) {
SpecificationNode sn = ds.getChild(i++);
if (sn.getType().equals(EmailConfig.NODE_METADATA)) {
String metadataName = sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME);
metadataSelections.add(metadataName);
+ } else if (sn.getType().equals(EmailConfig.NODE_EXTRACT_EMAIL)) {
+ extractEmailSelection =
sn.getAttributeValue(EmailConfig.ATTRIBUTE_NAME);
}
}
paramMap.put("METADATASELECTIONS", metadataSelections);
+ paramMap.put("EXTRACTEMAILSELECTION", extractEmailSelection);
}
/**
@@ -1232,6 +1255,9 @@ public class EmailConnector extends org.
protected void fillInMetadataAttributes(Map<String, Object> paramMap) {
String[] matchNames = EmailConfig.BASIC_METADATA;
paramMap.put("METADATAATTRIBUTES", matchNames);
+
+ String extractEmailAttribute = EmailConfig.BASIC_EXTRACT_EMAIL;
+ paramMap.put("EXTRACTEMAILATTRIBUTE", extractEmailAttribute);
}
protected void outputFilterTab(IHTTPOutput out, Locale locale,
@@ -1364,6 +1390,18 @@ public class EmailConnector extends org.
protected String processMetadataTab(IPostParameters variableContext,
Specification ds,
+ int connectionSequenceNumber)
+ throws ManifoldCFException {
+ String result = processMetadataAttributes(variableContext, ds,
connectionSequenceNumber);
+ if (result != null)
+ return result;
+
+ result = processExtractEmail(variableContext, ds,
connectionSequenceNumber);
+ return result;
+
+ }
+
+ protected String processMetadataAttributes(IPostParameters variableContext,
Specification ds,
int connectionSequenceNumber)
throws ManifoldCFException {
@@ -1385,6 +1423,30 @@ public class EmailConnector extends org.
return null;
}
+
+ protected String processExtractEmail(IPostParameters variableContext,
Specification ds,
+ int connectionSequenceNumber)
+ throws ManifoldCFException {
+
+ String seqPrefix = "s"+connectionSequenceNumber+"_";
+
+ // Remove old included extract email nodes
+ removeNodes(ds, EmailConfig.NODE_EXTRACT_EMAIL);
+
+ // Get the posted extract email value
+ String extractEmail = variableContext.getParameter(seqPrefix +
"extractemail");
+ if (extractEmail == null) {
+ return null;
+ }
+
+ // Gather the extract email parameter to be the last one
+ SpecificationNode sn = new
SpecificationNode(EmailConfig.NODE_EXTRACT_EMAIL);
+ sn.setAttribute(EmailConfig.ATTRIBUTE_NAME, extractEmail);
+ // Add the new extract email parameter
+ ds.addChild(ds.getChildCount(), sn);
+
+ return null;
+ }
/** View specification.
* This method is called in the body section of a job's view page. Its
purpose is to present the document
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_en_US.properties
Fri Apr 14 14:50:38 2017
@@ -47,6 +47,7 @@ EmailConnector.MetadataName=Metadata nam
EmailConnector.NoMetadataSpecified=No metadata specified
EmailConnector.SelectMetadataName=--Select metadata name --
EmailConnector.IncludedMetadataColon=Included metadata:
+EmailConnector.ExtractEmailColon=Fields to extract e-mail addresses from:
EmailConnector.AttachmentURLTemplateColon=Attachment URL template (blank if no
attachments desired):
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_es_ES.properties
Fri Apr 14 14:50:38 2017
@@ -47,6 +47,7 @@ EmailConnector.MetadataName=nombre de me
EmailConnector.NoMetadataSpecified=Sin metadatos especificada
EmailConnector.SelectMetadataName=--Seleccione el nombre de metadatos --
EmailConnector.IncludedMetadataColon=metadatos Incluido:
+EmailConnector.ExtractEmailColon=Fields to extract e-mail addresses from:
EmailConnector.AttachmentURLTemplateColon=Attachment URL template (blank if no
attachments desired):
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_ja_JP.properties
Fri Apr 14 14:50:38 2017
@@ -47,6 +47,7 @@ EmailConnector.MetadataName=Metadata nam
EmailConnector.NoMetadataSpecified=No metadata specified
EmailConnector.SelectMetadataName=--Select metadata name --
EmailConnector.IncludedMetadataColon=Included metadata:
+EmailConnector.ExtractEmailColon=Fields to extract e-mail addresses from:
EmailConnector.AttachmentURLTemplateColon=Attachment URL template (blank if no
attachments desired):
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/native2ascii/org/apache/manifoldcf/crawler/connectors/email/common_zh_CN.properties
Fri Apr 14 14:50:38 2017
@@ -47,6 +47,7 @@ EmailConnector.MetadataName=元数
EmailConnector.NoMetadataSpecified=元数据未指定
EmailConnector.SelectMetadataName=-- 选择元数据名 --
EmailConnector.IncludedMetadataColon=被包含的元数据:
+EmailConnector.ExtractEmailColon=Fields to extract e-mail addresses from:
EmailConnector.AttachmentURLTemplateColon=Attachment URL template (blank if no
attachments desired):
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/SpecificationView.html
Fri Apr 14 14:50:38 2017
@@ -70,4 +70,13 @@ limitations under the License.
</td>
</tr>
+
+ <tr>
+ <td
class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('EmailConnector.ExtractEmailColon'))</nobr></td>
+ <td class="value">
+ #if($EXTRACTEMAILSELECTION)
+ <nobr>$Encoder.bodyEscape($EXTRACTEMAILSELECTION)</nobr>
+ #end
+ </td>
+ </tr>
</table>
Modified:
manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html
URL:
http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html?rev=1791374&r1=1791373&r2=1791374&view=diff
==============================================================================
---
manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html
(original)
+++
manifoldcf/trunk/connectors/email/connector/src/main/resources/org/apache/manifoldcf/crawler/connectors/email/Specification_Metadata.html
Fri Apr 14 14:50:38 2017
@@ -31,7 +31,18 @@ limitations under the License.
<nobr>$Encoder.bodyEscape($metadataattribute)</nobr><br/>
#end
</td>
-
+ </tr>
+ <tr><td class="separator" colspan="2"><hr/></td></tr>
+ <tr>
+ <td
class="description"><nobr>$Encoder.bodyEscape($ResourceBundle.getString('EmailConnector.ExtractEmailColon'))</nobr></td>
+ <td class="value">
+ #if($EXTRACTEMAILSELECTION)
+ <input type="checkbox" name="s${SeqNum}_extractemail"
value="$Encoder.attributeEscape($EXTRACTEMAILATTRIBUTE)" checked="true"/>
+ #else
+ <input type="checkbox" name="s${SeqNum}_extractemail"
value="$Encoder.attributeEscape($EXTRACTEMAILATTRIBUTE)"/>
+ #end
+ <nobr>$Encoder.bodyEscape($EXTRACTEMAILATTRIBUTE)</nobr><br/>
+ </td>
</tr>
</table>
@@ -40,5 +51,7 @@ limitations under the License.
#foreach($metadataselection in $METADATASELECTIONS)
<input type="hidden" name="s${SeqNum}_metadata"
value="$Encoder.attributeEscape($metadataselection)"/>
#end
-
+ #if($EXTRACTEMAILSELECTION)
+<input type="hidden" name="s${SeqNum}_extractemail"
value="$Encoder.attributeEscape($EXTRACTEMAILSELECTION)"/>
+ #end
#end