Author: gsingers
Date: Fri Nov 25 21:29:47 2011
New Revision: 1206335
URL: http://svn.apache.org/viewvc?rev=1206335&view=rev
Log:
MAHOUT-798: fix some edge cases around handling ids
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java?rev=1206335&r1=1206334&r2=1206335&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
Fri Nov 25 21:29:47 2011
@@ -43,6 +43,7 @@ public final class EmailUtility {
private static final Pattern ADDRESS_CLEANUP =
Pattern.compile("mailto:|<|>|\\[|\\]|\\=20");
private static final Pattern ANGLE_BRACES = Pattern.compile("<|>");
private static final Pattern SPACE_OR_CLOSE_ANGLE =
Pattern.compile(">|\\s+");
+ public static final Pattern WHITESPACE = Pattern.compile("\\s*");
private EmailUtility() {
}
@@ -97,4 +98,8 @@ public final class EmailUtility {
}
return splits;
}
+
+ public enum Counters {
+ NO_MESSAGE_ID, NO_FROM_ADDRESS
+ }
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java?rev=1206335&r1=1206334&r2=1206335&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
Fri Nov 25 21:29:47 2011
@@ -48,7 +48,13 @@ public final class FromEmailToDictionary
//TODO: is there more to clean up here?
full = EmailUtility.cleanUpEmailAddress(full);
- context.write(new Text(full), new VarIntWritable(1));
+ if (EmailUtility.WHITESPACE.matcher(full).matches() == false) {
+ context.write(new Text(full), new VarIntWritable(1));
+ } else {
+ context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
+ }
+ } else {
+ context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
}
}
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java?rev=1206335&r1=1206334&r2=1206335&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
Fri Nov 25 21:29:47 2011
@@ -28,20 +28,22 @@ import java.io.IOException;
*/
public final class MsgIdToDictionaryMapper extends Mapper<Text, Text, Text,
VarIntWritable> {
- public enum Counters {
- NO_MESSAGE_ID
- }
-
@Override
protected void map(Text key, Text value, Context context) throws
IOException, InterruptedException {
//message id is in the key: /201008/[email protected]
String keyStr = key.toString();
- int idx = keyStr.lastIndexOf('/');
+ int idx = keyStr.lastIndexOf('@');//find the last @
if (idx == -1) {
- context.getCounter(Counters.NO_MESSAGE_ID).increment(1);
+ context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
} else {
+ //found the @, now find the last slash before the @ and grab everything after that
+ idx = keyStr.lastIndexOf('/', idx);
String msgId = keyStr.substring(idx + 1);
- context.write(new Text(msgId), new VarIntWritable(1));
+ if (EmailUtility.WHITESPACE.matcher(msgId).matches() == false) {
+ context.write(new Text(msgId), new VarIntWritable(1));
+ } else {
+ context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
+ }
}
}
}
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java?rev=1206335&r1=1206334&r2=1206335&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
Fri Nov 25 21:29:47 2011
@@ -39,7 +39,7 @@ public class MailProcessor {
public static final Pattern SUBJECT_PREFIX =
Pattern.compile("^subject: (.*)$", Pattern.CASE_INSENSITIVE);
public static final Pattern FROM_PREFIX =
- Pattern.compile("^from: (.*)$", Pattern.CASE_INSENSITIVE);
+ Pattern.compile("^from: (\\S.*)$",
Pattern.CASE_INSENSITIVE);//we need to have at least one character
public static final Pattern REFS_PREFIX =
Pattern.compile("^references: (.*)$",
Pattern.CASE_INSENSITIVE);
public static final Pattern TO_PREFIX =