Author: gsingers
Date: Fri Nov 25 21:29:47 2011
New Revision: 1206335

URL: http://svn.apache.org/viewvc?rev=1206335&view=rev
Log:
MAHOUT-798: fix some edge cases around handling ids

Modified:
    
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java?rev=1206335&r1=1206334&r2=1206335&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
 Fri Nov 25 21:29:47 2011
@@ -43,6 +43,7 @@ public final class EmailUtility {
   private static final Pattern ADDRESS_CLEANUP = Pattern.compile("mailto:|<|>|\\[|\\]|\\=20");
   private static final Pattern ANGLE_BRACES = Pattern.compile("<|>");
   private static final Pattern SPACE_OR_CLOSE_ANGLE = Pattern.compile(">|\\s+");
+  public static final Pattern WHITESPACE = Pattern.compile("\\s*");
 
   private EmailUtility() {
   }
@@ -97,4 +98,8 @@ public final class EmailUtility {
     }
     return splits;
   }
+
+  public enum Counters {
+    NO_MESSAGE_ID, NO_FROM_ADDRESS
+  }
 }

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java?rev=1206335&r1=1206334&r2=1206335&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/FromEmailToDictionaryMapper.java
 Fri Nov 25 21:29:47 2011
@@ -48,7 +48,13 @@ public final class FromEmailToDictionary
       //TODO: is there more to clean up here?
       full = EmailUtility.cleanUpEmailAddress(full);
 
-      context.write(new Text(full), new VarIntWritable(1));
+      if (EmailUtility.WHITESPACE.matcher(full).matches() == false) {
+        context.write(new Text(full), new VarIntWritable(1));
+      } else {
+        context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
+      }
+    } else {
+      context.getCounter(EmailUtility.Counters.NO_FROM_ADDRESS).increment(1);
     }
 
   }

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java?rev=1206335&r1=1206334&r2=1206335&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MsgIdToDictionaryMapper.java
 Fri Nov 25 21:29:47 2011
@@ -28,20 +28,22 @@ import java.io.IOException;
  */
 public final class MsgIdToDictionaryMapper extends Mapper<Text, Text, Text, VarIntWritable> {
 
-  public enum Counters {
-    NO_MESSAGE_ID
-  }
-
   @Override
   protected void map(Text key, Text value, Context context) throws IOException, InterruptedException {
     //message id is in the key: /201008/[email protected]
     String keyStr = key.toString();
-    int idx = keyStr.lastIndexOf('/');
+    int idx = keyStr.lastIndexOf('@');//find the last @
     if (idx == -1) {
-      context.getCounter(Counters.NO_MESSAGE_ID).increment(1);
+      context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
     } else {
+      //found the @, now find the last slash before the @ and grab everything after that
+      idx = keyStr.lastIndexOf('/', idx);
       String msgId = keyStr.substring(idx + 1);
-      context.write(new Text(msgId), new VarIntWritable(1));
+      if (EmailUtility.WHITESPACE.matcher(msgId).matches() == false) {
+        context.write(new Text(msgId), new VarIntWritable(1));
+      } else {
+        context.getCounter(EmailUtility.Counters.NO_MESSAGE_ID).increment(1);
+      }
     }
   }
 }

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java?rev=1206335&r1=1206334&r2=1206335&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
 Fri Nov 25 21:29:47 2011
@@ -39,7 +39,7 @@ public class MailProcessor {
   public static final Pattern SUBJECT_PREFIX =
           Pattern.compile("^subject: (.*)$", Pattern.CASE_INSENSITIVE);
   public static final Pattern FROM_PREFIX =
-                  Pattern.compile("^from: (.*)$", Pattern.CASE_INSENSITIVE);
+                  Pattern.compile("^from: (\\S.*)$", Pattern.CASE_INSENSITIVE);//we need to have at least one character
   public static final Pattern REFS_PREFIX =
                           Pattern.compile("^references: (.*)$", Pattern.CASE_INSENSITIVE);
   public static final Pattern TO_PREFIX =


Reply via email to