Author: gsingers
Date: Fri Oct 14 14:58:05 2011
New Revision: 1183379
URL: http://svn.apache.org/viewvc?rev=1183379&view=rev
Log:
MAHOUT-798: fix recommender content extraction from email
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/EmailUtility.java
Fri Oct 14 14:58:05 2011
@@ -23,6 +23,8 @@ public final class EmailUtility {
public static final String MSG_IDS_PREFIX = "msgIdsPrefix";
public static final String FROM_PREFIX = "fromPrefix";
public static final String MSG_ID_DIMENSION = "msgIdDim";
+ public static final String FROM_INDEX = "fromIdx";
+ public static final String REFS_INDEX = "refsIdx";
private EmailUtility() {
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
Fri Oct 14 14:58:05 2011
@@ -57,7 +57,7 @@ import java.util.concurrent.atomic.Atomi
* file that can be consumed by the {@link org.apache.mahout.cf.taste.hadoop.pseudo.RecommenderJob}.
* <p/>
* This assumes the input is a Sequence File, that the key is: filename/message id and the value is a list (separated by the
- * user's choosing) of: from, to, subject
+ * user's choosing) containing the from email and any references
* <p/>
* The output is a matrix where either the from or to are the rows (represented as longs) and the columns are the message ids
* that the user has interacted with (as a VectorWritable). This class currently does not account for thread hijacking.
@@ -83,6 +83,8 @@ public class MailToPrefsDriver extends A
addOption(DefaultOptionCreator.overwriteOption().create());
addOption("chunkSize", "cs", "The size of chunks to write. Default is 100
mb", "100");
addOption("separator", "sep", "The separator used in the input file to
separate to, from, subject. Default is \\n", "\n");
+ addOption("from", "f", "The position in the input text (value) where the
from email is located, starting from zero (0).", "0");
+ addOption("refs", "r", "The position in the input text (value) where the
reference ids are located, starting from zero (0).", "1");
Map<String, String> parsedArgs = parseArguments(args);
Path input = getInputPath();
@@ -159,6 +161,8 @@ public class MailToPrefsDriver extends A
conf.set(EmailUtility.MSG_ID_DIMENSION, String.valueOf(msgDim[0]));
conf.set(EmailUtility.FROM_PREFIX, "fromIds-dictionary-");
conf.set(EmailUtility.MSG_IDS_PREFIX, "msgIds-dictionary-");
+ conf.set(EmailUtility.FROM_INDEX, parsedArgs.get("--from"));
+ conf.set(EmailUtility.REFS_INDEX, parsedArgs.get("--refs"));
conf.set(EmailUtility.SEPARATOR, separator);
for (Path fromChunk : fromChunks) {
for (Path idChunk : msgIdChunks) {
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
Fri Oct 14 14:58:05 2011
@@ -39,6 +39,8 @@ public class MailToRecMapper extends
private OpenObjectIntHashMap<String> fromDictionary = new
OpenObjectIntHashMap<String>();
private OpenObjectIntHashMap<String> msgIdDictionary = new
OpenObjectIntHashMap<String>();
private String separator = "\n";
+ protected int fromIdx;
+ protected int refsIdx;
public enum Counters {
REFERENCE, ORIGINAL
@@ -49,6 +51,8 @@ public class MailToRecMapper extends
Configuration conf = context.getConfiguration();
String fromPrefix = conf.get(EmailUtility.FROM_PREFIX);
String msgPrefix = conf.get(EmailUtility.MSG_IDS_PREFIX);
+ fromIdx = conf.getInt(EmailUtility.FROM_INDEX, 0);
+ refsIdx = conf.getInt(EmailUtility.REFS_INDEX, 1);
EmailUtility.loadDictionaries(conf, fromPrefix, fromDictionary, msgPrefix,
msgIdDictionary);
log.info("From Dictionary size: {} Msg Id Dictionary size: {}",
fromDictionary.size(), msgIdDictionary.size());
separator = context.getConfiguration().get(EmailUtility.SEPARATOR);
@@ -64,14 +68,15 @@ public class MailToRecMapper extends
int fromKey = Integer.MIN_VALUE;
String valStr = value.toString();
String[] splits =
StringUtils.splitByWholeSeparatorPreserveAllTokens(valStr, separator);
- //format is: from, to, refs, subject, body
if (splits != null && splits.length > 0) {
- String from = EmailUtility.cleanUpEmailAddress(splits[0]);
- fromKey = fromDictionary.get(from);
+ if (splits.length > refsIdx){
+ String from = EmailUtility.cleanUpEmailAddress(splits[fromIdx]);
+ fromKey = fromDictionary.get(from);
+ }
//get the references
- if (splits.length > 2) {
- String[] theRefs = EmailUtility.parseReferences(splits[2]);
+ if (splits.length > refsIdx) {
+ String[] theRefs = EmailUtility.parseReferences(splits[refsIdx]);
if (theRefs != null && theRefs.length > 0) {
//we have a reference, the first one is the original message id, so
map to that one if it exists
msgIdKey = msgIdDictionary.get(theRefs[0]);
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/text/SequenceFilesFromMailArchives.java
Fri Oct 14 14:58:05 2011
@@ -41,7 +41,9 @@ import java.io.FileFilter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.regex.Pattern;
/**
@@ -189,22 +191,30 @@ public final class SequenceFilesFromMail
options.chunkSize = chunkSize;
options.charset = charset;
- //If this order changes, must change FromEmailToDictionaryMapper,
potentially, as it expects From to be first
- List<Pattern> patterns = new ArrayList<Pattern>();
- //new Pattern[]{MailProcessor.FROM_PREFIX, MailProcessor.TO_PREFIX,
MailProcessor.REFS_PREFIX, MailProcessor.SUBJECT_PREFIX, };
+
+ List<Pattern> patterns = new ArrayList<Pattern>(5);
+ //patternOrder is used downstream so that we can know what order the text is in instead of encoding it in the string, which
+ //would require more processing later to remove it pre feature selection.
+ Map<String, Integer> patternOrder = new HashMap<String, Integer>();
+ int order = 0;
if (cmdLine.hasOption(fromOpt)) {
patterns.add(MailProcessor.FROM_PREFIX);
+ patternOrder.put(MailOptions.FROM, order++);
}
if (cmdLine.hasOption(toOpt)) {
patterns.add(MailProcessor.TO_PREFIX);
+ patternOrder.put(MailOptions.TO, order++);
}
if (cmdLine.hasOption(refsOpt)) {
patterns.add(MailProcessor.REFS_PREFIX);
+ patternOrder.put(MailOptions.REFS, order++);
}
if (cmdLine.hasOption(subjectOpt)) {
patterns.add(MailProcessor.SUBJECT_PREFIX);
+ patternOrder.put(MailOptions.SUBJECT, order++);
}
options.patternsToMatch = patterns.toArray(new Pattern[patterns.size()]);
+ options.patternOrder = patternOrder;
options.includeBody = cmdLine.hasOption(bodyOpt);
options.separator = "\n";
if (cmdLine.hasOption(separatorOpt)) {
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java?rev=1183379&r1=1183378&r2=1183379&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailOptions.java
Fri Oct 14 14:58:05 2011
@@ -21,6 +21,7 @@ package org.apache.mahout.utils.email;
import java.io.File;
import java.nio.charset.Charset;
+import java.util.Map;
import java.util.regex.Pattern;
/**
@@ -28,7 +29,10 @@ import java.util.regex.Pattern;
*
**/
public class MailOptions {
-
+ public static final String FROM = "FROM";
+ public static final String TO = "TO";
+ public static final String REFS = "REFS";
+ public static final String SUBJECT = "SUBJECT";
public File input;
public String outputDir;
public String prefix;
@@ -38,5 +42,6 @@ public class MailOptions {
public String bodySeparator = "\n";
public boolean includeBody;
public Pattern[] patternsToMatch;
-
+ //maps FROM, TO, REFS, SUBJECT, etc. to the order they appear in patternsToMatch. See MailToRecMapper
+ public Map<String, Integer> patternOrder;
}