Author: gsingers
Date: Wed Nov 23 02:47:28 2011
New Revision: 1205271
URL: http://svn.apache.org/viewvc?rev=1205271&view=rev
Log:
MAHOUT-798: minor bug fixes to the recommendation example to remove duplicates and
properly handle missing dictionary hits
Added:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java?rev=1205271&r1=1205270&r2=1205271&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
Wed Nov 23 02:47:28 2011
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
@@ -84,6 +85,8 @@ public final class MailToPrefsDriver ext
addOption("separator", "sep", "The separator used in the input file to
separate to, from, subject. Default is \\n", "\n");
addOption("from", "f", "The position in the input text (value) where the
from email is located, starting from zero (0).", "0");
addOption("refs", "r", "The position in the input text (value) where the
reference ids are located, starting from zero (0).", "1");
+ addOption(buildOption("useCounts", "u", "If set, then use the number of
times the user has interacted with a thread as an indication of their
preference. Otherwise, use boolean preferences.",
+ false, false, "true"));
Map<String, String> parsedArgs = parseArguments(args);
Path input = getInputPath();
@@ -95,7 +98,7 @@ public final class MailToPrefsDriver ext
setConf(new Configuration());
conf = getConf();
}
-
+ boolean useCounts = hasOption("--useCounts");
AtomicInteger currentPhase = new AtomicInteger();
int[] msgDim = new int[1];
//TODO: mod this to not do so many passes over the data. Dictionary
creation could probably be a chain mapper
@@ -163,6 +166,7 @@ public final class MailToPrefsDriver ext
conf.set(EmailUtility.FROM_INDEX, parsedArgs.get("--from"));
conf.set(EmailUtility.REFS_INDEX, parsedArgs.get("--refs"));
conf.set(EmailUtility.SEPARATOR, separator);
+ conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE,
String.valueOf(useCounts));
int j = 0;
int i = 0;
for (Path fromChunk : fromChunks) {
@@ -170,7 +174,7 @@ public final class MailToPrefsDriver ext
Path out = new Path(vecPath, "tmp-" + i + '-' + j);
DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(),
idChunk.toUri()}, conf);
Job createRecMatrix = prepareJob(input, out,
SequenceFileInputFormat.class,
- MailToRecMapper.class, NullWritable.class, Text.class,
+ MailToRecMapper.class, Text.class, LongWritable.class,
MailToRecReducer.class, Text.class, NullWritable.class,
TextOutputFormat.class);
createRecMatrix.getConfiguration().set("mapred.output.compress",
"false");
createRecMatrix.waitForCompletion(true);
@@ -220,7 +224,7 @@ public final class MailToPrefsDriver ext
try {
long currentChunkSize = 0;
Path filesPattern = new Path(inputPath, OUTPUT_FILES_PATTERN);
- int i = 0;
+ int i = 1;//start at 1, since a miss in the OpenObjectIntHashMap returns
a 0
for (Pair<Writable, Writable> record
: new SequenceFileDirIterable<Writable, Writable>(filesPattern,
PathType.GLOB, null, null, true, conf)) {
if (currentChunkSize > chunkSizeLimit) {
Modified:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java?rev=1205271&r1=1205270&r2=1205271&view=diff
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
(original)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
Wed Nov 23 02:47:28 2011
@@ -19,6 +19,7 @@ package org.apache.mahout.cf.taste.examp
import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
@@ -28,7 +29,7 @@ import org.slf4j.LoggerFactory;
import java.io.IOException;
-public final class MailToRecMapper extends Mapper<Text, Text, NullWritable,
Text> {
+public final class MailToRecMapper extends Mapper<Text, Text, Text,
LongWritable> {
private static final Logger log =
LoggerFactory.getLogger(MailToRecMapper.class);
@@ -92,7 +93,7 @@ public final class MailToRecMapper exten
}
if (msgIdKey != Integer.MIN_VALUE && fromKey != Integer.MIN_VALUE) {
- context.write(null, new Text(fromKey + "," + msgIdKey + ",1"));
+ context.write(new Text(fromKey + "," + msgIdKey), new LongWritable(1));
}
}
Added:
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java?rev=1205271&view=auto
==============================================================================
---
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
(added)
+++
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
Wed Nov 23 02:47:28 2011
@@ -0,0 +1,40 @@
+package org.apache.mahout.cf.taste.example.email;
+
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+/**
+ * Reducer that writes one text line per distinct input key, so repeated records for the same key collapse into a single output line.
+ * Depending on the USE_COUNTS_PREFERENCE setting, the line is either the key alone or the key followed by "," and the number of values received for it.
+ **/
+public class MailToRecReducer extends Reducer<Text, LongWritable, Text,
NullWritable>{
+ //if true, append the per-key record count to the output line; if false, write the key only
+ private boolean useCounts = true;
+ /**
+ * Configuration key read in setup(): true (the default) appends the number of records seen for a key, false writes the bare key.
NOTE(review): the key string is "useBooleanPreferences" although a true value selects counts, so the name reads inverted; confirm before renaming.
+ */
+ public static final String USE_COUNTS_PREFERENCE = "useBooleanPreferences";
+
+ @Override
+ protected void setup(Context context) throws IOException,
InterruptedException {
+ useCounts = context.getConfiguration().getBoolean(USE_COUNTS_PREFERENCE,
true);
+ }
+
+ @Override
+ protected void reduce(Text key, Iterable<LongWritable> values, Context
context) throws IOException, InterruptedException {
+ if (useCounts == false){ // boolean preferences: the key alone is the output line
+ context.write(new Text(key.toString()), null);
+ } else {
+ long sum = 0;
+ for (LongWritable value : values) {
+ sum++; // counts records rather than adding value.get(); same result while every incoming value is 1, as MailToRecMapper emits in this commit
+ }
+ context.write(new Text(key.toString() + "," + sum), null);
+ }
+ }
+}
Modified:
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java?rev=1205271&r1=1205270&r2=1205271&view=diff
==============================================================================
---
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
(original)
+++
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
Wed Nov 23 02:47:28 2011
@@ -149,7 +149,11 @@ public class MailProcessor {
private static void writeContent(String separator, StringBuilder contents,
CharSequence body, String[] matches) {
for (String match : matches) {
- contents.append(match).append(separator);
+ if (match != null) {
+ contents.append(match).append(separator);
+ } else {
+ contents.append("").append(separator);
+ }
}
contents.append('\n').append(body);
}