Author: gsingers
Date: Wed Nov 23 02:47:28 2011
New Revision: 1205271

URL: http://svn.apache.org/viewvc?rev=1205271&view=rev
Log:
MAHOUT-798: minor bug fixes in the recommendation example: remove duplicate 
preferences and properly handle missing dictionary hits

Added:
    
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
Modified:
    
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
    
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
    
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java?rev=1205271&r1=1205270&r2=1205271&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToPrefsDriver.java
 Wed Nov 23 02:47:28 2011
@@ -26,6 +26,7 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FileUtil;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.IntWritable;
+import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
@@ -84,6 +85,8 @@ public final class MailToPrefsDriver ext
     addOption("separator", "sep", "The separator used in the input file to 
separate to, from, subject.  Default is \\n", "\n");
     addOption("from", "f", "The position in the input text (value) where the 
from email is located, starting from zero (0).", "0");
     addOption("refs", "r", "The position in the input text (value) where the 
reference ids are located, starting from zero (0).", "1");
+    addOption(buildOption("useCounts", "u", "If set, then use the number of 
times the user has interacted with a thread as an indication of their 
preference.  Otherwise, use boolean preferences.",
+            false, false, "true"));
     Map<String, String> parsedArgs = parseArguments(args);
 
     Path input = getInputPath();
@@ -95,7 +98,7 @@ public final class MailToPrefsDriver ext
       setConf(new Configuration());
       conf = getConf();
     }
-
+    boolean useCounts = hasOption("--useCounts");
     AtomicInteger currentPhase = new AtomicInteger();
     int[] msgDim = new int[1];
     //TODO: mod this to not do so many passes over the data.  Dictionary 
creation could probably be a chain mapper
@@ -163,6 +166,7 @@ public final class MailToPrefsDriver ext
       conf.set(EmailUtility.FROM_INDEX, parsedArgs.get("--from"));
       conf.set(EmailUtility.REFS_INDEX, parsedArgs.get("--refs"));
       conf.set(EmailUtility.SEPARATOR, separator);
+      conf.set(MailToRecReducer.USE_COUNTS_PREFERENCE, 
String.valueOf(useCounts));
       int j = 0;
       int i = 0;
       for (Path fromChunk : fromChunks) {
@@ -170,7 +174,7 @@ public final class MailToPrefsDriver ext
           Path out = new Path(vecPath, "tmp-" + i + '-' + j);
           DistributedCache.setCacheFiles(new URI[]{fromChunk.toUri(), 
idChunk.toUri()}, conf);
           Job createRecMatrix = prepareJob(input, out, 
SequenceFileInputFormat.class,
-                  MailToRecMapper.class, NullWritable.class, Text.class,
+                  MailToRecMapper.class, Text.class, LongWritable.class, 
MailToRecReducer.class, Text.class, NullWritable.class,
                   TextOutputFormat.class);
           createRecMatrix.getConfiguration().set("mapred.output.compress", 
"false");
           createRecMatrix.waitForCompletion(true);
@@ -220,7 +224,7 @@ public final class MailToPrefsDriver ext
     try {
       long currentChunkSize = 0;
       Path filesPattern = new Path(inputPath, OUTPUT_FILES_PATTERN);
-      int i = 0;
+      int i = 1;//start at 1, since a miss in the OpenObjectIntHashMap returns 
a 0
       for (Pair<Writable, Writable> record
               : new SequenceFileDirIterable<Writable, Writable>(filesPattern, 
PathType.GLOB, null, null, true, conf)) {
         if (currentChunkSize > chunkSizeLimit) {

Modified: 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java?rev=1205271&r1=1205270&r2=1205271&view=diff
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
 (original)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecMapper.java
 Wed Nov 23 02:47:28 2011
@@ -19,6 +19,7 @@ package org.apache.mahout.cf.taste.examp
 
 import org.apache.commons.lang.StringUtils;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.NullWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
@@ -28,7 +29,7 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 
-public final class MailToRecMapper extends Mapper<Text, Text, NullWritable, 
Text> {
+public final class MailToRecMapper extends Mapper<Text, Text, Text, 
LongWritable> {
 
   private static final Logger log = 
LoggerFactory.getLogger(MailToRecMapper.class);
 
@@ -92,7 +93,7 @@ public final class MailToRecMapper exten
     }
 
     if (msgIdKey != Integer.MIN_VALUE && fromKey != Integer.MIN_VALUE) {
-      context.write(null, new Text(fromKey + "," + msgIdKey + ",1"));
+      context.write(new Text(fromKey + "," + msgIdKey), new LongWritable(1));
     }
   }
 

Added: 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java?rev=1205271&view=auto
==============================================================================
--- 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
 (added)
+++ 
mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/email/MailToRecReducer.java
 Wed Nov 23 02:47:28 2011
@@ -0,0 +1,40 @@
+package org.apache.mahout.cf.taste.example.email;
+
+
+import org.apache.hadoop.io.LongWritable;
+import org.apache.hadoop.io.NullWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.mapreduce.Reducer;
+
+import java.io.IOException;
+
+/**
+ *
+ *
+ **/
+public class MailToRecReducer extends Reducer<Text, LongWritable, Text, 
NullWritable>{
+  //if true, then output weight
+  private boolean useCounts = true;
+  /**
+   * We can either ignore how many times the user interacted (boolean) or 
output the number of times they interacted.
+   */
+  public static final String USE_COUNTS_PREFERENCE = "useBooleanPreferences";
+
+  @Override
+  protected void setup(Context context) throws IOException, 
InterruptedException {
+    useCounts = context.getConfiguration().getBoolean(USE_COUNTS_PREFERENCE, 
true);
+  }
+
+  @Override
+  protected void reduce(Text key, Iterable<LongWritable> values, Context 
context) throws IOException, InterruptedException {
+    if (useCounts == false){
+      context.write(new Text(key.toString()), null);
+    } else {
+      long sum = 0;
+      for (LongWritable value : values) {
+        sum++;
+      }
+      context.write(new Text(key.toString() + "," + sum), null);
+    }
+  }
+}

Modified: 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java?rev=1205271&r1=1205270&r2=1205271&view=diff
==============================================================================
--- 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
 (original)
+++ 
mahout/trunk/integration/src/main/java/org/apache/mahout/utils/email/MailProcessor.java
 Wed Nov 23 02:47:28 2011
@@ -149,7 +149,11 @@ public class MailProcessor {
 
   private static void writeContent(String separator, StringBuilder contents, 
CharSequence body, String[] matches) {
     for (String match : matches) {
-      contents.append(match).append(separator);
+      if (match != null) {
+        contents.append(match).append(separator);
+      } else {
+        contents.append("").append(separator);
+      }
     }
     contents.append('\n').append(body);
   }


Reply via email to