Author: ssc
Date: Tue Mar 22 21:48:22 2011
New Revision: 1084367

URL: http://svn.apache.org/viewvc?rev=1084367&view=rev
Log:
MAHOUT-628 Add an option to ItemSimilarityJob and RecommenderJob to prune away 
users with fewer than a given number of preferences

Added:
    
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
    
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
      - copied, changed from r1084304, 
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Removed:
    
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
 Tue Mar 22 21:48:22 2011
@@ -100,6 +100,7 @@ public final class RecommenderJob extend
   
   private static final int DEFAULT_MAX_SIMILARITIES_PER_ITEM = 100;
   private static final int DEFAULT_MAX_COOCCURRENCES_PER_ITEM = 100;
+  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
 
   @Override
   public int run(String[] args) throws IOException, ClassNotFoundException, 
InterruptedException {
@@ -116,6 +117,8 @@ public final class RecommenderJob extend
     addOption("maxPrefsPerUser", "mp",
         "Maximum number of preferences considered per user in final 
recommendation phase",
         
String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
+    addOption("minPrefsPerUser", "mp", "ignore users with less preferences 
than this in the similarity computation "
+        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', 
String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
     addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities 
considered per item ",
         String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
     addOption("maxCooccurrencesPerItem", "mo", "try to cap the number of 
cooccurrences per item to this "
@@ -139,6 +142,7 @@ public final class RecommenderJob extend
     String filterFile = parsedArgs.get("--filterFile");
     boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
     int maxPrefsPerUser = 
Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
+    int minPrefsPerUser = 
Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
     int maxSimilaritiesPerItem = 
Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
     int maxCooccurrencesPerItem = 
Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
     String similarityClassname = parsedArgs.get("--similarityClassname");
@@ -172,13 +176,14 @@ public final class RecommenderJob extend
         ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
         SequenceFileOutputFormat.class);
       toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
+      
toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER,
 minPrefsPerUser);
       toUserVector.waitForCompletion(true);
     }
 
     if (shouldRunNextPhase(parsedArgs, currentPhase)) {
-      Job countUsers = prepareJob(inputPath,
+      Job countUsers = prepareJob(userVectorPath,
                                   countUsersPath,
-                                  TextInputFormat.class,
+                                  SequenceFileInputFormat.class,
                                   CountUsersMapper.class,
                                   CountUsersKeyWritable.class,
                                   VarLongWritable.class,

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
 Tue Mar 22 21:48:22 2011
@@ -46,7 +46,17 @@ import org.apache.mahout.math.VectorWrit
  */
 public final class ToUserVectorReducer extends
     Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable> {
-  
+
+  public static final String MIN_PREFERENCES_PER_USER = 
ToUserVectorReducer.class.getName() +
+      ".minPreferencesPerUser";
+  private int minPreferences;
+
+  @Override
+  protected void setup(Context ctx) throws IOException, InterruptedException {
+    super.setup(ctx);
+    minPreferences = ctx.getConfiguration().getInt(MIN_PREFERENCES_PER_USER, 
1);
+  }
+
   @Override
   protected void reduce(VarLongWritable userID,
                         Iterable<VarLongWritable> itemPrefs,
@@ -58,9 +68,11 @@ public final class ToUserVectorReducer e
       userVector.set(index, value);
     }
 
-    VectorWritable vw = new VectorWritable(userVector);
-    vw.setWritesLaxPrecision(true);
-    context.write(userID, vw);
+    if (userVector.getNumNondefaultElements() >= minPreferences) {
+      VectorWritable vw = new VectorWritable(userVector);
+      vw.setWritesLaxPrecision(true);
+      context.write(userID, vw);
+    }
   }
   
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
 Tue Mar 22 21:48:22 2011
@@ -24,21 +24,19 @@ import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Mapper;
 import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
 import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.VectorWritable;
 
 /**
  * Maps out the userIDs in a way that we can use a secondary sort on them
  */
 public class CountUsersMapper extends
-    Mapper<LongWritable,Text,CountUsersKeyWritable, VarLongWritable> {
+    
Mapper<VarLongWritable,VectorWritable,CountUsersKeyWritable,VarLongWritable> {
 
   @Override
-  protected void map(LongWritable key,
-                     Text value,
+  protected void map(VarLongWritable key,
+                     VectorWritable value,
                      Context context) throws IOException, InterruptedException 
{
-
-    String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
-    long userID = Long.parseLong(tokens[0]);
-
+    long userID = key.get();
     context.write(new CountUsersKeyWritable(userID), new 
VarLongWritable(userID));
   }
 

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
 Tue Mar 22 21:48:22 2011
@@ -87,6 +87,7 @@ public final class ItemSimilarityJob ext
 
   private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
   private static final int DEFAULT_MAX_COOCCURRENCES_PER_ITEM = 100;
+  private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
 
   public static void main(String[] args) throws Exception {
     ToolRunner.run(new ItemSimilarityJob(), args);
@@ -100,9 +101,13 @@ public final class ItemSimilarityJob ext
     addOption("similarityClassname", "s", "Name of distributed similarity 
class to instantiate, alternatively use "
         + "one of the predefined similarities (" + 
SimilarityType.listEnumNames() + ')');
     addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar 
items per item to this number "
-        + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')', 
String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
+        + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
+        String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
     addOption("maxCooccurrencesPerItem", "mo", "try to cap the number of 
cooccurrences per item to this number "
-        + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')', 
String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
+        + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
+        String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
+    addOption("minPrefsPerUser", "mp", "ignore users with less preferences 
than this "
+        + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')', 
String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
     addOption("booleanData", "b", "Treat input as without pref values", 
Boolean.FALSE.toString());
 
     Map<String,String> parsedArgs = parseArguments(args);
@@ -113,6 +118,7 @@ public final class ItemSimilarityJob ext
     String similarityClassName = parsedArgs.get("--similarityClassname");
     int maxSimilarItemsPerItem = 
Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
     int maxCooccurrencesPerItem = 
Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
+    int minPrefsPerUser = 
Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
     boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
 
     Path inputPath = getInputPath();
@@ -137,21 +143,6 @@ public final class ItemSimilarityJob ext
       itemIDIndex.waitForCompletion(true);
     }
 
-    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
-      Job countUsers = prepareJob(inputPath,
-                                  countUsersPath,
-                                  TextInputFormat.class,
-                                  CountUsersMapper.class,
-                                  CountUsersKeyWritable.class,
-                                  VarLongWritable.class,
-                                  CountUsersReducer.class,
-                                  VarIntWritable.class,
-                                  NullWritable.class,
-                                  TextOutputFormat.class);
-      
countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
-      
countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
-      countUsers.waitForCompletion(true);
-    }
 
     if (shouldRunNextPhase(parsedArgs, currentPhase)) {
       Job toUserVector = prepareJob(inputPath,
@@ -165,10 +156,27 @@ public final class ItemSimilarityJob ext
                                   VectorWritable.class,
                                   SequenceFileOutputFormat.class);
       toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA, 
booleanData);
+      
toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER,
 minPrefsPerUser);
       toUserVector.waitForCompletion(true);
     }
 
     if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+      Job countUsers = prepareJob(userVectorPath,
+                                  countUsersPath,
+                                  SequenceFileInputFormat.class,
+                                  CountUsersMapper.class,
+                                  CountUsersKeyWritable.class,
+                                  VarLongWritable.class,
+                                  CountUsersReducer.class,
+                                  VarIntWritable.class,
+                                  NullWritable.class,
+                                  TextOutputFormat.class);
+      
countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
+      
countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
+      countUsers.waitForCompletion(true);
+    }
+
+    if (shouldRunNextPhase(parsedArgs, currentPhase)) {
       Job maybePruneAndTransponse = prepareJob(userVectorPath,
                                   itemUserMatrixPath,
                                   SequenceFileInputFormat.class,

Added: 
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java?rev=1084367&view=auto
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
 (added)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
 Tue Mar 22 21:48:22 2011
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.impl.TasteTestCase;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.MathHelper;
+import org.easymock.classextension.EasyMock;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+/**
+ * tests {@link org.apache.mahout.cf.taste.hadoop.item.ToUserVectorReducer}
+ */
+public class ToUserVectorReducerTest extends TasteTestCase {
+
+  @Test
+  public void testToUsersReducerMinPreferencesUserIgnored() throws Exception {
+    
Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context 
context =
+        EasyMock.createMock(Reducer.Context.class);
+
+    ToUserVectorReducer reducer = new ToUserVectorReducer();
+    setField(reducer, "minPreferences", 2);
+
+    EasyMock.replay(context);
+
+    reducer.reduce(new VarLongWritable(123), Arrays.asList(new 
VarLongWritable(456)), context);
+
+    EasyMock.verify(context);
+  }
+
+  @Test
+  public void testToUsersReducerMinPreferencesUserPasses() throws Exception {
+    
Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context 
context =
+        EasyMock.createMock(Reducer.Context.class);
+
+    ToUserVectorReducer reducer = new ToUserVectorReducer();
+    setField(reducer, "minPreferences", 2);
+
+    context.write(EasyMock.eq(new VarLongWritable(123)), 
MathHelper.vectorMatches(
+        MathHelper.elem(TasteHadoopUtils.idToIndex(456L), 1.0), 
MathHelper.elem(TasteHadoopUtils.idToIndex(789L), 1.0)));
+
+    EasyMock.replay(context);
+
+    reducer.reduce(new VarLongWritable(123), Arrays.asList(new 
VarLongWritable(456), new VarLongWritable(789)), context);
+
+    EasyMock.verify(context);
+  }
+
+}

Copied: 
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
 (from r1084304, 
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java)
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java?p2=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java&p1=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java&r1=1084304&r2=1084367&rev=1084367&view=diff
==============================================================================
--- 
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
 (original)
+++ 
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
 Tue Mar 22 21:48:22 2011
@@ -54,22 +54,22 @@ import org.junit.Test;
  * Unit tests for the mappers and reducers in 
org.apache.mahout.cf.taste.hadoop.similarity.item
  * some integration tests with tiny data sets at the end
  */
-public final class ItemSimilarityTest extends TasteTestCase {
+public final class ItemSimilarityJobTest extends TasteTestCase {
 
   /**
    * Tests {@link CountUsersMapper}
    */
   @Test
   public void testCountUsersMapper() throws Exception {
-    Mapper<LongWritable,Text,CountUsersKeyWritable,VarLongWritable>.Context 
context =
+    
Mapper<VarLongWritable,VectorWritable,CountUsersKeyWritable,VarLongWritable>.Context
 context =
         EasyMock.createMock(Mapper.Context.class);
     context.write(keyForUserID(12L), EasyMock.eq(new VarLongWritable(12L)));
     context.write(keyForUserID(35L), EasyMock.eq(new VarLongWritable(35L)));
     EasyMock.replay(context);
 
     CountUsersMapper mapper = new CountUsersMapper();
-    mapper.map(null, new Text("12,100,1.3"), context);
-    mapper.map(null, new Text("35,100,3.0"), context);
+    mapper.map(new VarLongWritable(12), new VectorWritable(), context);
+    mapper.map(new VarLongWritable(35), new VectorWritable(), context);
 
     EasyMock.verify(context);
   }


Reply via email to