Author: ssc
Date: Tue Mar 22 21:48:22 2011
New Revision: 1084367
URL: http://svn.apache.org/viewvc?rev=1084367&view=rev
Log:
MAHOUT-628 Add an option to prune away users with fewer than a given number of
preferences to ItemSimilarityJob and RecommenderJob
Added:
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
- copied, changed from r1084304,
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Removed:
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/RecommenderJob.java
Tue Mar 22 21:48:22 2011
@@ -100,6 +100,7 @@ public final class RecommenderJob extend
private static final int DEFAULT_MAX_SIMILARITIES_PER_ITEM = 100;
private static final int DEFAULT_MAX_COOCCURRENCES_PER_ITEM = 100;
+ private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
@Override
public int run(String[] args) throws IOException, ClassNotFoundException,
InterruptedException {
@@ -116,6 +117,8 @@ public final class RecommenderJob extend
addOption("maxPrefsPerUser", "mp",
"Maximum number of preferences considered per user in final
recommendation phase",
String.valueOf(UserVectorSplitterMapper.DEFAULT_MAX_PREFS_PER_USER_CONSIDERED));
+ addOption("minPrefsPerUser", "mp", "ignore users with less preferences
than this in the similarity computation "
+ + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
addOption("maxSimilaritiesPerItem", "m", "Maximum number of similarities
considered per item ",
String.valueOf(DEFAULT_MAX_SIMILARITIES_PER_ITEM));
addOption("maxCooccurrencesPerItem", "mo", "try to cap the number of
cooccurrences per item to this "
@@ -139,6 +142,7 @@ public final class RecommenderJob extend
String filterFile = parsedArgs.get("--filterFile");
boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
int maxPrefsPerUser =
Integer.parseInt(parsedArgs.get("--maxPrefsPerUser"));
+ int minPrefsPerUser =
Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
int maxSimilaritiesPerItem =
Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
int maxCooccurrencesPerItem =
Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
String similarityClassname = parsedArgs.get("--similarityClassname");
@@ -172,13 +176,14 @@ public final class RecommenderJob extend
ToUserVectorReducer.class, VarLongWritable.class, VectorWritable.class,
SequenceFileOutputFormat.class);
toUserVector.getConfiguration().setBoolean(BOOLEAN_DATA, booleanData);
+
toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER,
minPrefsPerUser);
toUserVector.waitForCompletion(true);
}
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
- Job countUsers = prepareJob(inputPath,
+ Job countUsers = prepareJob(userVectorPath,
countUsersPath,
- TextInputFormat.class,
+ SequenceFileInputFormat.class,
CountUsersMapper.class,
CountUsersKeyWritable.class,
VarLongWritable.class,
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducer.java
Tue Mar 22 21:48:22 2011
@@ -46,7 +46,17 @@ import org.apache.mahout.math.VectorWrit
*/
public final class ToUserVectorReducer extends
Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable> {
-
+
+ public static final String MIN_PREFERENCES_PER_USER =
ToUserVectorReducer.class.getName() +
+ ".minPreferencesPerUser";
+ private int minPreferences;
+
+ @Override
+ protected void setup(Context ctx) throws IOException, InterruptedException { // reads the pruning threshold from the job configuration
+ super.setup(ctx);
+ minPreferences = ctx.getConfiguration().getInt(MIN_PREFERENCES_PER_USER,
1); // falls back to 1 when MIN_PREFERENCES_PER_USER is not set in the configuration
+ }
+
@Override
protected void reduce(VarLongWritable userID,
Iterable<VarLongWritable> itemPrefs,
@@ -58,9 +68,11 @@ public final class ToUserVectorReducer e
userVector.set(index, value);
}
- VectorWritable vw = new VectorWritable(userVector);
- vw.setWritesLaxPrecision(true);
- context.write(userID, vw);
+ if (userVector.getNumNondefaultElements() >= minPreferences) {
+ VectorWritable vw = new VectorWritable(userVector);
+ vw.setWritesLaxPrecision(true);
+ context.write(userID, vw);
+ }
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/CountUsersMapper.java
Tue Mar 22 21:48:22 2011
@@ -24,21 +24,19 @@ import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.VectorWritable;
/**
* Maps out the userIDs in a way that we can use a secondary sort on them
*/
public class CountUsersMapper extends
- Mapper<LongWritable,Text,CountUsersKeyWritable, VarLongWritable> {
+
Mapper<VarLongWritable,VectorWritable,CountUsersKeyWritable,VarLongWritable> {
@Override
- protected void map(LongWritable key,
- Text value,
+ protected void map(VarLongWritable key,
+ VectorWritable value,
Context context) throws IOException, InterruptedException
{
-
- String[] tokens = TasteHadoopUtils.splitPrefTokens(value.toString());
- long userID = Long.parseLong(tokens[0]);
-
+ long userID = key.get();
context.write(new CountUsersKeyWritable(userID), new
VarLongWritable(userID));
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java?rev=1084367&r1=1084366&r2=1084367&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJob.java
Tue Mar 22 21:48:22 2011
@@ -87,6 +87,7 @@ public final class ItemSimilarityJob ext
private static final int DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM = 100;
private static final int DEFAULT_MAX_COOCCURRENCES_PER_ITEM = 100;
+ private static final int DEFAULT_MIN_PREFS_PER_USER = 1;
public static void main(String[] args) throws Exception {
ToolRunner.run(new ItemSimilarityJob(), args);
@@ -100,9 +101,13 @@ public final class ItemSimilarityJob ext
addOption("similarityClassname", "s", "Name of distributed similarity
class to instantiate, alternatively use "
+ "one of the predefined similarities (" +
SimilarityType.listEnumNames() + ')');
addOption("maxSimilaritiesPerItem", "m", "try to cap the number of similar
items per item to this number "
- + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
+ + "(default: " + DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM + ')',
+ String.valueOf(DEFAULT_MAX_SIMILAR_ITEMS_PER_ITEM));
addOption("maxCooccurrencesPerItem", "mo", "try to cap the number of
cooccurrences per item to this number "
- + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
+ + "(default: " + DEFAULT_MAX_COOCCURRENCES_PER_ITEM + ')',
+ String.valueOf(DEFAULT_MAX_COOCCURRENCES_PER_ITEM));
+ addOption("minPrefsPerUser", "mp", "ignore users with less preferences
than this "
+ + "(default: " + DEFAULT_MIN_PREFS_PER_USER + ')',
String.valueOf(DEFAULT_MIN_PREFS_PER_USER));
addOption("booleanData", "b", "Treat input as without pref values",
Boolean.FALSE.toString());
Map<String,String> parsedArgs = parseArguments(args);
@@ -113,6 +118,7 @@ public final class ItemSimilarityJob ext
String similarityClassName = parsedArgs.get("--similarityClassname");
int maxSimilarItemsPerItem =
Integer.parseInt(parsedArgs.get("--maxSimilaritiesPerItem"));
int maxCooccurrencesPerItem =
Integer.parseInt(parsedArgs.get("--maxCooccurrencesPerItem"));
+ int minPrefsPerUser =
Integer.parseInt(parsedArgs.get("--minPrefsPerUser"));
boolean booleanData = Boolean.valueOf(parsedArgs.get("--booleanData"));
Path inputPath = getInputPath();
@@ -137,21 +143,6 @@ public final class ItemSimilarityJob ext
itemIDIndex.waitForCompletion(true);
}
- if (shouldRunNextPhase(parsedArgs, currentPhase)) {
- Job countUsers = prepareJob(inputPath,
- countUsersPath,
- TextInputFormat.class,
- CountUsersMapper.class,
- CountUsersKeyWritable.class,
- VarLongWritable.class,
- CountUsersReducer.class,
- VarIntWritable.class,
- NullWritable.class,
- TextOutputFormat.class);
-
countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
-
countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
- countUsers.waitForCompletion(true);
- }
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
Job toUserVector = prepareJob(inputPath,
@@ -165,10 +156,27 @@ public final class ItemSimilarityJob ext
VectorWritable.class,
SequenceFileOutputFormat.class);
toUserVector.getConfiguration().setBoolean(RecommenderJob.BOOLEAN_DATA,
booleanData);
+
toUserVector.getConfiguration().setInt(ToUserVectorReducer.MIN_PREFERENCES_PER_USER,
minPrefsPerUser);
toUserVector.waitForCompletion(true);
}
if (shouldRunNextPhase(parsedArgs, currentPhase)) {
+ Job countUsers = prepareJob(userVectorPath,
+ countUsersPath,
+ SequenceFileInputFormat.class,
+ CountUsersMapper.class,
+ CountUsersKeyWritable.class,
+ VarLongWritable.class,
+ CountUsersReducer.class,
+ VarIntWritable.class,
+ NullWritable.class,
+ TextOutputFormat.class);
+
countUsers.setPartitionerClass(CountUsersKeyWritable.CountUsersPartitioner.class);
+
countUsers.setGroupingComparatorClass(CountUsersKeyWritable.CountUsersGroupComparator.class);
+ countUsers.waitForCompletion(true);
+ }
+
+ if (shouldRunNextPhase(parsedArgs, currentPhase)) {
Job maybePruneAndTransponse = prepareJob(userVectorPath,
itemUserMatrixPath,
SequenceFileInputFormat.class,
Added:
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java?rev=1084367&view=auto
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
(added)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/item/ToUserVectorReducerTest.java
Tue Mar 22 21:48:22 2011
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.mahout.cf.taste.hadoop.item;
+
+import org.apache.hadoop.mapreduce.Reducer;
+import org.apache.mahout.cf.taste.hadoop.TasteHadoopUtils;
+import org.apache.mahout.cf.taste.impl.TasteTestCase;
+import org.apache.mahout.math.VarLongWritable;
+import org.apache.mahout.math.VectorWritable;
+import org.apache.mahout.math.hadoop.MathHelper;
+import org.easymock.classextension.EasyMock;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+/**
+ * Tests {@link org.apache.mahout.cf.taste.hadoop.item.ToUserVectorReducer}.
+ */
+public class ToUserVectorReducerTest extends TasteTestCase {
+
+ @Test
+ public void testToUsersReducerMinPreferencesUserIgnored() throws Exception { // one preference against a threshold of 2: no output expected
+
Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context
context =
+ EasyMock.createMock(Reducer.Context.class); // no expectations are recorded on this mock before replay
+
+ ToUserVectorReducer reducer = new ToUserVectorReducer();
+ setField(reducer, "minPreferences", 2); // presumably sets the private field directly instead of calling setup() - confirm in the test base class
+
+ EasyMock.replay(context);
+
+ reducer.reduce(new VarLongWritable(123), Arrays.asList(new
VarLongWritable(456)), context); // user 123 arrives with a single item (456)
+
+ EasyMock.verify(context); // nothing was recorded, so no write to the context is expected
+ }
+
+ @Test
+ public void testToUsersReducerMinPreferencesUserPasses() throws Exception { // two preferences against a threshold of 2: one write expected
+
Reducer<VarLongWritable,VarLongWritable,VarLongWritable,VectorWritable>.Context
context =
+ EasyMock.createMock(Reducer.Context.class);
+
+ ToUserVectorReducer reducer = new ToUserVectorReducer();
+ setField(reducer, "minPreferences", 2); // same threshold as in the "ignored" test
+
+ context.write(EasyMock.eq(new VarLongWritable(123)),
MathHelper.vectorMatches(
+ MathHelper.elem(TasteHadoopUtils.idToIndex(456L), 1.0),
MathHelper.elem(TasteHadoopUtils.idToIndex(789L), 1.0))); // recorded expectation: user 123 with value 1.0 at the indexes of items 456 and 789
+
+ EasyMock.replay(context);
+
+ reducer.reduce(new VarLongWritable(123), Arrays.asList(new
VarLongWritable(456), new VarLongWritable(789)), context); // user 123 arrives with two items
+
+ EasyMock.verify(context); // checks that the recorded write actually happened
+ }
+
+}
Copied:
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
(from r1084304,
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java)
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java?p2=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java&p1=mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java&r1=1084304&r2=1084367&rev=1084367&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/hadoop/similarity/item/ItemSimilarityJobTest.java
Tue Mar 22 21:48:22 2011
@@ -54,22 +54,22 @@ import org.junit.Test;
* Unit tests for the mappers and reducers in
org.apache.mahout.cf.taste.hadoop.similarity.item
* some integration tests with tiny data sets at the end
*/
-public final class ItemSimilarityTest extends TasteTestCase {
+public final class ItemSimilarityJobTest extends TasteTestCase {
/**
* Tests {@link CountUsersMapper}
*/
@Test
public void testCountUsersMapper() throws Exception {
- Mapper<LongWritable,Text,CountUsersKeyWritable,VarLongWritable>.Context
context =
+
Mapper<VarLongWritable,VectorWritable,CountUsersKeyWritable,VarLongWritable>.Context
context =
EasyMock.createMock(Mapper.Context.class);
context.write(keyForUserID(12L), EasyMock.eq(new VarLongWritable(12L)));
context.write(keyForUserID(35L), EasyMock.eq(new VarLongWritable(35L)));
EasyMock.replay(context);
CountUsersMapper mapper = new CountUsersMapper();
- mapper.map(null, new Text("12,100,1.3"), context);
- mapper.map(null, new Text("35,100,3.0"), context);
+ mapper.map(new VarLongWritable(12), new VectorWritable(), context);
+ mapper.map(new VarLongWritable(35), new VectorWritable(), context);
EasyMock.verify(context);
}