Author: srowen
Date: Wed Dec 7 10:24:47 2011
New Revision: 1211377
URL: http://svn.apache.org/viewvc?rev=1211377&view=rev
Log:
MAHOUT-910 revamp sampling candidate strategy to expose sampling of items,
items' users, and those users' items
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategyTest.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java?rev=1211377&r1=1211376&r2=1211377&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategy.java
Wed Dec 7 10:24:47 2011
@@ -1,4 +1,4 @@
-/**
+/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -20,6 +20,9 @@ package org.apache.mahout.cf.taste.impl.
import com.google.common.base.Preconditions;
import org.apache.mahout.cf.taste.common.TasteException;
import org.apache.mahout.cf.taste.impl.common.FastIDSet;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveArrayIterator;
+import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator;
+import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator;
import org.apache.mahout.cf.taste.model.DataModel;
import org.apache.mahout.cf.taste.model.Preference;
import org.apache.mahout.cf.taste.model.PreferenceArray;
@@ -28,57 +31,89 @@ import org.apache.mahout.common.iterator
import java.util.Iterator;
/**
- * <p>returns all items that have not been rated by the user and that were
preferred by another user
- * that has preferred at least one item that the current user has preferred
too</p>
+ * <p>Returns all items that have not been rated by the user <em>(3)</em> and
that were preferred by another user
+ * <em>(2)</em> that has preferred at least one item <em>(1)</em> that the
current user has preferred too.</p>
+ *
+ * <p>This strategy uses sampling to limit the number of items that are
considered, by sampling three different
+ * things, noted above:</p>
*
- * <p>this strategy uses sampling in a way that only a certain amount of
preferences per item is considered
- * <pre>
+ * <ol>
+ * <li>The items</li>
+ * </ol>
+ *
+ * <p><pre>
* max(defaultMaxPrefsPerItemConsidered, userItemCountFactor *
log(max(N_users, N_items)))
* </pre></p>
+ *
+ * <p>This limit is applied in two ways. First, for each item that the current
user prefers, it limits
+ * the number of other users who preferred that item that are then considered.
Then for each of those users,
+ * it limits the number of their preferred items that are added to the list of
candidates.</p>
*/
public class SamplingCandidateItemsStrategy extends
AbstractCandidateItemsStrategy {
- private final int defaultMaxPrefsPerItemConsidered;
- private final int userItemCountMultiplier;
+ private static final int DEFAULT_FACTOR = 5;
+
+ private final int maxItems;
+ private final int maxUsersPerItem;
+ private final int maxItemsPerUser;
- /**
- * uses defaultMaxPrefsPerItemConsidered = 100 and userItemCountMultiplier =
20 as default values
- *
- * @see SamplingCandidateItemsStrategy#SamplingCandidateItemsStrategy(int,
int)
- */
- public SamplingCandidateItemsStrategy() {
- this(100, 20);
+ public SamplingCandidateItemsStrategy(int numUsers, int numItems) {
+ this(DEFAULT_FACTOR, DEFAULT_FACTOR, DEFAULT_FACTOR, numUsers, numItems);
}
- /**
- * <p>the maximum number of prefs considered per item will be computed like
this:
- * <pre>
- * max(defaultMaxPrefsPerItemConsidered, userItemCountFactor *
log(max(N_users, N_items)))
- * </pre>
- * </p>
- */
- public SamplingCandidateItemsStrategy(int defaultMaxPrefsPerItemConsidered,
int userItemCountMultiplier) {
- Preconditions.checkArgument(defaultMaxPrefsPerItemConsidered > 0,
"defaultMaxPrefsPerItemConsidered must be " +
- "greater zero");
- Preconditions.checkArgument(userItemCountMultiplier > 0,
"userItemCountMultiplier must be greater zero");
- this.defaultMaxPrefsPerItemConsidered = defaultMaxPrefsPerItemConsidered;
- this.userItemCountMultiplier = userItemCountMultiplier;
+ public SamplingCandidateItemsStrategy(int itemsFactor,
+ int usersPerItemFactor,
+ int candidatesPerUserFactor,
+ int numUsers,
+ int numItems) {
+ Preconditions.checkArgument(itemsFactor > 0);
+ Preconditions.checkArgument(usersPerItemFactor > 0);
+ Preconditions.checkArgument(candidatesPerUserFactor > 0);
+ Preconditions.checkArgument(numUsers > 0);
+ Preconditions.checkArgument(numItems > 0);
+ maxItems = (int) (itemsFactor * (1.0 + Math.log(numItems)));
+ maxUsersPerItem = (int) (itemsFactor * (1.0 + Math.log(numUsers)));
+ maxItemsPerUser = (int) (itemsFactor *(1.0 + Math.log(numItems)));
}
@Override
protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel
dataModel) throws TasteException {
- int maxPrefsPerItemConsidered = (int)
Math.max(defaultMaxPrefsPerItemConsidered,
- userItemCountMultiplier * Math.log(Math.max(dataModel.getNumUsers(),
dataModel.getNumItems())));
FastIDSet possibleItemsIDs = new FastIDSet();
- for (long itemID : preferredItemIDs) {
+ LongPrimitiveIterator preferredItemIDsIterator = new
LongPrimitiveArrayIterator(preferredItemIDs);
+ if (preferredItemIDs.length > maxItems) {
+ preferredItemIDsIterator =
+ new SamplingLongPrimitiveIterator(preferredItemIDsIterator, (double)
maxItems / preferredItemIDs.length);
+ }
+ while (preferredItemIDsIterator.hasNext()) {
+ long itemID = preferredItemIDsIterator.nextLong();
PreferenceArray prefs = dataModel.getPreferencesForItem(itemID);
- int prefsConsidered = Math.min(prefs.length(),
maxPrefsPerItemConsidered);
- Iterator<Preference> sampledPrefs = new
FixedSizeSamplingIterator<Preference>(prefsConsidered, prefs.iterator());
- while (sampledPrefs.hasNext()) {
-
possibleItemsIDs.addAll(dataModel.getItemIDsFromUser(sampledPrefs.next().getUserID()));
+ int prefsLength = prefs.length();
+ if (prefsLength > maxUsersPerItem) {
+ Iterator<Preference> sampledPrefs =
+ new FixedSizeSamplingIterator<Preference>(maxUsersPerItem,
prefs.iterator());
+ while (sampledPrefs.hasNext()) {
+ addSomeOf(possibleItemsIDs,
dataModel.getItemIDsFromUser(sampledPrefs.next().getUserID()));
+ }
+ } else {
+ for (int i = 0; i < prefsLength; i++) {
+ addSomeOf(possibleItemsIDs,
dataModel.getItemIDsFromUser(prefs.getUserID(i)));
+ }
}
}
possibleItemsIDs.removeAll(preferredItemIDs);
return possibleItemsIDs;
}
+
+ private void addSomeOf(FastIDSet possibleItemIDs, FastIDSet itemIDs) {
+ if (itemIDs.size() > maxItemsPerUser) {
+ LongPrimitiveIterator it =
+ new SamplingLongPrimitiveIterator(itemIDs.iterator(), (double)
maxItemsPerUser / itemIDs.size());
+ while (it.hasNext()) {
+ possibleItemIDs.add(it.nextLong());
+ }
+ } else {
+ possibleItemIDs.addAll(itemIDs);
+ }
+ }
+
}
Modified:
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategyTest.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategyTest.java?rev=1211377&r1=1211376&r2=1211377&view=diff
==============================================================================
---
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategyTest.java
(original)
+++
mahout/trunk/core/src/test/java/org/apache/mahout/cf/taste/impl/recommender/SamplingCandidateItemsStrategyTest.java
Wed Dec 7 10:24:47 2011
@@ -60,7 +60,8 @@ public final class SamplingCandidateItem
DataModel dataModel = new GenericDataModel(userData);
- CandidateItemsStrategy strategy = new SamplingCandidateItemsStrategy(1, 1);
+ CandidateItemsStrategy strategy =
+ new SamplingCandidateItemsStrategy(1, 1, 1, dataModel.getNumUsers(),
dataModel.getNumItems());
FastIDSet candidateItems = strategy.getCandidateItems(123L,
prefArrayOfUser123, dataModel);
/* result can be either item2 or item3 or empty */