http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java new file mode 100644 index 0000000..59c30d9 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AbstractSimilarity.java @@ -0,0 +1,343 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; +import java.util.concurrent.Callable; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.common.Weighting; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.PreferenceArray; +import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +import com.google.common.base.Preconditions; + +/** Abstract superclass encapsulating functionality that is common to most implementations in this package. */ +abstract class AbstractSimilarity extends AbstractItemSimilarity implements UserSimilarity { + + private PreferenceInferrer inferrer; + private final boolean weighted; + private final boolean centerData; + private int cachedNumItems; + private int cachedNumUsers; + private final RefreshHelper refreshHelper; + + /** + * <p> + * Creates a possibly weighted {@link AbstractSimilarity}. + * </p> + */ + AbstractSimilarity(final DataModel dataModel, Weighting weighting, boolean centerData) throws TasteException { + super(dataModel); + this.weighted = weighting == Weighting.WEIGHTED; + this.centerData = centerData; + this.cachedNumItems = dataModel.getNumItems(); + this.cachedNumUsers = dataModel.getNumUsers(); + this.refreshHelper = new RefreshHelper(new Callable<Object>() { + @Override + public Object call() throws TasteException { + cachedNumItems = dataModel.getNumItems(); + cachedNumUsers = dataModel.getNumUsers(); + return null; + } + }); + } + + final PreferenceInferrer getPreferenceInferrer() { + return inferrer; + } + + @Override + public final void setPreferenceInferrer(PreferenceInferrer inferrer) { + Preconditions.checkArgument(inferrer != null, "inferrer is null"); + refreshHelper.addDependency(inferrer); + refreshHelper.removeDependency(this.inferrer); + this.inferrer = inferrer; + } + + final boolean isWeighted() { + return weighted; + } + + /** + * <p> + * Several subclasses in this package implement this method to actually compute the similarity from figures + * computed over users or items. Note that the computations in this class "center" the data, such that X and + * Y's mean are 0. + * </p> + * + * <p> + * Note that the sum of all X and Y values must then be 0. This value isn't passed down into the standard + * similarity computations as a result. + * </p> + * + * @param n + * total number of users or items + * @param sumXY + * sum of product of user/item preference values, over all items/users preferred by both + * users/items + * @param sumX2 + * sum of the square of user/item preference values, over the first item/user + * @param sumY2 + * sum of the square of the user/item preference values, over the second item/user + * @param sumXYdiff2 + * sum of squares of differences in X and Y values + * @return similarity value between -1.0 and 1.0, inclusive, or {@link Double#NaN} if no similarity can be + * computed (e.g. when no items have been rated by both users + */ + abstract double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2); + + @Override + public double userSimilarity(long userID1, long userID2) throws TasteException { + DataModel dataModel = getDataModel(); + PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1); + PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2); + int xLength = xPrefs.length(); + int yLength = yPrefs.length(); + + if (xLength == 0 || yLength == 0) { + return Double.NaN; + } + + long xIndex = xPrefs.getItemID(0); + long yIndex = yPrefs.getItemID(0); + int xPrefIndex = 0; + int yPrefIndex = 0; + + double sumX = 0.0; + double sumX2 = 0.0; + double sumY = 0.0; + double sumY2 = 0.0; + double sumXY = 0.0; + double sumXYdiff2 = 0.0; + int count = 0; + + boolean hasInferrer = inferrer != null; + + while (true) { + int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0; + if (hasInferrer || compare == 0) { + double x; + double y; + if (xIndex == yIndex) { + // Both users expressed a preference for the item + x = xPrefs.getValue(xPrefIndex); + y = yPrefs.getValue(yPrefIndex); + } else { + // Only one user expressed a preference, but infer the other one's preference and tally + // as if the other user expressed that preference + if (compare < 0) { + // X has a value; infer Y's + x = xPrefs.getValue(xPrefIndex); + y = inferrer.inferPreference(userID2, xIndex); + } else { + // compare > 0 + // Y has a value; infer X's + x = inferrer.inferPreference(userID1, yIndex); + y = yPrefs.getValue(yPrefIndex); + } + } + sumXY += x * y; + sumX += x; + sumX2 += x * x; + sumY += y; + sumY2 += y * y; + double diff = x - y; + sumXYdiff2 += diff * diff; + count++; + } + if (compare <= 0) { + if (++xPrefIndex >= xLength) { + if (hasInferrer) { + // Must count other Ys; pretend next X is far away + if (yIndex == Long.MAX_VALUE) { + // ... but stop if both are done! + break; + } + xIndex = Long.MAX_VALUE; + } else { + break; + } + } else { + xIndex = xPrefs.getItemID(xPrefIndex); + } + } + if (compare >= 0) { + if (++yPrefIndex >= yLength) { + if (hasInferrer) { + // Must count other Xs; pretend next Y is far away + if (xIndex == Long.MAX_VALUE) { + // ... but stop if both are done! + break; + } + yIndex = Long.MAX_VALUE; + } else { + break; + } + } else { + yIndex = yPrefs.getItemID(yPrefIndex); + } + } + } + + // "Center" the data. If my math is correct, this'll do it. + double result; + if (centerData) { + double meanX = sumX / count; + double meanY = sumY / count; + // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY; + double centeredSumXY = sumXY - meanY * sumX; + // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX; + double centeredSumX2 = sumX2 - meanX * sumX; + // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY; + double centeredSumY2 = sumY2 - meanY * sumY; + result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2); + } else { + result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2); + } + + if (!Double.isNaN(result)) { + result = normalizeWeightResult(result, count, cachedNumItems); + } + return result; + } + + @Override + public final double itemSimilarity(long itemID1, long itemID2) throws TasteException { + DataModel dataModel = getDataModel(); + PreferenceArray xPrefs = dataModel.getPreferencesForItem(itemID1); + PreferenceArray yPrefs = dataModel.getPreferencesForItem(itemID2); + int xLength = xPrefs.length(); + int yLength = yPrefs.length(); + + if (xLength == 0 || yLength == 0) { + return Double.NaN; + } + + long xIndex = xPrefs.getUserID(0); + long yIndex = yPrefs.getUserID(0); + int xPrefIndex = 0; + int yPrefIndex = 0; + + double sumX = 0.0; + double sumX2 = 0.0; + double sumY = 0.0; + double sumY2 = 0.0; + double sumXY = 0.0; + double sumXYdiff2 = 0.0; + int count = 0; + + // No, pref inferrers and transforms don't apply here. I think. + + while (true) { + int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0; + if (compare == 0) { + // Both users expressed a preference for the item + double x = xPrefs.getValue(xPrefIndex); + double y = yPrefs.getValue(yPrefIndex); + sumXY += x * y; + sumX += x; + sumX2 += x * x; + sumY += y; + sumY2 += y * y; + double diff = x - y; + sumXYdiff2 += diff * diff; + count++; + } + if (compare <= 0) { + if (++xPrefIndex == xLength) { + break; + } + xIndex = xPrefs.getUserID(xPrefIndex); + } + if (compare >= 0) { + if (++yPrefIndex == yLength) { + break; + } + yIndex = yPrefs.getUserID(yPrefIndex); + } + } + + double result; + if (centerData) { + // See comments above on these computations + double n = (double) count; + double meanX = sumX / n; + double meanY = sumY / n; + // double centeredSumXY = sumXY - meanY * sumX - meanX * sumY + n * meanX * meanY; + double centeredSumXY = sumXY - meanY * sumX; + // double centeredSumX2 = sumX2 - 2.0 * meanX * sumX + n * meanX * meanX; + double centeredSumX2 = sumX2 - meanX * sumX; + // double centeredSumY2 = sumY2 - 2.0 * meanY * sumY + n * meanY * meanY; + double centeredSumY2 = sumY2 - meanY * sumY; + result = computeResult(count, centeredSumXY, centeredSumX2, centeredSumY2, sumXYdiff2); + } else { + result = computeResult(count, sumXY, sumX2, sumY2, sumXYdiff2); + } + + if (!Double.isNaN(result)) { + result = normalizeWeightResult(result, count, cachedNumUsers); + } + return result; + } + + @Override + public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { + int length = itemID2s.length; + double[] result = new double[length]; + for (int i = 0; i < length; i++) { + result[i] = itemSimilarity(itemID1, itemID2s[i]); + } + return result; + } + + final double normalizeWeightResult(double result, int count, int num) { + double normalizedResult = result; + if (weighted) { + double scaleFactor = 1.0 - (double) count / (double) (num + 1); + if (normalizedResult < 0.0) { + normalizedResult = -1.0 + scaleFactor * (1.0 + normalizedResult); + } else { + normalizedResult = 1.0 - scaleFactor * (1.0 - normalizedResult); + } + } + // Make sure the result is not accidentally a little outside [-1.0, 1.0] due to rounding: + if (normalizedResult < -1.0) { + normalizedResult = -1.0; + } else if (normalizedResult > 1.0) { + normalizedResult = 1.0; + } + return normalizedResult; + } + + @Override + public final void refresh(Collection<Refreshable> alreadyRefreshed) { + super.refresh(alreadyRefreshed); + refreshHelper.refresh(alreadyRefreshed); + } + + @Override + public final String toString() { + return this.getClass().getSimpleName() + "[dataModel:" + getDataModel() + ",inferrer:" + inferrer + ']'; + } + +}
http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java new file mode 100644 index 0000000..7c655fe --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/AveragingPreferenceInferrer.java @@ -0,0 +1,85 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.Cache; +import org.apache.mahout.cf.taste.impl.common.FullRunningAverage; +import org.apache.mahout.cf.taste.impl.common.Retriever; +import org.apache.mahout.cf.taste.impl.common.RunningAverage; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.PreferenceArray; +import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; + +/** + * <p> + * Implementations of this interface compute an inferred preference for a user and an item that the user has + * not expressed any preference for. This might be an average of other preferences scores from that user, for + * example. This technique is sometimes called "default voting". + * </p> + */ +public final class AveragingPreferenceInferrer implements PreferenceInferrer { + + private static final Float ZERO = 0.0f; + + private final DataModel dataModel; + private final Cache<Long,Float> averagePreferenceValue; + + public AveragingPreferenceInferrer(DataModel dataModel) throws TasteException { + this.dataModel = dataModel; + Retriever<Long,Float> retriever = new PrefRetriever(); + averagePreferenceValue = new Cache<>(retriever, dataModel.getNumUsers()); + refresh(null); + } + + @Override + public float inferPreference(long userID, long itemID) throws TasteException { + return averagePreferenceValue.get(userID); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + averagePreferenceValue.clear(); + } + + private final class PrefRetriever implements Retriever<Long,Float> { + + @Override + public Float get(Long key) throws TasteException { + PreferenceArray prefs = dataModel.getPreferencesFromUser(key); + int size = prefs.length(); + if (size == 0) { + return ZERO; + } + RunningAverage average = new FullRunningAverage(); + for (int i = 0; i < size; i++) { + average.addDatum(prefs.getValue(i)); + } + return (float) average.getAverage(); + } + } + + @Override + public String toString() { + return "AveragingPreferenceInferrer"; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java new file mode 100644 index 0000000..87aeae9 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingItemSimilarity.java @@ -0,0 +1,111 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; +import java.util.concurrent.Callable; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.Cache; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.impl.common.Retriever; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.ItemSimilarity; +import org.apache.mahout.common.LongPair; +import com.google.common.base.Preconditions; + +/** + * Caches the results from an underlying {@link ItemSimilarity} implementation. + */ +public final class CachingItemSimilarity implements ItemSimilarity { + + private final ItemSimilarity similarity; + private final Cache<LongPair,Double> similarityCache; + private final RefreshHelper refreshHelper; + + /** + * Creates this on top of the given {@link ItemSimilarity}. + * The cache is sized according to properties of the given {@link DataModel}. + */ + public CachingItemSimilarity(ItemSimilarity similarity, DataModel dataModel) throws TasteException { + this(similarity, dataModel.getNumItems()); + } + + /** + * Creates this on top of the given {@link ItemSimilarity}. + * The cache size is capped by the given size. + */ + public CachingItemSimilarity(ItemSimilarity similarity, int maxCacheSize) { + Preconditions.checkArgument(similarity != null, "similarity is null"); + this.similarity = similarity; + this.similarityCache = new Cache<>(new SimilarityRetriever(similarity), maxCacheSize); + this.refreshHelper = new RefreshHelper(new Callable<Void>() { + @Override + public Void call() { + similarityCache.clear(); + return null; + } + }); + refreshHelper.addDependency(similarity); + } + + @Override + public double itemSimilarity(long itemID1, long itemID2) throws TasteException { + LongPair key = itemID1 < itemID2 ? new LongPair(itemID1, itemID2) : new LongPair(itemID2, itemID1); + return similarityCache.get(key); + } + + @Override + public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { + int length = itemID2s.length; + double[] result = new double[length]; + for (int i = 0; i < length; i++) { + result[i] = itemSimilarity(itemID1, itemID2s[i]); + } + return result; + } + + @Override + public long[] allSimilarItemIDs(long itemID) throws TasteException { + return similarity.allSimilarItemIDs(itemID); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + refreshHelper.refresh(alreadyRefreshed); + } + + public void clearCacheForItem(long itemID) { + similarityCache.removeKeysMatching(new LongPairMatchPredicate(itemID)); + } + + private static final class SimilarityRetriever implements Retriever<LongPair,Double> { + private final ItemSimilarity similarity; + + private SimilarityRetriever(ItemSimilarity similarity) { + this.similarity = similarity; + } + + @Override + public Double get(LongPair key) throws TasteException { + return similarity.itemSimilarity(key.getFirst(), key.getSecond()); + } + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java new file mode 100644 index 0000000..873568a --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CachingUserSimilarity.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; +import java.util.concurrent.Callable; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.Cache; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.impl.common.Retriever; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; +import org.apache.mahout.common.LongPair; + +import com.google.common.base.Preconditions; + +/** + * Caches the results from an underlying {@link UserSimilarity} implementation. + */ +public final class CachingUserSimilarity implements UserSimilarity { + + private final UserSimilarity similarity; + private final Cache<LongPair,Double> similarityCache; + private final RefreshHelper refreshHelper; + + /** + * Creates this on top of the given {@link UserSimilarity}. + * The cache is sized according to properties of the given {@link DataModel}. + */ + public CachingUserSimilarity(UserSimilarity similarity, DataModel dataModel) throws TasteException { + this(similarity, dataModel.getNumUsers()); + } + + /** + * Creates this on top of the given {@link UserSimilarity}. + * The cache size is capped by the given size. + */ + public CachingUserSimilarity(UserSimilarity similarity, int maxCacheSize) { + Preconditions.checkArgument(similarity != null, "similarity is null"); + this.similarity = similarity; + this.similarityCache = new Cache<>(new SimilarityRetriever(similarity), maxCacheSize); + this.refreshHelper = new RefreshHelper(new Callable<Void>() { + @Override + public Void call() { + similarityCache.clear(); + return null; + } + }); + refreshHelper.addDependency(similarity); + } + + @Override + public double userSimilarity(long userID1, long userID2) throws TasteException { + LongPair key = userID1 < userID2 ? new LongPair(userID1, userID2) : new LongPair(userID2, userID1); + return similarityCache.get(key); + } + + @Override + public void setPreferenceInferrer(PreferenceInferrer inferrer) { + similarityCache.clear(); + similarity.setPreferenceInferrer(inferrer); + } + + public void clearCacheForUser(long userID) { + similarityCache.removeKeysMatching(new LongPairMatchPredicate(userID)); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + refreshHelper.refresh(alreadyRefreshed); + } + + private static final class SimilarityRetriever implements Retriever<LongPair,Double> { + private final UserSimilarity similarity; + + private SimilarityRetriever(UserSimilarity similarity) { + this.similarity = similarity; + } + + @Override + public Double get(LongPair key) throws TasteException { + return similarity.userSimilarity(key.getFirst(), key.getSecond()); + } + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java new file mode 100644 index 0000000..88fbe58 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/CityBlockSimilarity.java @@ -0,0 +1,98 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +/** + * Implementation of City Block distance (also known as Manhattan distance) - the absolute value of the difference of + * each direction is summed. The resulting unbounded distance is then mapped between 0 and 1. + */ +public final class CityBlockSimilarity extends AbstractItemSimilarity implements UserSimilarity { + + public CityBlockSimilarity(DataModel dataModel) { + super(dataModel); + } + + /** + * @throws UnsupportedOperationException + */ + @Override + public void setPreferenceInferrer(PreferenceInferrer inferrer) { + throw new UnsupportedOperationException(); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + Collection<Refreshable> refreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); + RefreshHelper.maybeRefresh(refreshed, getDataModel()); + } + + @Override + public double itemSimilarity(long itemID1, long itemID2) throws TasteException { + DataModel dataModel = getDataModel(); + int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1); + int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2); + int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2); + return doSimilarity(preferring1, preferring2, intersection); + } + + @Override + public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { + DataModel dataModel = getDataModel(); + int preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1); + double[] distance = new double[itemID2s.length]; + for (int i = 0; i < itemID2s.length; ++i) { + int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2s[i]); + int intersection = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2s[i]); + distance[i] = doSimilarity(preferring1, preferring2, intersection); + } + return distance; + } + + @Override + public double userSimilarity(long userID1, long userID2) throws TasteException { + DataModel dataModel = getDataModel(); + FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1); + FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2); + int prefs1Size = prefs1.size(); + int prefs2Size = prefs2.size(); + int intersectionSize = prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2); + return doSimilarity(prefs1Size, prefs2Size, intersectionSize); + } + + /** + * Calculate City Block Distance from total non-zero values and intersections and map to a similarity value. + * + * @param pref1 number of non-zero values in left vector + * @param pref2 number of non-zero values in right vector + * @param intersection number of overlapping non-zero values + */ + private static double doSimilarity(int pref1, int pref2, int intersection) { + int distance = pref1 + pref2 - 2 * intersection; + return 1.0 / (1.0 + distance); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java new file mode 100644 index 0000000..990e9ea --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/EuclideanDistanceSimilarity.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.common.Weighting; +import org.apache.mahout.cf.taste.model.DataModel; + +import com.google.common.base.Preconditions; + +/** + * <p> + * An implementation of a "similarity" based on the Euclidean "distance" between two users X and Y. Thinking + * of items as dimensions and preferences as points along those dimensions, a distance is computed using all + * items (dimensions) where both users have expressed a preference for that item. This is simply the square + * root of the sum of the squares of differences in position (preference) along each dimension.</p> + * + * <p>The similarity could be computed as 1 / (1 + distance / sqrt(n)), so the resulting values are in the range (0,1]. + * This would weight against pairs that overlap in more dimensions, which should indicate more similarity, + * since more dimensions offer more opportunities to be farther apart. Actually, it is computed as + * sqrt(n) / (1 + distance), where n is the number of dimensions, in order to help correct for this. + * sqrt(n) is chosen since randomly-chosen points have a distance that grows as sqrt(n).</p> + * + * <p>Note that this could cause a similarity to exceed 1; such values are capped at 1.</p> + * + * <p>Note that the distance isn't normalized in any way; it's not valid to compare similarities computed from + * different domains (different rating scales, for example). Within one domain, normalizing doesn't matter much as + * it doesn't change ordering.</p> + */ +public final class EuclideanDistanceSimilarity extends AbstractSimilarity { + + /** + * @throws IllegalArgumentException if {@link DataModel} does not have preference values + */ + public EuclideanDistanceSimilarity(DataModel dataModel) throws TasteException { + this(dataModel, Weighting.UNWEIGHTED); + } + + /** + * @throws IllegalArgumentException if {@link DataModel} does not have preference values + */ + public EuclideanDistanceSimilarity(DataModel dataModel, Weighting weighting) throws TasteException { + super(dataModel, weighting, false); + Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values"); + } + + @Override + double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) { + return 1.0 / (1.0 + Math.sqrt(sumXYdiff2) / Math.sqrt(n)); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java new file mode 100644 index 0000000..d0c9b8c --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericItemSimilarity.java @@ -0,0 +1,358 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; +import java.util.Iterator; + +import com.google.common.collect.AbstractIterator; +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastByIDMap; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.impl.recommender.TopItems; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.ItemSimilarity; +import org.apache.mahout.common.RandomUtils; + +import com.google.common.base.Preconditions; + +/** + * <p> + * A "generic" {@link ItemSimilarity} which takes a static list of precomputed item similarities and bases its + * responses on that alone. The values may have been precomputed offline by another process, stored in a file, + * and then read and fed into an instance of this class. + * </p> + * + * <p> + * This is perhaps the best {@link ItemSimilarity} to use with + * {@link org.apache.mahout.cf.taste.impl.recommender.GenericItemBasedRecommender}, for now, since the point + * of item-based recommenders is that they can take advantage of the fact that item similarity is relatively + * static, can be precomputed, and then used in computation to gain a significant performance advantage. + * </p> + */ +public final class GenericItemSimilarity implements ItemSimilarity { + + private static final long[] NO_IDS = new long[0]; + + private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<>(); + private final FastByIDMap<FastIDSet> similarItemIDsIndex = new FastByIDMap<>(); + + /** + * <p> + * Creates a {@link GenericItemSimilarity} from a precomputed list of {@link ItemItemSimilarity}s. Each + * represents the similarity between two distinct items. Since similarity is assumed to be symmetric, it is + * not necessary to specify similarity between item1 and item2, and item2 and item1. Both are the same. It + * is also not necessary to specify a similarity between any item and itself; these are assumed to be 1.0. + * </p> + * + * <p> + * Note that specifying a similarity between two items twice is not an error, but, the later value will win. + * </p> + * + * @param similarities + * set of {@link ItemItemSimilarity}s on which to base this instance + */ + public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities) { + initSimilarityMaps(similarities.iterator()); + } + + /** + * <p> + * Like {@link #GenericItemSimilarity(Iterable)}, but will only keep the specified number of similarities + * from the given {@link Iterable} of similarities. It will keep those with the highest similarity -- those + * that are therefore most important. + * </p> + * + * <p> + * Thanks to tsmorton for suggesting this and providing part of the implementation. + * </p> + * + * @param similarities + * set of {@link ItemItemSimilarity}s on which to base this instance + * @param maxToKeep + * maximum number of similarities to keep + */ + public GenericItemSimilarity(Iterable<ItemItemSimilarity> similarities, int maxToKeep) { + Iterable<ItemItemSimilarity> keptSimilarities = + TopItems.getTopItemItemSimilarities(maxToKeep, similarities.iterator()); + initSimilarityMaps(keptSimilarities.iterator()); + } + + /** + * <p> + * Builds a list of item-item similarities given an {@link ItemSimilarity} implementation and a + * {@link DataModel}, rather than a list of {@link ItemItemSimilarity}s. + * </p> + * + * <p> + * It's valid to build a {@link GenericItemSimilarity} this way, but perhaps missing some of the point of an + * item-based recommender. Item-based recommenders use the assumption that item-item similarities are + * relatively fixed, and might be known already independent of user preferences. Hence it is useful to + * inject that information, using {@link #GenericItemSimilarity(Iterable)}. + * </p> + * + * @param otherSimilarity + * other {@link ItemSimilarity} to get similarities from + * @param dataModel + * data model to get items from + * @throws TasteException + * if an error occurs while accessing the {@link DataModel} items + */ + public GenericItemSimilarity(ItemSimilarity otherSimilarity, DataModel dataModel) throws TasteException { + long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs()); + initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, itemIDs)); + } + + /** + * <p> + * Like {@link #GenericItemSimilarity(ItemSimilarity, DataModel)} )}, but will only keep the specified + * number of similarities from the given {@link DataModel}. It will keep those with the highest similarity + * -- those that are therefore most important. + * </p> + * + * <p> + * Thanks to tsmorton for suggesting this and providing part of the implementation. + * </p> + * + * @param otherSimilarity + * other {@link ItemSimilarity} to get similarities from + * @param dataModel + * data model to get items from + * @param maxToKeep + * maximum number of similarities to keep + * @throws TasteException + * if an error occurs while accessing the {@link DataModel} items + */ + public GenericItemSimilarity(ItemSimilarity otherSimilarity, + DataModel dataModel, + int maxToKeep) throws TasteException { + long[] itemIDs = GenericUserSimilarity.longIteratorToList(dataModel.getItemIDs()); + Iterator<ItemItemSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, itemIDs); + Iterable<ItemItemSimilarity> keptSimilarities = TopItems.getTopItemItemSimilarities(maxToKeep, it); + initSimilarityMaps(keptSimilarities.iterator()); + } + + private void initSimilarityMaps(Iterator<ItemItemSimilarity> similarities) { + while (similarities.hasNext()) { + ItemItemSimilarity iic = similarities.next(); + long similarityItemID1 = iic.getItemID1(); + long similarityItemID2 = iic.getItemID2(); + if (similarityItemID1 != similarityItemID2) { + // Order them -- first key should be the "smaller" one + long itemID1; + long itemID2; + if (similarityItemID1 < similarityItemID2) { + itemID1 = similarityItemID1; + itemID2 = similarityItemID2; + } else { + itemID1 = similarityItemID2; + itemID2 = similarityItemID1; + } + FastByIDMap<Double> map = similarityMaps.get(itemID1); + if (map == null) { + map = new FastByIDMap<>(); + similarityMaps.put(itemID1, map); + } + map.put(itemID2, iic.getValue()); + + doIndex(itemID1, itemID2); + doIndex(itemID2, itemID1); + } + // else similarity between item and itself already assumed to be 1.0 + } + } + + private void doIndex(long fromItemID, long toItemID) { + FastIDSet similarItemIDs = similarItemIDsIndex.get(fromItemID); + if (similarItemIDs == null) { + similarItemIDs = new FastIDSet(); + similarItemIDsIndex.put(fromItemID, similarItemIDs); + } + similarItemIDs.add(toItemID); + } + + /** + * <p> + * Returns the similarity between two items. Note that similarity is assumed to be symmetric, that + * {@code itemSimilarity(item1, item2) == itemSimilarity(item2, item1)}, and that + * {@code itemSimilarity(item1,item1) == 1.0} for all items. + * </p> + * + * @param itemID1 + * first item + * @param itemID2 + * second item + * @return similarity between the two + */ + @Override + public double itemSimilarity(long itemID1, long itemID2) { + if (itemID1 == itemID2) { + return 1.0; + } + long firstID; + long secondID; + if (itemID1 < itemID2) { + firstID = itemID1; + secondID = itemID2; + } else { + firstID = itemID2; + secondID = itemID1; + } + FastByIDMap<Double> nextMap = similarityMaps.get(firstID); + if (nextMap == null) { + return Double.NaN; + } + Double similarity = nextMap.get(secondID); + return similarity == null ? Double.NaN : similarity; + } + + @Override + public double[] itemSimilarities(long itemID1, long[] itemID2s) { + int length = itemID2s.length; + double[] result = new double[length]; + for (int i = 0; i < length; i++) { + result[i] = itemSimilarity(itemID1, itemID2s[i]); + } + return result; + } + + @Override + public long[] allSimilarItemIDs(long itemID) { + FastIDSet similarItemIDs = similarItemIDsIndex.get(itemID); + return similarItemIDs != null ? similarItemIDs.toArray() : NO_IDS; + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + // Do nothing + } + + /** Encapsulates a similarity between two items. Similarity must be in the range [-1.0,1.0]. */ + public static final class ItemItemSimilarity implements Comparable<ItemItemSimilarity> { + + private final long itemID1; + private final long itemID2; + private final double value; + + /** + * @param itemID1 + * first item + * @param itemID2 + * second item + * @param value + * similarity between the two + * @throws IllegalArgumentException + * if value is NaN, less than -1.0 or greater than 1.0 + */ + public ItemItemSimilarity(long itemID1, long itemID2, double value) { + Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: " + value + ". Must be: -1.0 <= value <= 1.0"); + this.itemID1 = itemID1; + this.itemID2 = itemID2; + this.value = value; + } + + public long getItemID1() { + return itemID1; + } + + public long getItemID2() { + return itemID2; + } + + public double getValue() { + return value; + } + + @Override + public String toString() { + return "ItemItemSimilarity[" + itemID1 + ',' + itemID2 + ':' + value + ']'; + } + + /** Defines an ordering from highest similarity to lowest. */ + @Override + public int compareTo(ItemItemSimilarity other) { + double otherValue = other.getValue(); + return value > otherValue ? -1 : value < otherValue ? 1 : 0; + } + + @Override + public boolean equals(Object other) { + if (!(other instanceof ItemItemSimilarity)) { + return false; + } + ItemItemSimilarity otherSimilarity = (ItemItemSimilarity) other; + return otherSimilarity.getItemID1() == itemID1 + && otherSimilarity.getItemID2() == itemID2 + && otherSimilarity.getValue() == value; + } + + @Override + public int hashCode() { + return (int) itemID1 ^ (int) itemID2 ^ RandomUtils.hashDouble(value); + } + + } + + private static final class DataModelSimilaritiesIterator extends AbstractIterator<ItemItemSimilarity> { + + private final ItemSimilarity otherSimilarity; + private final long[] itemIDs; + private int i; + private long itemID1; + private int j; + + private DataModelSimilaritiesIterator(ItemSimilarity otherSimilarity, long[] itemIDs) { + this.otherSimilarity = otherSimilarity; + this.itemIDs = itemIDs; + i = 0; + itemID1 = itemIDs[0]; + j = 1; + } + + @Override + protected ItemItemSimilarity computeNext() { + int size = itemIDs.length; + ItemItemSimilarity result = null; + while (result == null && i < size - 1) { + long itemID2 = itemIDs[j]; + double similarity; + try { + similarity = otherSimilarity.itemSimilarity(itemID1, itemID2); + } catch (TasteException te) { + // ugly: + throw new IllegalStateException(te); + } + if (!Double.isNaN(similarity)) { + result = new ItemItemSimilarity(itemID1, itemID2, similarity); + } + if (++j == size) { + itemID1 = itemIDs[++i]; + j = i + 1; + } + } + if (result == null) { + return endOfData(); + } else { + return result; + } + } + + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java new file mode 100644 index 0000000..1c221c2 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/GenericUserSimilarity.java @@ -0,0 +1,238 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; +import java.util.Iterator; + +import com.google.common.collect.AbstractIterator; +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastByIDMap; +import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; +import org.apache.mahout.cf.taste.impl.recommender.TopItems; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; +import org.apache.mahout.common.RandomUtils; + +import com.google.common.base.Preconditions; + +public final class GenericUserSimilarity implements UserSimilarity { + + private final FastByIDMap<FastByIDMap<Double>> similarityMaps = new FastByIDMap<>(); + + public GenericUserSimilarity(Iterable<UserUserSimilarity> similarities) { + initSimilarityMaps(similarities.iterator()); + } + + public GenericUserSimilarity(Iterable<UserUserSimilarity> similarities, int maxToKeep) { + Iterable<UserUserSimilarity> keptSimilarities = + TopItems.getTopUserUserSimilarities(maxToKeep, similarities.iterator()); + initSimilarityMaps(keptSimilarities.iterator()); + } + + public GenericUserSimilarity(UserSimilarity otherSimilarity, DataModel dataModel) throws TasteException { + long[] userIDs = longIteratorToList(dataModel.getUserIDs()); + initSimilarityMaps(new DataModelSimilaritiesIterator(otherSimilarity, userIDs)); + } + + public GenericUserSimilarity(UserSimilarity otherSimilarity, + DataModel dataModel, + int maxToKeep) throws TasteException { + long[] userIDs = longIteratorToList(dataModel.getUserIDs()); + Iterator<UserUserSimilarity> it = new DataModelSimilaritiesIterator(otherSimilarity, userIDs); + Iterable<UserUserSimilarity> keptSimilarities = TopItems.getTopUserUserSimilarities(maxToKeep, it); + initSimilarityMaps(keptSimilarities.iterator()); + } + + static long[] longIteratorToList(LongPrimitiveIterator iterator) { + long[] result = new long[5]; + int size = 0; + while (iterator.hasNext()) { + if (size == result.length) { + long[] newResult = new long[result.length << 1]; + System.arraycopy(result, 0, newResult, 0, result.length); + result = newResult; + } + result[size++] = iterator.next(); + } + if (size != result.length) { + long[] newResult = new long[size]; + System.arraycopy(result, 0, newResult, 0, size); + result = newResult; + } + return result; + } + + private void initSimilarityMaps(Iterator<UserUserSimilarity> similarities) { + while (similarities.hasNext()) { + UserUserSimilarity uuc = similarities.next(); + long similarityUser1 = uuc.getUserID1(); + long similarityUser2 = uuc.getUserID2(); + if (similarityUser1 != similarityUser2) { + // Order them -- first key should be the "smaller" one + long user1; + long user2; + if (similarityUser1 < similarityUser2) { + user1 = similarityUser1; + user2 = similarityUser2; + } else { + user1 = similarityUser2; + user2 = similarityUser1; + } + FastByIDMap<Double> map = similarityMaps.get(user1); + if (map == null) { + map = new FastByIDMap<>(); + similarityMaps.put(user1, map); + } + map.put(user2, uuc.getValue()); + } + // else similarity between user and itself already assumed to be 1.0 + } + } + + @Override + public double userSimilarity(long userID1, long userID2) { + if (userID1 == userID2) { + return 1.0; + } + long first; + long second; + if (userID1 < userID2) { + first = userID1; + second = userID2; + } else { + first = userID2; + second = userID1; + } + FastByIDMap<Double> nextMap = similarityMaps.get(first); + if (nextMap == null) { + return Double.NaN; + } + Double similarity = nextMap.get(second); + return similarity == null ? Double.NaN : similarity; + } + + @Override + public void setPreferenceInferrer(PreferenceInferrer inferrer) { + throw new UnsupportedOperationException(); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + // Do nothing + } + + public static final class UserUserSimilarity implements Comparable<UserUserSimilarity> { + + private final long userID1; + private final long userID2; + private final double value; + + public UserUserSimilarity(long userID1, long userID2, double value) { + Preconditions.checkArgument(value >= -1.0 && value <= 1.0, "Illegal value: " + value + ". Must be: -1.0 <= value <= 1.0"); + this.userID1 = userID1; + this.userID2 = userID2; + this.value = value; + } + + public long getUserID1() { + return userID1; + } + + public long getUserID2() { + return userID2; + } + + public double getValue() { + return value; + } + + @Override + public String toString() { + return "UserUserSimilarity[" + userID1 + ',' + userID2 + ':' + value + ']'; + } + + /** Defines an ordering from highest similarity to lowest. */ + @Override + public int compareTo(UserUserSimilarity other) { + double otherValue = other.getValue(); + return value > otherValue ? -1 : value < otherValue ? 1 : 0; + } + + @Override + public boolean equals(Object other) { + if (!(other instanceof UserUserSimilarity)) { + return false; + } + UserUserSimilarity otherSimilarity = (UserUserSimilarity) other; + return otherSimilarity.getUserID1() == userID1 + && otherSimilarity.getUserID2() == userID2 + && otherSimilarity.getValue() == value; + } + + @Override + public int hashCode() { + return (int) userID1 ^ (int) userID2 ^ RandomUtils.hashDouble(value); + } + + } + + private static final class DataModelSimilaritiesIterator extends AbstractIterator<UserUserSimilarity> { + + private final UserSimilarity otherSimilarity; + private final long[] itemIDs; + private int i; + private long itemID1; + private int j; + + private DataModelSimilaritiesIterator(UserSimilarity otherSimilarity, long[] itemIDs) { + this.otherSimilarity = otherSimilarity; + this.itemIDs = itemIDs; + i = 0; + itemID1 = itemIDs[0]; + j = 1; + } + + @Override + protected UserUserSimilarity computeNext() { + int size = itemIDs.length; + while (i < size - 1) { + long itemID2 = itemIDs[j]; + double similarity; + try { + similarity = otherSimilarity.userSimilarity(itemID1, itemID2); + } catch (TasteException te) { + // ugly: + throw new IllegalStateException(te); + } + if (!Double.isNaN(similarity)) { + return new UserUserSimilarity(itemID1, itemID2, similarity); + } + if (++j == size) { + itemID1 = itemIDs[++i]; + j = i + 1; + } + } + return endOfData(); + } + + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java new file mode 100644 index 0000000..3084c8f --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LogLikelihoodSimilarity.java @@ -0,0 +1,121 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; +import org.apache.mahout.math.stats.LogLikelihood; + +/** + * See <a href="http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962"> + * http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.14.5962</a> and + * <a href="http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html"> + * http://tdunning.blogspot.com/2008/03/surprise-and-coincidence.html</a>. + */ +public final class LogLikelihoodSimilarity extends AbstractItemSimilarity implements UserSimilarity { + + public LogLikelihoodSimilarity(DataModel dataModel) { + super(dataModel); + } + + /** + * @throws UnsupportedOperationException + */ + @Override + public void setPreferenceInferrer(PreferenceInferrer inferrer) { + throw new UnsupportedOperationException(); + } + + @Override + public double userSimilarity(long userID1, long userID2) throws TasteException { + + DataModel dataModel = getDataModel(); + FastIDSet prefs1 = dataModel.getItemIDsFromUser(userID1); + FastIDSet prefs2 = dataModel.getItemIDsFromUser(userID2); + + long prefs1Size = prefs1.size(); + long prefs2Size = prefs2.size(); + long intersectionSize = + prefs1Size < prefs2Size ? prefs2.intersectionSize(prefs1) : prefs1.intersectionSize(prefs2); + if (intersectionSize == 0) { + return Double.NaN; + } + long numItems = dataModel.getNumItems(); + double logLikelihood = + LogLikelihood.logLikelihoodRatio(intersectionSize, + prefs2Size - intersectionSize, + prefs1Size - intersectionSize, + numItems - prefs1Size - prefs2Size + intersectionSize); + return 1.0 - 1.0 / (1.0 + logLikelihood); + } + + @Override + public double itemSimilarity(long itemID1, long itemID2) throws TasteException { + DataModel dataModel = getDataModel(); + long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1); + long numUsers = dataModel.getNumUsers(); + return doItemSimilarity(itemID1, itemID2, preferring1, numUsers); + } + + @Override + public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { + DataModel dataModel = getDataModel(); + long preferring1 = dataModel.getNumUsersWithPreferenceFor(itemID1); + long numUsers = dataModel.getNumUsers(); + int length = itemID2s.length; + double[] result = new double[length]; + for (int i = 0; i < length; i++) { + result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1, numUsers); + } + return result; + } + + private double doItemSimilarity(long itemID1, long itemID2, long preferring1, long numUsers) throws TasteException { + DataModel dataModel = getDataModel(); + long preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2); + if (preferring1and2 == 0) { + return Double.NaN; + } + long preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2); + double logLikelihood = + LogLikelihood.logLikelihoodRatio(preferring1and2, + preferring2 - preferring1and2, + preferring1 - preferring1and2, + numUsers - preferring1 - preferring2 + preferring1and2); + return 1.0 - 1.0 / (1.0 + logLikelihood); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); + RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel()); + } + + @Override + public String toString() { + return "LogLikelihoodSimilarity[dataModel:" + getDataModel() + ']'; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java new file mode 100644 index 0000000..48dc4e0 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/LongPairMatchPredicate.java @@ -0,0 +1,40 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import org.apache.mahout.cf.taste.impl.common.Cache; +import org.apache.mahout.common.LongPair; + +/** + * A {@link Cache.MatchPredicate} which will match an ID against either element of a + * {@link LongPair}. + */ +final class LongPairMatchPredicate implements Cache.MatchPredicate<LongPair> { + + private final long id; + + LongPairMatchPredicate(long id) { + this.id = id; + } + + @Override + public boolean matches(LongPair pair) { + return pair.getFirst() == id || pair.getSecond() == id; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java new file mode 100644 index 0000000..8ea1660 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/PearsonCorrelationSimilarity.java @@ -0,0 +1,93 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.common.Weighting; +import org.apache.mahout.cf.taste.model.DataModel; + +import com.google.common.base.Preconditions; + +/** + * <p> + * An implementation of the Pearson correlation. For users X and Y, the following values are calculated: + * </p> + * + * <ul> + * <li>sumX2: sum of the square of all X's preference values</li> + * <li>sumY2: sum of the square of all Y's preference values</li> + * <li>sumXY: sum of the product of X and Y's preference value for all items for which both X and Y express a + * preference</li> + * </ul> + * + * <p> + * The correlation is then: + * + * <p> + * {@code sumXY / sqrt(sumX2 * sumY2)} + * </p> + * + * <p> + * Note that this correlation "centers" its data, shifts the user's preference values so that each of their + * means is 0. This is necessary to achieve expected behavior on all data sets. + * </p> + * + * <p> + * This correlation implementation is equivalent to the cosine similarity since the data it receives + * is assumed to be centered -- mean is 0. The correlation may be interpreted as the cosine of the angle + * between the two vectors defined by the users' preference values. + * </p> + * + * <p> + * For cosine similarity on uncentered data, see {@link UncenteredCosineSimilarity}. + * </p> + */ +public final class PearsonCorrelationSimilarity extends AbstractSimilarity { + + /** + * @throws IllegalArgumentException if {@link DataModel} does not have preference values + */ + public PearsonCorrelationSimilarity(DataModel dataModel) throws TasteException { + this(dataModel, Weighting.UNWEIGHTED); + } + + /** + * @throws IllegalArgumentException if {@link DataModel} does not have preference values + */ + public PearsonCorrelationSimilarity(DataModel dataModel, Weighting weighting) throws TasteException { + super(dataModel, weighting, true); + Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values"); + } + + @Override + double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) { + if (n == 0) { + return Double.NaN; + } + // Note that sum of X and sum of Y don't appear here since they are assumed to be 0; + // the data is assumed to be centered. + double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2); + if (denominator == 0.0) { + // One or both parties has -all- the same ratings; + // can't really say much similarity under this measure + return Double.NaN; + } + return sumXY / denominator; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java new file mode 100644 index 0000000..1116368 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/SpearmanCorrelationSimilarity.java @@ -0,0 +1,135 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.PreferenceArray; +import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +import com.google.common.base.Preconditions; + +/** + * <p> + * Like {@link PearsonCorrelationSimilarity}, but compares relative ranking of preference values instead of + * preference values themselves. That is, each user's preferences are sorted and then assign a rank as their + * preference value, with 1 being assigned to the least preferred item. + * </p> + */ +public final class SpearmanCorrelationSimilarity implements UserSimilarity { + + private final DataModel dataModel; + + public SpearmanCorrelationSimilarity(DataModel dataModel) { + this.dataModel = Preconditions.checkNotNull(dataModel); + } + + @Override + public double userSimilarity(long userID1, long userID2) throws TasteException { + PreferenceArray xPrefs = dataModel.getPreferencesFromUser(userID1); + PreferenceArray yPrefs = dataModel.getPreferencesFromUser(userID2); + int xLength = xPrefs.length(); + int yLength = yPrefs.length(); + + if (xLength <= 1 || yLength <= 1) { + return Double.NaN; + } + + // Copy prefs since we need to modify pref values to ranks + xPrefs = xPrefs.clone(); + yPrefs = yPrefs.clone(); + + // First sort by values from low to high + xPrefs.sortByValue(); + yPrefs.sortByValue(); + + // Assign ranks from low to high + float nextRank = 1.0f; + for (int i = 0; i < xLength; i++) { + // ... but only for items that are common to both pref arrays + if (yPrefs.hasPrefWithItemID(xPrefs.getItemID(i))) { + xPrefs.setValue(i, nextRank); + nextRank += 1.0f; + } + // Other values are bogus but don't matter + } + nextRank = 1.0f; + for (int i = 0; i < yLength; i++) { + if (xPrefs.hasPrefWithItemID(yPrefs.getItemID(i))) { + yPrefs.setValue(i, nextRank); + nextRank += 1.0f; + } + } + + xPrefs.sortByItem(); + yPrefs.sortByItem(); + + long xIndex = xPrefs.getItemID(0); + long yIndex = yPrefs.getItemID(0); + int xPrefIndex = 0; + int yPrefIndex = 0; + + double sumXYRankDiff2 = 0.0; + int count = 0; + + while (true) { + int compare = xIndex < yIndex ? -1 : xIndex > yIndex ? 1 : 0; + if (compare == 0) { + double diff = xPrefs.getValue(xPrefIndex) - yPrefs.getValue(yPrefIndex); + sumXYRankDiff2 += diff * diff; + count++; + } + if (compare <= 0) { + if (++xPrefIndex >= xLength) { + break; + } + xIndex = xPrefs.getItemID(xPrefIndex); + } + if (compare >= 0) { + if (++yPrefIndex >= yLength) { + break; + } + yIndex = yPrefs.getItemID(yPrefIndex); + } + } + + if (count <= 1) { + return Double.NaN; + } + + // When ranks are unique, this formula actually gives the Pearson correlation + return 1.0 - 6.0 * sumXYRankDiff2 / (count * (count * count - 1)); + } + + @Override + public void setPreferenceInferrer(PreferenceInferrer inferrer) { + throw new UnsupportedOperationException(); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); + RefreshHelper.maybeRefresh(alreadyRefreshed, dataModel); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java new file mode 100644 index 0000000..0c3a0a4 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/TanimotoCoefficientSimilarity.java @@ -0,0 +1,126 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import java.util.Collection; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.PreferenceInferrer; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +/** + * <p> + * An implementation of a "similarity" based on the <a + * href="http://en.wikipedia.org/wiki/Jaccard_index#Tanimoto_coefficient_.28extended_Jaccard_coefficient.29"> + * Tanimoto coefficient</a>, or extended <a href="http://en.wikipedia.org/wiki/Jaccard_index">Jaccard + * coefficient</a>. + * </p> + * + * <p> + * This is intended for "binary" data sets where a user either expresses a generic "yes" preference for an + * item or has no preference. The actual preference values do not matter here, only their presence or absence. + * </p> + * + * <p> + * The value returned is in [0,1]. + * </p> + */ +public final class TanimotoCoefficientSimilarity extends AbstractItemSimilarity implements UserSimilarity { + + public TanimotoCoefficientSimilarity(DataModel dataModel) { + super(dataModel); + } + + /** + * @throws UnsupportedOperationException + */ + @Override + public void setPreferenceInferrer(PreferenceInferrer inferrer) { + throw new UnsupportedOperationException(); + } + + @Override + public double userSimilarity(long userID1, long userID2) throws TasteException { + + DataModel dataModel = getDataModel(); + FastIDSet xPrefs = dataModel.getItemIDsFromUser(userID1); + FastIDSet yPrefs = dataModel.getItemIDsFromUser(userID2); + + int xPrefsSize = xPrefs.size(); + int yPrefsSize = yPrefs.size(); + if (xPrefsSize == 0 && yPrefsSize == 0) { + return Double.NaN; + } + if (xPrefsSize == 0 || yPrefsSize == 0) { + return 0.0; + } + + int intersectionSize = + xPrefsSize < yPrefsSize ? yPrefs.intersectionSize(xPrefs) : xPrefs.intersectionSize(yPrefs); + if (intersectionSize == 0) { + return Double.NaN; + } + + int unionSize = xPrefsSize + yPrefsSize - intersectionSize; + + return (double) intersectionSize / (double) unionSize; + } + + @Override + public double itemSimilarity(long itemID1, long itemID2) throws TasteException { + int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1); + return doItemSimilarity(itemID1, itemID2, preferring1); + } + + @Override + public double[] itemSimilarities(long itemID1, long[] itemID2s) throws TasteException { + int preferring1 = getDataModel().getNumUsersWithPreferenceFor(itemID1); + int length = itemID2s.length; + double[] result = new double[length]; + for (int i = 0; i < length; i++) { + result[i] = doItemSimilarity(itemID1, itemID2s[i], preferring1); + } + return result; + } + + private double doItemSimilarity(long itemID1, long itemID2, int preferring1) throws TasteException { + DataModel dataModel = getDataModel(); + int preferring1and2 = dataModel.getNumUsersWithPreferenceFor(itemID1, itemID2); + if (preferring1and2 == 0) { + return Double.NaN; + } + int preferring2 = dataModel.getNumUsersWithPreferenceFor(itemID2); + return (double) preferring1and2 / (double) (preferring1 + preferring2 - preferring1and2); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + alreadyRefreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); + RefreshHelper.maybeRefresh(alreadyRefreshed, getDataModel()); + } + + @Override + public String toString() { + return "TanimotoCoefficientSimilarity[dataModel:" + getDataModel() + ']'; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java new file mode 100644 index 0000000..6260606 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/UncenteredCosineSimilarity.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.common.Weighting; +import org.apache.mahout.cf.taste.model.DataModel; + +import com.google.common.base.Preconditions; + +/** + * <p> + * An implementation of the cosine similarity. The result is the cosine of the angle formed between + * the two preference vectors. + * </p> + * + * <p> + * Note that this similarity does not "center" its data, shifts the user's preference values so that each of their + * means is 0. For this behavior, use {@link PearsonCorrelationSimilarity}, which actually is mathematically + * equivalent for centered data. + * </p> + */ +public final class UncenteredCosineSimilarity extends AbstractSimilarity { + + /** + * @throws IllegalArgumentException if {@link DataModel} does not have preference values + */ + public UncenteredCosineSimilarity(DataModel dataModel) throws TasteException { + this(dataModel, Weighting.UNWEIGHTED); + } + + /** + * @throws IllegalArgumentException if {@link DataModel} does not have preference values + */ + public UncenteredCosineSimilarity(DataModel dataModel, Weighting weighting) throws TasteException { + super(dataModel, weighting, false); + Preconditions.checkArgument(dataModel.hasPreferenceValues(), "DataModel doesn't have preference values"); + } + + @Override + double computeResult(int n, double sumXY, double sumX2, double sumY2, double sumXYdiff2) { + if (n == 0) { + return Double.NaN; + } + double denominator = Math.sqrt(sumX2) * Math.sqrt(sumY2); + if (denominator == 0.0) { + // One or both parties has -all- the same ratings; + // can't really say much similarity under this measure + return Double.NaN; + } + return sumXY / denominator; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java new file mode 100644 index 0000000..1ae45c2 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterable.java @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity.file; + +import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; + +/** + * {@link Iterable} to be able to read a file linewise into a {@link GenericItemSimilarity} + */ +final class FileItemItemSimilarityIterable implements Iterable<GenericItemSimilarity.ItemItemSimilarity> { + + private final File similaritiesFile; + + FileItemItemSimilarityIterable(File similaritiesFile) { + this.similaritiesFile = similaritiesFile; + } + + @Override + public Iterator<GenericItemSimilarity.ItemItemSimilarity> iterator() { + try { + return new FileItemItemSimilarityIterator(similaritiesFile); + } catch (IOException ioe) { + throw new IllegalStateException("Can't read " + similaritiesFile, ioe); + } + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/5eda9e1f/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java new file mode 100644 index 0000000..c071159 --- /dev/null +++ b/community/mahout-mr/src/main/java/org/apache/mahout/cf/taste/impl/similarity/file/FileItemItemSimilarityIterator.java @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.similarity.file; + +import com.google.common.base.Function; +import com.google.common.collect.ForwardingIterator; +import com.google.common.collect.Iterators; +import org.apache.mahout.cf.taste.impl.similarity.GenericItemSimilarity; +import org.apache.mahout.common.iterator.FileLineIterator; + +import java.io.File; +import java.io.IOException; +import java.util.Iterator; +import java.util.regex.Pattern; + +/** + * a simple iterator using a {@link FileLineIterator} internally, parsing each + * line into an {@link GenericItemSimilarity.ItemItemSimilarity}. + */ +final class FileItemItemSimilarityIterator extends ForwardingIterator<GenericItemSimilarity.ItemItemSimilarity> { + + private static final Pattern SEPARATOR = Pattern.compile("[,\t]"); + + private final Iterator<GenericItemSimilarity.ItemItemSimilarity> delegate; + + FileItemItemSimilarityIterator(File similaritiesFile) throws IOException { + delegate = Iterators.transform( + new FileLineIterator(similaritiesFile), + new Function<String, GenericItemSimilarity.ItemItemSimilarity>() { + @Override + public GenericItemSimilarity.ItemItemSimilarity apply(String from) { + String[] tokens = SEPARATOR.split(from); + return new GenericItemSimilarity.ItemItemSimilarity(Long.parseLong(tokens[0]), + Long.parseLong(tokens[1]), + Double.parseDouble(tokens[2])); + } + }); + } + + @Override + protected Iterator<GenericItemSimilarity.ItemItemSimilarity> delegate() { + return delegate; + } + +}
