http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java new file mode 100644 index 0000000..3463ff5 --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MemoryIDMigrator.java @@ -0,0 +1,55 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.model; + +import org.apache.mahout.cf.taste.impl.common.FastByIDMap; +import org.apache.mahout.cf.taste.model.UpdatableIDMigrator; + +/** + * Implementation which stores the reverse long-to-String mapping in memory. 
+ */ +public final class MemoryIDMigrator extends AbstractIDMigrator implements UpdatableIDMigrator { + + private final FastByIDMap<String> longToString; + + public MemoryIDMigrator() { + this.longToString = new FastByIDMap<>(100); + } + + @Override + public void storeMapping(long longID, String stringID) { + synchronized (longToString) { + longToString.put(longID, stringID); + } + } + + @Override + public String toStringID(long longID) { + synchronized (longToString) { + return longToString.get(longID); + } + } + + @Override + public void initialize(Iterable<String> stringIDs) { + for (String stringID : stringIDs) { + storeMapping(toLongID(stringID), stringID); + } + } + +}
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java new file mode 100644 index 0000000..b134598 --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/MySQLJDBCIDMigrator.java @@ -0,0 +1,67 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.model; + +import javax.sql.DataSource; + +/** + * <p> + * An implementation for MySQL. 
The following statement would create a table suitable for use with this class: + * </p> + * + * <p> + * + * <pre> + * CREATE TABLE taste_id_migration ( + * long_id BIGINT NOT NULL PRIMARY KEY, + * string_id VARCHAR(255) NOT NULL UNIQUE + * ) + * </pre> + * + * </p> + * + * <p> + * Separately, note that in a MySQL database, the following function calls will convert a string value into a + * numeric value in the same way that the standard implementations in this package do. This may be useful in + * writing SQL statements for use with + * {@code AbstractJDBCDataModel} subclasses which convert string + * column values to appropriate numeric values -- though this should be viewed as a temporary arrangement + * since it will impact performance: + * </p> + * + * <p> + * {@code cast(conv(substring(md5([column name]), 1, 16),16,10) as signed)} + * </p> + */ +public final class MySQLJDBCIDMigrator extends AbstractJDBCIDMigrator { + + public MySQLJDBCIDMigrator(DataSource dataSource) { + this(dataSource, DEFAULT_MAPPING_TABLE, + DEFAULT_LONG_ID_COLUMN, DEFAULT_STRING_ID_COLUMN); + } + + public MySQLJDBCIDMigrator(DataSource dataSource, + String mappingTable, + String longIDColumn, + String stringIDColumn) { + super(dataSource, + "SELECT " + stringIDColumn + " FROM " + mappingTable + " WHERE " + longIDColumn + "=?", + "INSERT IGNORE INTO " + mappingTable + " (" + longIDColumn + ',' + stringIDColumn + ") VALUES (?,?)"); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java new file mode 100644 index 0000000..c97a545 --- /dev/null +++ 
b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousConcurrentUserDataModel.java @@ -0,0 +1,352 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.model; + +import com.google.common.base.Preconditions; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.ConcurrentLinkedQueue; + +import com.google.common.collect.Lists; +import org.apache.mahout.cf.taste.common.NoSuchItemException; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.Preference; +import org.apache.mahout.cf.taste.model.PreferenceArray; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * <p> + * This is a special thread-safe version of {@link PlusAnonymousUserDataModel} + * which allow multiple concurrent anonymous requests. 
+ * </p> + * + * <p> + * To use it, you have to estimate the number of concurrent anonymous users of your application. + * A pool of users of the given size will be created. For each anonymous recommendation request, + * a user has to be taken from the pool and returned to it immediately afterwards. + * </p> + * + * <p> + * If no more users are available in the pool, anonymous recommendations cannot be produced. + * </p> + * + * Setup: + * <pre> + * int concurrentUsers = 100; + * DataModel realModel = ...; + * PlusAnonymousConcurrentUserDataModel plusModel = + * new PlusAnonymousConcurrentUserDataModel(realModel, concurrentUsers); + * Recommender recommender = ...; + * </pre> + * + * Real-time recommendation: + * <pre> + * PlusAnonymousConcurrentUserDataModel plusModel = + * (PlusAnonymousConcurrentUserDataModel) recommender.getDataModel(); + * + * // Take the next available anonymous user from the pool + * Long anonymousUserID = plusModel.takeAvailableUser(); + * + * PreferenceArray tempPrefs = ...; 
+ * tempPrefs.setUserID(0, anonymousUserID); + * tempPrefs.setItemID(0, itemID); + * plusModel.setTempPrefs(tempPrefs, anonymousUserID); + * + * // Produce recommendations + * recommender.recommend(anonymousUserID, howMany); + * + * // It is very IMPORTANT to release user back to the pool + * plusModel.releaseUser(anonymousUserID); + * </pre> + * + * </p> + */ +public final class PlusAnonymousConcurrentUserDataModel extends PlusAnonymousUserDataModel { + + /** Preferences for all anonymous users */ + private final Map<Long,PreferenceArray> tempPrefs; + /** Item IDs set for all anonymous users */ + private final Map<Long,FastIDSet> prefItemIDs; + /** Pool of the users (FIFO) */ + private Queue<Long> usersPool; + + private static final Logger log = LoggerFactory.getLogger(PlusAnonymousUserDataModel.class); + + /** + * @param delegate Real model where anonymous users will be added to + * @param maxConcurrentUsers Maximum allowed number of concurrent anonymous users + */ + public PlusAnonymousConcurrentUserDataModel(DataModel delegate, int maxConcurrentUsers) { + super(delegate); + + tempPrefs = new ConcurrentHashMap<>(); + prefItemIDs = new ConcurrentHashMap<>(); + + initializeUsersPools(maxConcurrentUsers); + } + + /** + * Initialize the pool of concurrent anonymous users. + * + * @param usersPoolSize Maximum allowed number of concurrent anonymous user. Depends on the consumer system. + */ + private void initializeUsersPools(int usersPoolSize) { + usersPool = new ConcurrentLinkedQueue<>(); + for (int i = 0; i < usersPoolSize; i++) { + usersPool.add(TEMP_USER_ID + i); + } + } + + /** + * Take the next available concurrent anonymous users from the pool. + * + * @return User ID or null if no more users are available + */ + public Long takeAvailableUser() { + Long takenUserID = usersPool.poll(); + if (takenUserID != null) { + // Initialize the preferences array to indicate that the user is taken. 
+ tempPrefs.put(takenUserID, new GenericUserPreferenceArray(0)); + return takenUserID; + } + return null; + } + + /** + * Release previously taken anonymous user and return it to the pool. + * + * @param userID ID of a previously taken anonymous user + * @return true if the user was previously taken, false otherwise + */ + public boolean releaseUser(Long userID) { + if (tempPrefs.containsKey(userID)) { + this.clearTempPrefs(userID); + // Return previously taken user to the pool + usersPool.offer(userID); + return true; + } + return false; + } + + /** + * Checks whether a given user is a valid previously acquired anonymous user. + */ + private boolean isAnonymousUser(long userID) { + return tempPrefs.containsKey(userID); + } + + /** + * Sets temporary preferences for a given anonymous user. + */ + public void setTempPrefs(PreferenceArray prefs, long anonymousUserID) { + Preconditions.checkArgument(prefs != null && prefs.length() > 0, "prefs is null or empty"); + + this.tempPrefs.put(anonymousUserID, prefs); + FastIDSet userPrefItemIDs = new FastIDSet(); + + for (int i = 0; i < prefs.length(); i++) { + userPrefItemIDs.add(prefs.getItemID(i)); + } + + this.prefItemIDs.put(anonymousUserID, userPrefItemIDs); + } + + /** + * Clears temporary preferences for a given anonymous user. + */ + public void clearTempPrefs(long anonymousUserID) { + this.tempPrefs.remove(anonymousUserID); + this.prefItemIDs.remove(anonymousUserID); + } + + @Override + public LongPrimitiveIterator getUserIDs() throws TasteException { + // Anonymous users have short lifetime and should not be included into the neighbohoods of the real users. + // Thus exclude them from the universe. 
+ return getDelegate().getUserIDs(); + } + + @Override + public PreferenceArray getPreferencesFromUser(long userID) throws TasteException { + if (isAnonymousUser(userID)) { + return tempPrefs.get(userID); + } + return getDelegate().getPreferencesFromUser(userID); + } + + @Override + public FastIDSet getItemIDsFromUser(long userID) throws TasteException { + if (isAnonymousUser(userID)) { + return prefItemIDs.get(userID); + } + return getDelegate().getItemIDsFromUser(userID); + } + + @Override + public PreferenceArray getPreferencesForItem(long itemID) throws TasteException { + if (tempPrefs.isEmpty()) { + return getDelegate().getPreferencesForItem(itemID); + } + + PreferenceArray delegatePrefs = null; + + try { + delegatePrefs = getDelegate().getPreferencesForItem(itemID); + } catch (NoSuchItemException nsie) { + // OK. Probably an item that only the anonymous user has + if (log.isDebugEnabled()) { + log.debug("Item {} unknown", itemID); + } + } + + List<Preference> anonymousPreferences = Lists.newArrayList(); + + for (Map.Entry<Long, PreferenceArray> prefsMap : tempPrefs.entrySet()) { + PreferenceArray singleUserTempPrefs = prefsMap.getValue(); + for (int i = 0; i < singleUserTempPrefs.length(); i++) { + if (singleUserTempPrefs.getItemID(i) == itemID) { + anonymousPreferences.add(singleUserTempPrefs.get(i)); + } + } + } + + int delegateLength = delegatePrefs == null ? 
0 : delegatePrefs.length(); + int anonymousPrefsLength = anonymousPreferences.size(); + int prefsCounter = 0; + + // Merge the delegate and anonymous preferences into a single array + PreferenceArray newPreferenceArray = new GenericItemPreferenceArray(delegateLength + anonymousPrefsLength); + + for (int i = 0; i < delegateLength; i++) { + newPreferenceArray.set(prefsCounter++, delegatePrefs.get(i)); + } + + for (Preference anonymousPreference : anonymousPreferences) { + newPreferenceArray.set(prefsCounter++, anonymousPreference); + } + + if (newPreferenceArray.length() == 0) { + // No, didn't find it among the anonymous user prefs + throw new NoSuchItemException(itemID); + } + + return newPreferenceArray; + } + + @Override + public Float getPreferenceValue(long userID, long itemID) throws TasteException { + if (isAnonymousUser(userID)) { + PreferenceArray singleUserTempPrefs = tempPrefs.get(userID); + for (int i = 0; i < singleUserTempPrefs.length(); i++) { + if (singleUserTempPrefs.getItemID(i) == itemID) { + return singleUserTempPrefs.getValue(i); + } + } + return null; + } + return getDelegate().getPreferenceValue(userID, itemID); + } + + @Override + public Long getPreferenceTime(long userID, long itemID) throws TasteException { + if (isAnonymousUser(userID)) { + // Timestamps are not saved for anonymous preferences + return null; + } + return getDelegate().getPreferenceTime(userID, itemID); + } + + @Override + public int getNumUsers() throws TasteException { + // Anonymous users have short lifetime and should not be included into the neighbohoods of the real users. + // Thus exclude them from the universe. 
+ return getDelegate().getNumUsers(); + } + + @Override + public int getNumUsersWithPreferenceFor(long itemID) throws TasteException { + if (tempPrefs.isEmpty()) { + return getDelegate().getNumUsersWithPreferenceFor(itemID); + } + + int countAnonymousUsersWithPreferenceFor = 0; + + for (Map.Entry<Long, PreferenceArray> singleUserTempPrefs : tempPrefs.entrySet()) { + for (int i = 0; i < singleUserTempPrefs.getValue().length(); i++) { + if (singleUserTempPrefs.getValue().getItemID(i) == itemID) { + countAnonymousUsersWithPreferenceFor++; + break; + } + } + } + return getDelegate().getNumUsersWithPreferenceFor(itemID) + countAnonymousUsersWithPreferenceFor; + } + + @Override + public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException { + if (tempPrefs.isEmpty()) { + return getDelegate().getNumUsersWithPreferenceFor(itemID1, itemID2); + } + + int countAnonymousUsersWithPreferenceFor = 0; + + for (Map.Entry<Long, PreferenceArray> singleUserTempPrefs : tempPrefs.entrySet()) { + boolean found1 = false; + boolean found2 = false; + for (int i = 0; i < singleUserTempPrefs.getValue().length() && !(found1 && found2); i++) { + long itemID = singleUserTempPrefs.getValue().getItemID(i); + if (itemID == itemID1) { + found1 = true; + } + if (itemID == itemID2) { + found2 = true; + } + } + + if (found1 && found2) { + countAnonymousUsersWithPreferenceFor++; + } + } + + return getDelegate().getNumUsersWithPreferenceFor(itemID1, itemID2) + countAnonymousUsersWithPreferenceFor; + } + + @Override + public void setPreference(long userID, long itemID, float value) throws TasteException { + if (isAnonymousUser(userID)) { + throw new UnsupportedOperationException(); + } + getDelegate().setPreference(userID, itemID, value); + } + + @Override + public void removePreference(long userID, long itemID) throws TasteException { + if (isAnonymousUser(userID)) { + throw new UnsupportedOperationException(); + } + getDelegate().removePreference(userID, itemID); + } +} 
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java new file mode 100644 index 0000000..546349b --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserDataModel.java @@ -0,0 +1,320 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.cf.taste.impl.model; + +import java.util.Collection; + +import org.apache.mahout.cf.taste.common.NoSuchItemException; +import org.apache.mahout.cf.taste.common.NoSuchUserException; +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.PreferenceArray; + +import com.google.common.base.Preconditions; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * <p> + * This {@link DataModel} decorator class is useful in a situation where you wish to recommend to a user that + * doesn't really exist yet in your actual {@link DataModel}. For example maybe you wish to recommend DVDs to + * a user who has browsed a few titles on your DVD store site, but, the user is not yet registered. + * </p> + * + * <p> + * This enables you to temporarily add a temporary user to an existing {@link DataModel} in a way that + * recommenders can then produce recommendations anyway. To do so, wrap your real implementation in this + * class: + * </p> + * + * <p> + * + * <pre> + * DataModel realModel = ...; + * DataModel plusModel = new PlusAnonymousUserDataModel(realModel); + * ... + * ItemSimilarity similarity = new LogLikelihoodSimilarity(realModel); // not plusModel + * </pre> + * + * </p> + * + * <p> + * But, you may continue to use {@code realModel} as input to other components. To recommend, first construct and + * set the temporary user information on the model and then simply call the recommender. The + * {@code synchronized} block exists to remind you that this is of course not thread-safe. Only one set + * of temp data can be inserted into the model and used at one time. + * </p> + * + * <p> + * + * <pre> + * Recommender recommender = ...; + * ... 
+ * synchronized(...) { + * PreferenceArray tempPrefs = ...; + * plusModel.setTempPrefs(tempPrefs); + * recommender.recommend(PlusAnonymousUserDataModel.TEMP_USER_ID, 10); + * plusModel.setTempPrefs(null); + * } + * </pre> + * + * </p> + */ +public class PlusAnonymousUserDataModel implements DataModel { + + public static final long TEMP_USER_ID = Long.MIN_VALUE; + + private final DataModel delegate; + private PreferenceArray tempPrefs; + private final FastIDSet prefItemIDs; + + private static final Logger log = LoggerFactory.getLogger(PlusAnonymousUserDataModel.class); + + public PlusAnonymousUserDataModel(DataModel delegate) { + this.delegate = delegate; + this.prefItemIDs = new FastIDSet(); + } + + protected DataModel getDelegate() { + return delegate; + } + + public void setTempPrefs(PreferenceArray prefs) { + Preconditions.checkArgument(prefs != null && prefs.length() > 0, "prefs is null or empty"); + this.tempPrefs = prefs; + this.prefItemIDs.clear(); + for (int i = 0; i < prefs.length(); i++) { + this.prefItemIDs.add(prefs.getItemID(i)); + } + } + + public void clearTempPrefs() { + tempPrefs = null; + prefItemIDs.clear(); + } + + @Override + public LongPrimitiveIterator getUserIDs() throws TasteException { + if (tempPrefs == null) { + return delegate.getUserIDs(); + } + return new PlusAnonymousUserLongPrimitiveIterator(delegate.getUserIDs(), TEMP_USER_ID); + } + + @Override + public PreferenceArray getPreferencesFromUser(long userID) throws TasteException { + if (userID == TEMP_USER_ID) { + if (tempPrefs == null) { + throw new NoSuchUserException(TEMP_USER_ID); + } + return tempPrefs; + } + return delegate.getPreferencesFromUser(userID); + } + + @Override + public FastIDSet getItemIDsFromUser(long userID) throws TasteException { + if (userID == TEMP_USER_ID) { + if (tempPrefs == null) { + throw new NoSuchUserException(TEMP_USER_ID); + } + return prefItemIDs; + } + return delegate.getItemIDsFromUser(userID); + } + + @Override + public LongPrimitiveIterator 
getItemIDs() throws TasteException { + return delegate.getItemIDs(); + // Yeah ignoring items that only the plus-one user knows about... can't really happen + } + + @Override + public PreferenceArray getPreferencesForItem(long itemID) throws TasteException { + if (tempPrefs == null) { + return delegate.getPreferencesForItem(itemID); + } + PreferenceArray delegatePrefs = null; + try { + delegatePrefs = delegate.getPreferencesForItem(itemID); + } catch (NoSuchItemException nsie) { + // OK. Probably an item that only the anonymous user has + if (log.isDebugEnabled()) { + log.debug("Item {} unknown", itemID); + } + } + for (int i = 0; i < tempPrefs.length(); i++) { + if (tempPrefs.getItemID(i) == itemID) { + return cloneAndMergeInto(delegatePrefs, itemID, tempPrefs.getUserID(i), tempPrefs.getValue(i)); + } + } + if (delegatePrefs == null) { + // No, didn't find it among the anonymous user prefs + throw new NoSuchItemException(itemID); + } + return delegatePrefs; + } + + private static PreferenceArray cloneAndMergeInto(PreferenceArray delegatePrefs, + long itemID, + long newUserID, + float value) { + + int length = delegatePrefs == null ? 
0 : delegatePrefs.length(); + int newLength = length + 1; + PreferenceArray newPreferenceArray = new GenericItemPreferenceArray(newLength); + + // Set item ID once + newPreferenceArray.setItemID(0, itemID); + + int positionToInsert = 0; + while (positionToInsert < length && newUserID > delegatePrefs.getUserID(positionToInsert)) { + positionToInsert++; + } + + for (int i = 0; i < positionToInsert; i++) { + newPreferenceArray.setUserID(i, delegatePrefs.getUserID(i)); + newPreferenceArray.setValue(i, delegatePrefs.getValue(i)); + } + newPreferenceArray.setUserID(positionToInsert, newUserID); + newPreferenceArray.setValue(positionToInsert, value); + for (int i = positionToInsert + 1; i < newLength; i++) { + newPreferenceArray.setUserID(i, delegatePrefs.getUserID(i - 1)); + newPreferenceArray.setValue(i, delegatePrefs.getValue(i - 1)); + } + + return newPreferenceArray; + } + + @Override + public Float getPreferenceValue(long userID, long itemID) throws TasteException { + if (userID == TEMP_USER_ID) { + if (tempPrefs == null) { + throw new NoSuchUserException(TEMP_USER_ID); + } + for (int i = 0; i < tempPrefs.length(); i++) { + if (tempPrefs.getItemID(i) == itemID) { + return tempPrefs.getValue(i); + } + } + return null; + } + return delegate.getPreferenceValue(userID, itemID); + } + + @Override + public Long getPreferenceTime(long userID, long itemID) throws TasteException { + if (userID == TEMP_USER_ID) { + if (tempPrefs == null) { + throw new NoSuchUserException(TEMP_USER_ID); + } + return null; + } + return delegate.getPreferenceTime(userID, itemID); + } + + @Override + public int getNumItems() throws TasteException { + return delegate.getNumItems(); + } + + @Override + public int getNumUsers() throws TasteException { + return delegate.getNumUsers() + (tempPrefs == null ? 
0 : 1); + } + + @Override + public int getNumUsersWithPreferenceFor(long itemID) throws TasteException { + if (tempPrefs == null) { + return delegate.getNumUsersWithPreferenceFor(itemID); + } + boolean found = false; + for (int i = 0; i < tempPrefs.length(); i++) { + if (tempPrefs.getItemID(i) == itemID) { + found = true; + break; + } + } + return delegate.getNumUsersWithPreferenceFor(itemID) + (found ? 1 : 0); + } + + @Override + public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException { + if (tempPrefs == null) { + return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2); + } + boolean found1 = false; + boolean found2 = false; + for (int i = 0; i < tempPrefs.length() && !(found1 && found2); i++) { + long itemID = tempPrefs.getItemID(i); + if (itemID == itemID1) { + found1 = true; + } + if (itemID == itemID2) { + found2 = true; + } + } + return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2) + (found1 && found2 ? 1 : 0); + } + + @Override + public void setPreference(long userID, long itemID, float value) throws TasteException { + if (userID == TEMP_USER_ID) { + if (tempPrefs == null) { + throw new NoSuchUserException(TEMP_USER_ID); + } + throw new UnsupportedOperationException(); + } + delegate.setPreference(userID, itemID, value); + } + + @Override + public void removePreference(long userID, long itemID) throws TasteException { + if (userID == TEMP_USER_ID) { + if (tempPrefs == null) { + throw new NoSuchUserException(TEMP_USER_ID); + } + throw new UnsupportedOperationException(); + } + delegate.removePreference(userID, itemID); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + delegate.refresh(alreadyRefreshed); + } + + @Override + public boolean hasPreferenceValues() { + return delegate.hasPreferenceValues(); + } + + @Override + public float getMaxPreference() { + return delegate.getMaxPreference(); + } + + @Override + public float getMinPreference() { + return 
delegate.getMinPreference(); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java new file mode 100644 index 0000000..ea4df85 --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/PlusAnonymousUserLongPrimitiveIterator.java @@ -0,0 +1,90 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.cf.taste.impl.model; + +import org.apache.mahout.cf.taste.impl.common.AbstractLongPrimitiveIterator; +import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; + +final class PlusAnonymousUserLongPrimitiveIterator extends AbstractLongPrimitiveIterator { + + private final LongPrimitiveIterator delegate; + private final long extraDatum; + private boolean datumConsumed; + + PlusAnonymousUserLongPrimitiveIterator(LongPrimitiveIterator delegate, long extraDatum) { + this.delegate = delegate; + this.extraDatum = extraDatum; + datumConsumed = false; + } + + @Override + public long nextLong() { + if (datumConsumed) { + return delegate.nextLong(); + } else { + if (delegate.hasNext()) { + long delegateNext = delegate.peek(); + if (extraDatum <= delegateNext) { + datumConsumed = true; + return extraDatum; + } else { + return delegate.next(); + } + } else { + datumConsumed = true; + return extraDatum; + } + } + } + + @Override + public long peek() { + if (datumConsumed) { + return delegate.peek(); + } else { + if (delegate.hasNext()) { + long delegateNext = delegate.peek(); + if (extraDatum <= delegateNext) { + return extraDatum; + } else { + return delegateNext; + } + } else { + return extraDatum; + } + } + } + + @Override + public boolean hasNext() { + return !datumConsumed || delegate.hasNext(); + } + + @Override + public void remove() { + throw new UnsupportedOperationException(); + } + + @Override + public void skip(int n) { + for (int i = 0; i < n; i++) { + nextLong(); + } + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java new file 
mode 100644 index 0000000..0399618 --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java @@ -0,0 +1,758 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.model.file; + +import java.io.File; +import java.io.FileFilter; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.TreeMap; +import java.util.concurrent.locks.ReentrantLock; + +import com.google.common.base.Preconditions; +import com.google.common.base.Splitter; +import com.google.common.io.Closeables; +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastByIDMap; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; +import org.apache.mahout.cf.taste.impl.model.AbstractDataModel; +import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel; +import 
org.apache.mahout.cf.taste.impl.model.GenericDataModel; +import org.apache.mahout.cf.taste.impl.model.GenericPreference; +import org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.Preference; +import org.apache.mahout.cf.taste.model.PreferenceArray; +import org.apache.mahout.common.iterator.FileLineIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * <p> + * A {@link DataModel} backed by a delimited file. This class expects a file where each line + * contains a user ID, followed by item ID, followed by optional preference value, followed by + * optional timestamp. Commas or tabs delimit fields: + * </p> + * + * <p>{@code userID,itemID[,preference[,timestamp]]}</p> + * + * <p> + * Preference value is optional to accommodate applications that have no notion of a + * preference value (that is, the user simply expresses a + * preference for an item, but no degree of preference). + * </p> + * + * <p> + * The preference value is assumed to be parseable as a {@code float}. The user IDs and item IDs are + * parsed as {@code long}s. The timestamp, if present, is assumed to be parseable as a + * {@code long}, though this can be overridden via {@link #readTimestampFromString(String)}. + * The preference value may be empty, to indicate "no preference value", but its field cannot be + * omitted when a timestamp follows. That is, this is legal: + * </p> + * + * <p>{@code 123,456,,129050099059}</p> + * + * <p>But this isn't:</p> + * + * <p>{@code 123,456,129050099059}</p> + * + * <p> + * It is also acceptable for the lines to contain additional fields. Fields beyond the fourth will be ignored. + * An empty line, or one that begins with '#', will be ignored as a comment. + * </p> + * + * <p> + * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file + * has been reloaded very recently already. 
+ * </p> + * + * <p> + * This class will also look for update "delta" files in the same directory, with file names that start the + * same way (up to the first period). These files have the same format, and provide updated data that + * supersedes what is in the main data file. This is a mechanism that allows an application to push updates to + * {@link FileDataModel} without re-copying the entire data file. + * </p> + * + * <p> + * One small format difference exists. Update files must also be able to express deletes. + * This is done by ending with a blank preference value, as in "123,456,". + * </p> + * + * <p> + * Note that it's all-or-nothing -- either every line in the file must omit the preference value, or every + * line must supply one. These cannot be mixed. Put another way, there will always be the same number of + * delimiters on every line of the file! + * </p> + * + * <p> + * This class is not intended for use with very large amounts of data (over, say, tens of millions of rows). + * For that, a JDBC-backed {@link DataModel} and a database are more appropriate. + * </p> + * + * <p> + * It is possible and likely useful to subclass this class and customize its behavior to accommodate + * application-specific needs and input formats. See {@link #processLine(String, FastByIDMap, FastByIDMap, boolean)} and + * {@link #processLineWithoutID(String, FastByIDMap, FastByIDMap)}. + * </p> + */ +public class FileDataModel extends AbstractDataModel { + + private static final Logger log = LoggerFactory.getLogger(FileDataModel.class); + + public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute? 
+ private static final char COMMENT_CHAR = '#'; + private static final char[] DELIMIETERS = {',', '\t'}; + + private final File dataFile; + private long lastModified; + private long lastUpdateFileModified; + private final transient Splitter delimiterPattern; + private final boolean hasPrefValues; + private DataModel delegate; + private final ReentrantLock reloadLock; + private final boolean transpose; + private final long minReloadIntervalMS; + + /** + * @param dataFile + * file containing preferences data. If the file is compressed (and its name ends in .gz or .zip + * accordingly), it will be decompressed as it is read + * @throws FileNotFoundException + * if dataFile does not exist + * @throws IOException + * if file can't be read + */ + public FileDataModel(File dataFile) throws IOException { + this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS); + } + + /** + * @param delimiterRegex If your data file doesn't use '\t' or ',' as the delimiter, you can specify + * a custom regex pattern. + */ + public FileDataModel(File dataFile, String delimiterRegex) throws IOException { + this(dataFile, false, DEFAULT_MIN_RELOAD_INTERVAL_MS, delimiterRegex); + } + + /** + * @param transpose + * transposes user IDs and item IDs -- convenient for 'flipping' the data model this way + * @param minReloadIntervalMS + * the minimum interval in milliseconds after which a full reload of the original datafile is done + * when refresh() is called + * @see #FileDataModel(File) + */ + public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS) throws IOException { + this(dataFile, transpose, minReloadIntervalMS, null); + } + + /** + * @param delimiterRegex If your data file doesn't use '\t' or ',' as delimiters, you can specify + * your own using a regex pattern. 
+ * @throws IOException + */ + public FileDataModel(File dataFile, boolean transpose, long minReloadIntervalMS, String delimiterRegex) + throws IOException { + + this.dataFile = Preconditions.checkNotNull(dataFile.getAbsoluteFile()); + if (!dataFile.exists() || dataFile.isDirectory()) { + throw new FileNotFoundException(dataFile.toString()); + } + Preconditions.checkArgument(dataFile.length() > 0L, "dataFile is empty"); + Preconditions.checkArgument(minReloadIntervalMS >= 0L, "minReloadIntervalMs must be non-negative"); + + log.info("Creating FileDataModel for file {}", dataFile); + + this.lastModified = dataFile.lastModified(); + this.lastUpdateFileModified = readLastUpdateFileModified(); + + FileLineIterator iterator = new FileLineIterator(dataFile, false); + String firstLine = iterator.peek(); + while (firstLine.isEmpty() || firstLine.charAt(0) == COMMENT_CHAR) { + iterator.next(); + firstLine = iterator.peek(); + } + Closeables.close(iterator, true); + + char delimiter; + if (delimiterRegex == null) { + delimiter = determineDelimiter(firstLine); + delimiterPattern = Splitter.on(delimiter); + } else { + delimiter = '\0'; + delimiterPattern = Splitter.onPattern(delimiterRegex); + if (!delimiterPattern.split(firstLine).iterator().hasNext()) { + throw new IllegalArgumentException("Did not find a delimiter(pattern) in first line"); + } + } + List<String> firstLineSplit = new ArrayList<>(); + for (String token : delimiterPattern.split(firstLine)) { + firstLineSplit.add(token); + } + // If preference value exists and isn't empty then the file is specifying pref values + hasPrefValues = firstLineSplit.size() >= 3 && !firstLineSplit.get(2).isEmpty(); + + this.reloadLock = new ReentrantLock(); + this.transpose = transpose; + this.minReloadIntervalMS = minReloadIntervalMS; + + reload(); + } + + public File getDataFile() { + return dataFile; + } + + protected void reload() { + if (reloadLock.tryLock()) { + try { + delegate = buildModel(); + } catch (IOException ioe) { + 
log.warn("Exception while reloading", ioe); + } finally { + reloadLock.unlock(); + } + } + } + + protected DataModel buildModel() throws IOException { + + long newLastModified = dataFile.lastModified(); + long newLastUpdateFileModified = readLastUpdateFileModified(); + + boolean loadFreshData = delegate == null || newLastModified > lastModified + minReloadIntervalMS; + + long oldLastUpdateFileModifieid = lastUpdateFileModified; + lastModified = newLastModified; + lastUpdateFileModified = newLastUpdateFileModified; + + FastByIDMap<FastByIDMap<Long>> timestamps = new FastByIDMap<>(); + + if (hasPrefValues) { + + if (loadFreshData) { + + FastByIDMap<Collection<Preference>> data = new FastByIDMap<>(); + FileLineIterator iterator = new FileLineIterator(dataFile, false); + processFile(iterator, data, timestamps, false); + + for (File updateFile : findUpdateFilesAfter(newLastModified)) { + processFile(new FileLineIterator(updateFile, false), data, timestamps, false); + } + + return new GenericDataModel(GenericDataModel.toDataMap(data, true), timestamps); + + } else { + + FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData(); + + for (File updateFile : findUpdateFilesAfter(Math.max(oldLastUpdateFileModifieid, newLastModified))) { + processFile(new FileLineIterator(updateFile, false), rawData, timestamps, true); + } + + return new GenericDataModel(rawData, timestamps); + + } + + } else { + + if (loadFreshData) { + + FastByIDMap<FastIDSet> data = new FastByIDMap<>(); + FileLineIterator iterator = new FileLineIterator(dataFile, false); + processFileWithoutID(iterator, data, timestamps); + + for (File updateFile : findUpdateFilesAfter(newLastModified)) { + processFileWithoutID(new FileLineIterator(updateFile, false), data, timestamps); + } + + return new GenericBooleanPrefDataModel(data, timestamps); + + } else { + + FastByIDMap<FastIDSet> rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData(); + + for (File updateFile : 
findUpdateFilesAfter(Math.max(oldLastUpdateFileModifieid, newLastModified))) { + processFileWithoutID(new FileLineIterator(updateFile, false), rawData, timestamps); + } + + return new GenericBooleanPrefDataModel(rawData, timestamps); + + } + + } + } + + /** + * Finds update delta files in the same directory as the data file. This finds any file whose name starts + * the same way as the data file (up to first period) but isn't the data file itself. For example, if the + * data file is /foo/data.txt.gz, you might place update files at /foo/data.1.txt.gz, /foo/data.2.txt.gz, + * etc. + */ + private Iterable<File> findUpdateFilesAfter(long minimumLastModified) { + String dataFileName = dataFile.getName(); + int period = dataFileName.indexOf('.'); + String startName = period < 0 ? dataFileName : dataFileName.substring(0, period); + File parentDir = dataFile.getParentFile(); + Map<Long, File> modTimeToUpdateFile = new TreeMap<>(); + FileFilter onlyFiles = new FileFilter() { + @Override + public boolean accept(File file) { + return !file.isDirectory(); + } + }; + for (File updateFile : parentDir.listFiles(onlyFiles)) { + String updateFileName = updateFile.getName(); + if (updateFileName.startsWith(startName) + && !updateFileName.equals(dataFileName) + && updateFile.lastModified() >= minimumLastModified) { + modTimeToUpdateFile.put(updateFile.lastModified(), updateFile); + } + } + return modTimeToUpdateFile.values(); + } + + private long readLastUpdateFileModified() { + long mostRecentModification = Long.MIN_VALUE; + for (File updateFile : findUpdateFilesAfter(0L)) { + mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified()); + } + return mostRecentModification; + } + + public static char determineDelimiter(String line) { + for (char possibleDelimieter : DELIMIETERS) { + if (line.indexOf(possibleDelimieter) >= 0) { + return possibleDelimieter; + } + } + throw new IllegalArgumentException("Did not find a delimiter in first line"); + } + + 
protected void processFile(FileLineIterator dataOrUpdateFileIterator, + FastByIDMap<?> data, + FastByIDMap<FastByIDMap<Long>> timestamps, + boolean fromPriorData) { + log.info("Reading file info..."); + int count = 0; + while (dataOrUpdateFileIterator.hasNext()) { + String line = dataOrUpdateFileIterator.next(); + if (!line.isEmpty()) { + processLine(line, data, timestamps, fromPriorData); + if (++count % 1000000 == 0) { + log.info("Processed {} lines", count); + } + } + } + log.info("Read lines: {}", count); + } + + /** + * <p> + * Reads one line from the input file and adds the data to a {@link FastByIDMap} data structure which maps user IDs + * to preferences. This assumes that each line of the input file corresponds to one preference. After + * reading a line and determining which user and item the preference pertains to, the method should look to + * see if the data contains a mapping for the user ID already, and if not, add an empty data structure of preferences + * as appropriate to the data. + * </p> + * + * <p> + * Note that if the line is empty or begins with '#' it will be ignored as a comment. + * </p> + * + * @param line + * line from input data file + * @param data + * all data read so far, as a mapping from user IDs to preferences + * @param fromPriorData an implementation detail -- if true, data will map IDs to + * {@link PreferenceArray} since the framework is attempting to read and update raw + * data that is already in memory. Otherwise it maps to {@link Collection}s of + * {@link Preference}s, since it's reading fresh data. Subclasses must be prepared + * to handle this wrinkle. 
+ */ + protected void processLine(String line, + FastByIDMap<?> data, + FastByIDMap<FastByIDMap<Long>> timestamps, + boolean fromPriorData) { + + // Ignore empty lines and comments + if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) { + return; + } + + Iterator<String> tokens = delimiterPattern.split(line).iterator(); + String userIDString = tokens.next(); + String itemIDString = tokens.next(); + String preferenceValueString = tokens.next(); + boolean hasTimestamp = tokens.hasNext(); + String timestampString = hasTimestamp ? tokens.next() : null; + + long userID = readUserIDFromString(userIDString); + long itemID = readItemIDFromString(itemIDString); + + if (transpose) { + long tmp = userID; + userID = itemID; + itemID = tmp; + } + + // This is kind of gross but need to handle two types of storage + Object maybePrefs = data.get(userID); + if (fromPriorData) { + // Data are PreferenceArray + + PreferenceArray prefs = (PreferenceArray) maybePrefs; + if (!hasTimestamp && preferenceValueString.isEmpty()) { + // Then line is of form "userID,itemID,", meaning remove + if (prefs != null) { + boolean exists = false; + int length = prefs.length(); + for (int i = 0; i < length; i++) { + if (prefs.getItemID(i) == itemID) { + exists = true; + break; + } + } + if (exists) { + if (length == 1) { + data.remove(userID); + } else { + PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1); + for (int i = 0, j = 0; i < length; i++, j++) { + if (prefs.getItemID(i) == itemID) { + j--; + } else { + newPrefs.set(j, prefs.get(i)); + } + } + ((FastByIDMap<PreferenceArray>) data).put(userID, newPrefs); + } + } + } + + removeTimestamp(userID, itemID, timestamps); + + } else { + + float preferenceValue = Float.parseFloat(preferenceValueString); + + boolean exists = false; + if (prefs != null) { + for (int i = 0; i < prefs.length(); i++) { + if (prefs.getItemID(i) == itemID) { + exists = true; + prefs.setValue(i, preferenceValue); + break; + } + } + } + + if (!exists) { + if 
(prefs == null) { + prefs = new GenericUserPreferenceArray(1); + } else { + PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1); + for (int i = 0, j = 1; i < prefs.length(); i++, j++) { + newPrefs.set(j, prefs.get(i)); + } + prefs = newPrefs; + } + prefs.setUserID(0, userID); + prefs.setItemID(0, itemID); + prefs.setValue(0, preferenceValue); + ((FastByIDMap<PreferenceArray>) data).put(userID, prefs); + } + } + + addTimestamp(userID, itemID, timestampString, timestamps); + + } else { + // Data are Collection<Preference> + + Collection<Preference> prefs = (Collection<Preference>) maybePrefs; + + if (!hasTimestamp && preferenceValueString.isEmpty()) { + // Then line is of form "userID,itemID,", meaning remove + if (prefs != null) { + // remove pref + Iterator<Preference> prefsIterator = prefs.iterator(); + while (prefsIterator.hasNext()) { + Preference pref = prefsIterator.next(); + if (pref.getItemID() == itemID) { + prefsIterator.remove(); + break; + } + } + } + + removeTimestamp(userID, itemID, timestamps); + + } else { + + float preferenceValue = Float.parseFloat(preferenceValueString); + + boolean exists = false; + if (prefs != null) { + for (Preference pref : prefs) { + if (pref.getItemID() == itemID) { + exists = true; + pref.setValue(preferenceValue); + break; + } + } + } + + if (!exists) { + if (prefs == null) { + prefs = new ArrayList<>(2); + ((FastByIDMap<Collection<Preference>>) data).put(userID, prefs); + } + prefs.add(new GenericPreference(userID, itemID, preferenceValue)); + } + + addTimestamp(userID, itemID, timestampString, timestamps); + + } + + } + } + + protected void processFileWithoutID(FileLineIterator dataOrUpdateFileIterator, + FastByIDMap<FastIDSet> data, + FastByIDMap<FastByIDMap<Long>> timestamps) { + log.info("Reading file info..."); + int count = 0; + while (dataOrUpdateFileIterator.hasNext()) { + String line = dataOrUpdateFileIterator.next(); + if (!line.isEmpty()) { + processLineWithoutID(line, data, 
timestamps); + if (++count % 100000 == 0) { + log.info("Processed {} lines", count); + } + } + } + log.info("Read lines: {}", count); + } + + protected void processLineWithoutID(String line, + FastByIDMap<FastIDSet> data, + FastByIDMap<FastByIDMap<Long>> timestamps) { + + if (line.isEmpty() || line.charAt(0) == COMMENT_CHAR) { + return; + } + + Iterator<String> tokens = delimiterPattern.split(line).iterator(); + String userIDString = tokens.next(); + String itemIDString = tokens.next(); + boolean hasPreference = tokens.hasNext(); + String preferenceValueString = hasPreference ? tokens.next() : ""; + boolean hasTimestamp = tokens.hasNext(); + String timestampString = hasTimestamp ? tokens.next() : null; + + long userID = readUserIDFromString(userIDString); + long itemID = readItemIDFromString(itemIDString); + + if (transpose) { + long tmp = userID; + userID = itemID; + itemID = tmp; + } + + if (hasPreference && !hasTimestamp && preferenceValueString.isEmpty()) { + // Then line is of form "userID,itemID,", meaning remove + + FastIDSet itemIDs = data.get(userID); + if (itemIDs != null) { + itemIDs.remove(itemID); + } + + removeTimestamp(userID, itemID, timestamps); + + } else { + + FastIDSet itemIDs = data.get(userID); + if (itemIDs == null) { + itemIDs = new FastIDSet(2); + data.put(userID, itemIDs); + } + itemIDs.add(itemID); + + addTimestamp(userID, itemID, timestampString, timestamps); + + } + } + + private void addTimestamp(long userID, + long itemID, + String timestampString, + FastByIDMap<FastByIDMap<Long>> timestamps) { + if (timestampString != null) { + FastByIDMap<Long> itemTimestamps = timestamps.get(userID); + if (itemTimestamps == null) { + itemTimestamps = new FastByIDMap<>(); + timestamps.put(userID, itemTimestamps); + } + long timestamp = readTimestampFromString(timestampString); + itemTimestamps.put(itemID, timestamp); + } + } + + private static void removeTimestamp(long userID, + long itemID, + FastByIDMap<FastByIDMap<Long>> timestamps) { + 
FastByIDMap<Long> itemTimestamps = timestamps.get(userID); + if (itemTimestamps != null) { + itemTimestamps.remove(itemID); + } + } + + /** + * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by + * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform + * translation. + */ + protected long readUserIDFromString(String value) { + return Long.parseLong(value); + } + + /** + * Subclasses may wish to override this if ID values in the file are not numeric. This provides a hook by + * which subclasses can inject an {@link org.apache.mahout.cf.taste.model.IDMigrator} to perform + * translation. + */ + protected long readItemIDFromString(String value) { + return Long.parseLong(value); + } + + /** + * Subclasses may wish to override this to change how time values in the input file are parsed. + * By default they are expected to be numeric, expressing a time as milliseconds since the epoch. + */ + protected long readTimestampFromString(String value) { + return Long.parseLong(value); + } + + @Override + public LongPrimitiveIterator getUserIDs() throws TasteException { + return delegate.getUserIDs(); + } + + @Override + public PreferenceArray getPreferencesFromUser(long userID) throws TasteException { + return delegate.getPreferencesFromUser(userID); + } + + @Override + public FastIDSet getItemIDsFromUser(long userID) throws TasteException { + return delegate.getItemIDsFromUser(userID); + } + + @Override + public LongPrimitiveIterator getItemIDs() throws TasteException { + return delegate.getItemIDs(); + } + + @Override + public PreferenceArray getPreferencesForItem(long itemID) throws TasteException { + return delegate.getPreferencesForItem(itemID); + } + + @Override + public Float getPreferenceValue(long userID, long itemID) throws TasteException { + return delegate.getPreferenceValue(userID, itemID); + } + + @Override + public Long getPreferenceTime(long userID, long itemID) throws 
TasteException { + return delegate.getPreferenceTime(userID, itemID); + } + + @Override + public int getNumItems() throws TasteException { + return delegate.getNumItems(); + } + + @Override + public int getNumUsers() throws TasteException { + return delegate.getNumUsers(); + } + + @Override + public int getNumUsersWithPreferenceFor(long itemID) throws TasteException { + return delegate.getNumUsersWithPreferenceFor(itemID); + } + + @Override + public int getNumUsersWithPreferenceFor(long itemID1, long itemID2) throws TasteException { + return delegate.getNumUsersWithPreferenceFor(itemID1, itemID2); + } + + /** + * Note that this method only updates the in-memory preference data that this {@link FileDataModel} + * maintains; it does not modify any data on disk. Therefore any updates from this method are only + * temporary, and lost when data is reloaded from a file. This method should also be considered relatively + * slow. + */ + @Override + public void setPreference(long userID, long itemID, float value) throws TasteException { + delegate.setPreference(userID, itemID, value); + } + + /** See the warning at {@link #setPreference(long, long, float)}. 
*/ + @Override + public void removePreference(long userID, long itemID) throws TasteException { + delegate.removePreference(userID, itemID); + } + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + if (dataFile.lastModified() > lastModified + minReloadIntervalMS + || readLastUpdateFileModified() > lastUpdateFileModified + minReloadIntervalMS) { + log.debug("File has changed; reloading..."); + reload(); + } + } + + @Override + public boolean hasPreferenceValues() { + return delegate.hasPreferenceValues(); + } + + @Override + public float getMaxPreference() { + return delegate.getMaxPreference(); + } + + @Override + public float getMinPreference() { + return delegate.getMinPreference(); + } + + @Override + public String toString() { + return "FileDataModel[dataFile:" + dataFile + ']'; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java new file mode 100644 index 0000000..1bcb4ef --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileIDMigrator.java @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.model.file; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Collection; +import java.util.concurrent.locks.ReentrantLock; +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.impl.common.FastByIDMap; +import org.apache.mahout.cf.taste.impl.model.AbstractIDMigrator; +import org.apache.mahout.common.iterator.FileLineIterable; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import com.google.common.base.Preconditions; + +/** + * <p> + * An {@link org.apache.mahout.cf.taste.model.IDMigrator} backed by a file. + * This class typically expects a file where each line + * contains a single stringID to be stored in this migrator. + * </p> + * + * <p> + * This class will reload data from the data file when {@link #refresh(Collection)} is called, unless the file + * has been reloaded very recently already. + * </p> + */ +public class FileIDMigrator extends AbstractIDMigrator { + + public static final long DEFAULT_MIN_RELOAD_INTERVAL_MS = 60 * 1000L; // 1 minute? 
+ + private final File dataFile; + /** Current long-to-string mapping; replaced wholesale by {@link #reload()}. */ + private FastByIDMap<String> longToString; + private final ReentrantLock reloadLock; + + /** Modification time of the data file as recorded by the most recent load. */ + private long lastModified; + private final long minReloadIntervalMS; + + private static final Logger log = LoggerFactory.getLogger(FileIDMigrator.class); + + /** + * Creates a migrator for the given file using {@link #DEFAULT_MIN_RELOAD_INTERVAL_MS}. + * + * @param dataFile file with one string ID per line + * @throws FileNotFoundException if dataFile does not exist or is a directory + */ + public FileIDMigrator(File dataFile) throws FileNotFoundException { + this(dataFile, DEFAULT_MIN_RELOAD_INTERVAL_MS); + } + + /** + * Creates a migrator for the given file and loads the mapping immediately. + * + * @param dataFile file with one string ID per line + * @param minReloadIntervalMS how much newer, in milliseconds, the file's modification time must be than the + * one recorded at the last load before {@link #refresh(Collection)} reloads it + * @throws NullPointerException if dataFile is {@code null} + * @throws FileNotFoundException if dataFile does not exist or is a directory + * @throws IllegalStateException if reading the file fails with an {@link IOException} + */ + public FileIDMigrator(File dataFile, long minReloadIntervalMS) throws FileNotFoundException { + longToString = new FastByIDMap<>(100); + this.dataFile = Preconditions.checkNotNull(dataFile); + if (!dataFile.exists() || dataFile.isDirectory()) { + throw new FileNotFoundException(dataFile.toString()); + } + + log.info("Creating FileIDMigrator for file {}", dataFile); + + this.reloadLock = new ReentrantLock(); + this.lastModified = dataFile.lastModified(); + this.minReloadIntervalMS = minReloadIntervalMS; + + reload(); + } + + /** Looks up the string ID stored for the given long ID in the most recently loaded mapping. */ + @Override + public String toStringID(long longID) { + return longToString.get(longID); + } + + /** + * Rebuilds the mapping from the data file and swaps it in. If the reload lock cannot be acquired + * immediately, this call does nothing. An {@link IOException} while reading is rethrown wrapped in an + * {@link IllegalStateException}. + */ + private void reload() { + if (reloadLock.tryLock()) { + try { + longToString = buildMapping(); + } catch (IOException ioe) { + throw new IllegalStateException(ioe); + } finally { + reloadLock.unlock(); + } + } + } + + /** + * Reads the data file line by line into a fresh map from {@code toLongID(line)} to the line itself, then + * records the file's current modification time. + */ + private FastByIDMap<String> buildMapping() throws IOException { + FastByIDMap<String> mapping = new FastByIDMap<>(); + for (String line : new FileLineIterable(dataFile)) { + mapping.put(toLongID(line), line); + } + lastModified = dataFile.lastModified(); + return mapping; + } + + /** + * Reloads the mapping when the file's modification time exceeds the one recorded at the last load by more + * than the minimum reload interval. The {@code alreadyRefreshed} argument is not used. + */ + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + if (dataFile.lastModified() > lastModified + minReloadIntervalMS) { + log.debug("File has changed; reloading..."); + reload(); + } + } + + @Override + public String toString() { + return "FileIDMigrator[dataFile:" + dataFile + ']'; + } +} 
http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java new file mode 100644 index 0000000..8d33f60 --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/AbstractUserNeighborhood.java @@ -0,0 +1,71 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.cf.taste.impl.neighborhood; + +import java.util.Collection; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +import com.google.common.base.Preconditions; + +/** + * <p> + * Base class for the user neighborhood implementations in this package. It holds the user similarity, the + * data model and the sampling rate, and forwards refresh requests to both dependencies. + * </p> + */ +abstract class AbstractUserNeighborhood implements UserNeighborhood { + + private final UserSimilarity userSimilarity; + private final DataModel dataModel; + private final double samplingRate; + private final RefreshHelper refreshHelper; + + /** + * @param userSimilarity similarity metric; must not be {@code null} + * @param dataModel underlying data; must not be {@code null} + * @param samplingRate must lie in (0,1]; NaN fails this check + * @throws IllegalArgumentException if any argument violates the constraints above + */ + AbstractUserNeighborhood(UserSimilarity userSimilarity, DataModel dataModel, double samplingRate) { + Preconditions.checkArgument(userSimilarity != null, "userSimilarity is null"); + Preconditions.checkArgument(dataModel != null, "dataModel is null"); + boolean rateInRange = samplingRate > 0.0 && samplingRate <= 1.0; + Preconditions.checkArgument(rateInRange, "samplingRate must be in (0,1]"); + + RefreshHelper helper = new RefreshHelper(null); + helper.addDependency(dataModel); + helper.addDependency(userSimilarity); + + this.userSimilarity = userSimilarity; + this.dataModel = dataModel; + this.samplingRate = samplingRate; + this.refreshHelper = helper; + } + + /** Returns the similarity metric given at construction. */ + final UserSimilarity getUserSimilarity() { + return userSimilarity; + } + + /** Returns the data model given at construction. */ + final DataModel getDataModel() { + return dataModel; + } + + /** Returns the sampling rate given at construction. */ + final double getSamplingRate() { + return samplingRate; + } + + /** Delegates to the {@link RefreshHelper} that tracks the data model and the user similarity. */ + @Override + public final void refresh(Collection<Refreshable> alreadyRefreshed) { + refreshHelper.refresh(alreadyRefreshed); + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java ---------------------------------------------------------------------- diff 
--git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java new file mode 100644 index 0000000..998e476 --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/CachingUserNeighborhood.java @@ -0,0 +1,69 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.neighborhood; + +import java.util.Collection; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.Cache; +import org.apache.mahout.cf.taste.impl.common.RefreshHelper; +import org.apache.mahout.cf.taste.impl.common.Retriever; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.neighborhood.UserNeighborhood; + +import com.google.common.base.Preconditions; + +/** A caching wrapper around an underlying {@link UserNeighborhood} implementation. 
*/ +public final class CachingUserNeighborhood implements UserNeighborhood { + + private final UserNeighborhood neighborhood; + private final Cache<Long,long[]> neighborhoodCache; + + /** + * @param neighborhood the neighborhood implementation whose results are cached + * @param dataModel supplies the user count, which is used as the maximum cache size + * @throws IllegalArgumentException if neighborhood or dataModel is {@code null} + * @throws TasteException if thrown while reading the user count from dataModel + */ + public CachingUserNeighborhood(UserNeighborhood neighborhood, DataModel dataModel) throws TasteException { + Preconditions.checkArgument(neighborhood != null, "neighborhood is null"); + Preconditions.checkArgument(dataModel != null, "dataModel is null"); + this.neighborhood = neighborhood; + int maxCacheSize = dataModel.getNumUsers(); // just a dumb heuristic for sizing + this.neighborhoodCache = new Cache<>(new NeighborhoodRetriever(neighborhood), maxCacheSize); + } + + /** Returns the neighborhood of the given user as served by the cache. */ + @Override + public long[] getUserNeighborhood(long userID) throws TasteException { + return neighborhoodCache.get(userID); + } + + /** + * Clears the cache, then hands the wrapped neighborhood to {@link RefreshHelper#maybeRefresh} together + * with the collection built from {@code alreadyRefreshed}. + */ + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) { + neighborhoodCache.clear(); + Collection<Refreshable> refreshed = RefreshHelper.buildRefreshed(alreadyRefreshed); + RefreshHelper.maybeRefresh(refreshed, neighborhood); + } + + /** Cache loader that asks the wrapped neighborhood for the given user's neighbors. */ + private static final class NeighborhoodRetriever implements Retriever<Long,long[]> { + private final UserNeighborhood neighborhood; + + private NeighborhoodRetriever(UserNeighborhood neighborhood) { + this.neighborhood = neighborhood; + } + + @Override + public long[] get(Long key) throws TasteException { + return neighborhood.getUserNeighborhood(key); + } + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java new file mode 100644 index 0000000..7f3a98a --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/NearestNUserNeighborhood.java 
@@ -0,0 +1,122 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.neighborhood; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; +import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator; +import org.apache.mahout.cf.taste.impl.recommender.TopItems; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +import com.google.common.base.Preconditions; + +/** + * <p> + * Computes a neighborhood consisting of the nearest n users to a given user. "Nearest" is defined by the + * given {@link UserSimilarity}. 
+ * </p> + */ +public final class NearestNUserNeighborhood extends AbstractUserNeighborhood { + + private final int n; + private final double minSimilarity; + + /** + * @param n neighborhood size; capped at the number of users in the data model + * @throws IllegalArgumentException + * if {@code n < 1}, or userSimilarity or dataModel are {@code null} + */ + public NearestNUserNeighborhood(int n, UserSimilarity userSimilarity, DataModel dataModel) throws TasteException { + this(n, Double.NEGATIVE_INFINITY, userSimilarity, dataModel, 1.0); + } + + /** + * @param n neighborhood size; capped at the number of users in the data model + * @param minSimilarity minimal similarity required for neighbors + * @throws IllegalArgumentException + * if {@code n < 1}, or userSimilarity or dataModel are {@code null} + */ + public NearestNUserNeighborhood(int n, + double minSimilarity, + UserSimilarity userSimilarity, + DataModel dataModel) throws TasteException { + this(n, minSimilarity, userSimilarity, dataModel, 1.0); + } + + /** + * @param n neighborhood size; capped at the number of users in the data model + * @param minSimilarity minimal similarity required for neighbors + * @param samplingRate percentage of users to consider when building neighborhood -- decrease to trade quality for + * performance + * @throws IllegalArgumentException + * if {@code n < 1} or samplingRate is NaN or not in (0,1], or userSimilarity or dataModel are + * {@code null} + */ + public NearestNUserNeighborhood(int n, + double minSimilarity, + UserSimilarity userSimilarity, + DataModel dataModel, + double samplingRate) throws TasteException { + super(userSimilarity, dataModel, samplingRate); + Preconditions.checkArgument(n >= 1, "n must be at least 1"); + int numUsers = dataModel.getNumUsers(); + this.n = n > numUsers ? 
numUsers : n; + this.minSimilarity = minSimilarity; + } + + @Override + public long[] getUserNeighborhood(long userID) throws TasteException { + + DataModel dataModel = getDataModel(); + UserSimilarity userSimilarityImpl = getUserSimilarity(); + + TopItems.Estimator<Long> estimator = new Estimator(userSimilarityImpl, userID, minSimilarity); + + LongPrimitiveIterator userIDs = SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel.getUserIDs(), + getSamplingRate()); + + return TopItems.getTopUsers(n, userIDs, null, estimator); + } + + @Override + public String toString() { + return "NearestNUserNeighborhood"; + } + + private static final class Estimator implements TopItems.Estimator<Long> { + private final UserSimilarity userSimilarityImpl; + private final long theUserID; + private final double minSim; + + private Estimator(UserSimilarity userSimilarityImpl, long theUserID, double minSim) { + this.userSimilarityImpl = userSimilarityImpl; + this.theUserID = theUserID; + this.minSim = minSim; + } + + @Override + public double estimate(Long userID) throws TasteException { + if (userID == theUserID) { + return Double.NaN; + } + double sim = userSimilarityImpl.userSimilarity(theUserID, userID); + return sim >= minSim ? 
sim : Double.NaN; + } + } +} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java new file mode 100644 index 0000000..d5246e4 --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/neighborhood/ThresholdUserNeighborhood.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.mahout.cf.taste.impl.neighborhood; + +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; +import org.apache.mahout.cf.taste.impl.common.SamplingLongPrimitiveIterator; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.similarity.UserSimilarity; + +import com.google.common.base.Preconditions; + +/** + * <p> + * Computes a neighborhood consisting of all users whose similarity to the given user meets or exceeds a + * certain threshold. Similarity is defined by the given {@link UserSimilarity}. + * </p> + */ +public final class ThresholdUserNeighborhood extends AbstractUserNeighborhood { + + private final double threshold; + + /** + * @param threshold + * similarity threshold + * @param userSimilarity + * similarity metric + * @param dataModel + * data model + * @throws IllegalArgumentException + * if threshold is {@link Double#NaN}, or if samplingRate is not positive and less than or equal + * to 1.0, or if userSimilarity or dataModel are {@code null} + */ + public ThresholdUserNeighborhood(double threshold, UserSimilarity userSimilarity, DataModel dataModel) { + this(threshold, userSimilarity, dataModel, 1.0); + } + + /** + * @param threshold + * similarity threshold + * @param userSimilarity + * similarity metric + * @param dataModel + * data model + * @param samplingRate + * percentage of users to consider when building neighborhood -- decrease to trade quality for + * performance + * @throws IllegalArgumentException + * if threshold or samplingRate is {@link Double#NaN}, or if samplingRate is not positive and less + * than or equal to 1.0, or if userSimilarity or dataModel are {@code null} + */ + public ThresholdUserNeighborhood(double threshold, + UserSimilarity userSimilarity, + DataModel dataModel, + double samplingRate) { + super(userSimilarity, dataModel, 
samplingRate); + Preconditions.checkArgument(!Double.isNaN(threshold), "threshold must not be NaN"); + this.threshold = threshold; + } + + @Override + public long[] getUserNeighborhood(long userID) throws TasteException { + + DataModel dataModel = getDataModel(); + FastIDSet neighborhood = new FastIDSet(); + LongPrimitiveIterator usersIterable = SamplingLongPrimitiveIterator.maybeWrapIterator(dataModel + .getUserIDs(), getSamplingRate()); + UserSimilarity userSimilarityImpl = getUserSimilarity(); + + while (usersIterable.hasNext()) { + long otherUserID = usersIterable.next(); + if (userID != otherUserID) { + double theSimilarity = userSimilarityImpl.userSimilarity(userID, otherUserID); + if (!Double.isNaN(theSimilarity) && theSimilarity >= threshold) { + neighborhood.add(otherUserID); + } + } + } + + return neighborhood.toArray(); + } + + @Override + public String toString() { + return "ThresholdUserNeighborhood"; + } + +} http://git-wip-us.apache.org/repos/asf/mahout/blob/410ed16a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java ---------------------------------------------------------------------- diff --git a/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java new file mode 100644 index 0000000..d24ea6a --- /dev/null +++ b/community/mahout-mr/mr/src/main/java/org/apache/mahout/cf/taste/impl/recommender/AbstractCandidateItemsStrategy.java @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.mahout.cf.taste.impl.recommender; + +import org.apache.mahout.cf.taste.common.Refreshable; +import org.apache.mahout.cf.taste.common.TasteException; +import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import org.apache.mahout.cf.taste.model.DataModel; +import org.apache.mahout.cf.taste.model.PreferenceArray; +import org.apache.mahout.cf.taste.recommender.CandidateItemsStrategy; +import org.apache.mahout.cf.taste.recommender.MostSimilarItemsCandidateItemsStrategy; + +import java.util.Collection; + +/** + * Abstract base implementation for retrieving candidate items to recommend + */ +public abstract class AbstractCandidateItemsStrategy implements CandidateItemsStrategy, + MostSimilarItemsCandidateItemsStrategy { + + protected FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel) throws TasteException{ + return doGetCandidateItems(preferredItemIDs, dataModel, false); + } + + @Override + public FastIDSet getCandidateItems(long userID, PreferenceArray preferencesFromUser, DataModel dataModel, + boolean includeKnownItems) throws TasteException { + return doGetCandidateItems(preferencesFromUser.getIDs(), dataModel, includeKnownItems); + } + + @Override + public FastIDSet getCandidateItems(long[] itemIDs, DataModel dataModel) + throws TasteException { + return doGetCandidateItems(itemIDs, dataModel, false); + } + + protected abstract 
FastIDSet doGetCandidateItems(long[] preferredItemIDs, DataModel dataModel, + boolean includeKnownItems) throws TasteException; + + @Override + public void refresh(Collection<Refreshable> alreadyRefreshed) {} +}
