Author: srowen Date: Fri Feb 5 18:24:37 2010 New Revision: 907037 URL: http://svn.apache.org/viewvc?rev=907037&view=rev Log: FileDataModel improvements when update files only, not main file have changed -- won't reload main data
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java?rev=907037&r1=907036&r2=907037&view=diff ============================================================================== --- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java (original) +++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericBooleanPrefDataModel.java Fri Feb 5 18:24:37 2010 @@ -124,6 +124,13 @@ return (FastByIDMap<FastIDSet>) (FastByIDMap<?>) data; } + /** + * This is used mostly internally to the framework, and shouldn't be relied upon otherwise. 
+ */ + public FastByIDMap<FastIDSet> getRawUserData() { + return this.preferenceFromUsers; + } + @Override public LongPrimitiveArrayIterator getUserIDs() { return new LongPrimitiveArrayIterator(userIDs); Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java?rev=907037&r1=907036&r2=907037&view=diff ============================================================================== --- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java (original) +++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/GenericDataModel.java Fri Feb 5 18:24:37 2010 @@ -140,6 +140,13 @@ return data; } + /** + * This is used mostly internally to the framework, and shouldn't be relied upon otherwise. + */ + public FastByIDMap<PreferenceArray> getRawUserData() { + return this.preferenceFromUsers; + } + @Override public LongPrimitiveArrayIterator getUserIDs() { return new LongPrimitiveArrayIterator(userIDs); Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java?rev=907037&r1=907036&r2=907037&view=diff ============================================================================== --- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java (original) +++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/cf/taste/impl/model/file/FileDataModel.java Fri Feb 5 18:24:37 2010 @@ -21,6 +21,7 @@ import org.apache.mahout.cf.taste.common.TasteException; import org.apache.mahout.cf.taste.impl.common.FastByIDMap; import org.apache.mahout.cf.taste.impl.common.FastIDSet; +import 
org.apache.mahout.cf.taste.impl.model.GenericUserPreferenceArray; import org.apache.mahout.common.FileLineIterator; import org.apache.mahout.cf.taste.impl.common.LongPrimitiveIterator; import org.apache.mahout.cf.taste.impl.model.GenericBooleanPrefDataModel; @@ -77,8 +78,8 @@ * that, a JDBC-backed {@link DataModel} and a database are more appropriate.</p> * * <p>It is possible and likely useful to subclass this class and customize its behavior to accommodate - * application-specific needs and input formats. See {@link #processLine(String, FastByIDMap, char)} and - * {@link #processLineWithoutID(String, FastByIDMap, char)} + * application-specific needs and input formats. See {@link #processLine(String, FastByIDMap)} and + * {@link #processLineWithoutID(String, FastByIDMap)} */ public class FileDataModel implements DataModel { @@ -89,6 +90,9 @@ private final File dataFile; private long lastModified; + private long lastUpdateFileModified; + private final char delimiter; + private final boolean hasPrefValues; private boolean loaded; private DataModel delegate; private final ReentrantLock reloadLock; @@ -98,12 +102,17 @@ * @param dataFile file containing preferences data.
If file is compressed (and name ends in .gz or .zip accordingly) * it will be decompressed as it is read) * @throws FileNotFoundException if dataFile does not exist + * @throws IOException if file can't be read */ - public FileDataModel(File dataFile) throws FileNotFoundException { + public FileDataModel(File dataFile) throws IOException { this(dataFile, false); } - public FileDataModel(File dataFile, boolean transpose) throws FileNotFoundException { + /** + * @param transpose transposes user IDs and item IDs -- convenient for 'flipping' the data model this way + * @see #FileDataModel(File) + */ + public FileDataModel(File dataFile, boolean transpose) throws IOException { if (dataFile == null) { throw new IllegalArgumentException("dataFile is null"); } @@ -115,6 +124,18 @@ this.dataFile = dataFile.getAbsoluteFile(); this.lastModified = dataFile.lastModified(); + this.lastUpdateFileModified = readLastUpdateFileModified(); + + FileLineIterator iterator = new FileLineIterator(dataFile, false); + String firstLine = iterator.peek(); + while (firstLine.length() == 0 || firstLine.charAt(0) == COMMENT_CHAR) { + iterator.next(); + firstLine = iterator.peek(); + } + iterator.close(); + delimiter = determineDelimiter(firstLine, 2); + hasPrefValues = firstLine.indexOf(delimiter, firstLine.indexOf(delimiter) + 1) >= 0; + this.reloadLock = new ReentrantLock(); this.transpose = transpose; } @@ -123,6 +144,10 @@ return dataFile; } + public char getDelimiter() { + return delimiter; + } + protected void reload() { if (!reloadLock.isLocked()) { reloadLock.lock(); @@ -138,29 +163,67 @@ } protected DataModel buildModel() throws IOException { - FileLineIterator iterator = new FileLineIterator(dataFile, false); - String firstLine = iterator.peek(); - while (firstLine.length() == 0 || firstLine.charAt(0) == COMMENT_CHAR) { - iterator.next(); - firstLine = iterator.peek(); - } - char delimiter = determineDelimiter(firstLine, 2); - boolean hasPrefValues = firstLine.indexOf(delimiter, 
firstLine.indexOf(delimiter) + 1) >= 0; + + long newLastModified = dataFile.lastModified(); + long newLastUpdateFileModified = readLastUpdateFileModified(); + + boolean loadFreshData = delegate == null || newLastModified > lastModified + MIN_RELOAD_INTERVAL_MS; + + lastModified = newLastModified; + lastUpdateFileModified = newLastUpdateFileModified; if (hasPrefValues) { - FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>>(); - processFile(iterator, data, delimiter); - for (File updateFile : findUpdateFiles()) { - processFile(new FileLineIterator(updateFile, false), data, delimiter); + + if (loadFreshData) { + + FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>>(); + FileLineIterator iterator = new FileLineIterator(dataFile, false); + processFile(iterator, data); + + for (File updateFile : findUpdateFiles()) { + processFile(new FileLineIterator(updateFile, false), data); + } + + return new GenericDataModel(GenericDataModel.toDataMap(data, true)); + + } else { + + FastByIDMap<PreferenceArray> rawData = ((GenericDataModel) delegate).getRawUserData(); + + for (File updateFile : findUpdateFiles()) { + processFile(new FileLineIterator(updateFile, false), rawData); + } + + return new GenericDataModel(rawData); + } - return new GenericDataModel(GenericDataModel.toDataMap(data, true)); + } else { - FastByIDMap<FastIDSet> data = new FastByIDMap<FastIDSet>(); - processFileWithoutID(iterator, data, delimiter); - for (File updateFile : findUpdateFiles()) { - processFileWithoutID(new FileLineIterator(updateFile, false), data, delimiter); + + if (loadFreshData) { + + FastByIDMap<FastIDSet> data = new FastByIDMap<FastIDSet>(); + FileLineIterator iterator = new FileLineIterator(dataFile, false); + processFileWithoutID(iterator, data); + + for (File updateFile : findUpdateFiles()) { + processFileWithoutID(new FileLineIterator(updateFile, false), data); + } + + return new GenericBooleanPrefDataModel(data); + + } else { + 
+ FastByIDMap<FastIDSet> rawData = ((GenericBooleanPrefDataModel) delegate).getRawUserData(); + + for (File updateFile : findUpdateFiles()) { + processFileWithoutID(new FileLineIterator(updateFile, false), rawData); + } + + return new GenericBooleanPrefDataModel(rawData); + } - return new GenericBooleanPrefDataModel(data); + } } @@ -185,6 +248,14 @@ return updateFiles; } + private long readLastUpdateFileModified() { + long mostRecentModification = Long.MIN_VALUE; + for (File updateFile : findUpdateFiles()) { + mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified()); + } + return mostRecentModification; + } + public static char determineDelimiter(String line, int maxDelimiters) { char delimiter; if (line.indexOf(',') >= 0) { @@ -212,14 +283,13 @@ } protected void processFile(FileLineIterator dataOrUpdateFileIterator, - FastByIDMap<Collection<Preference>> data, - char delimiter) { + FastByIDMap<?> data) { log.info("Reading file info..."); AtomicInteger count = new AtomicInteger(); while (dataOrUpdateFileIterator.hasNext()) { String line = dataOrUpdateFileIterator.next(); if (line.length() > 0) { - processLine(line, data, delimiter); + processLine(line, data); int currentCount = count.incrementAndGet(); if (currentCount % 1000000 == 0) { log.info("Processed {} lines", currentCount); @@ -240,7 +310,7 @@ * @param line line from input data file * @param data all data read so far, as a mapping from user IDs to preferences */ - protected void processLine(String line, FastByIDMap<Collection<Preference>> data, char delimiter) { + protected void processLine(String line, FastByIDMap<?> data) { if (line.length() == 0 || line.charAt(0) == COMMENT_CHAR) { return; @@ -274,37 +344,120 @@ userID = itemID; itemID = tmp; } - Collection<Preference> prefs = data.get(userID); - if (prefs == null) { - prefs = new ArrayList<Preference>(2); - data.put(userID, prefs); - } - if (preferenceValueString.length() == 0) { - // remove pref - Iterator<Preference> 
prefsIterator = prefs.iterator(); - while (prefsIterator.hasNext()) { - Preference pref = prefsIterator.next(); - if (pref.getItemID() == itemID) { - prefsIterator.remove(); - break; + // This is kind of gross but need to handle two types of storage + Object maybePrefs = data.get(userID); + if (maybePrefs instanceof PreferenceArray) { + + PreferenceArray prefs = (PreferenceArray) maybePrefs; + if (preferenceValueString.length() == 0) { + if (prefs != null) { + boolean exists = false; + int length = prefs.length(); + for (int i = 0; i < length; i++) { + if (prefs.getItemID(i) == itemID) { + exists = true; + break; + } + } + if (exists) { + if (length == 1) { + data.remove(userID); + } else { + PreferenceArray newPrefs = new GenericUserPreferenceArray(length - 1); + for (int i = 0, j = 0; i < length; i++, j++) { + if (prefs.getItemID(i) == itemID) { + j--; + } else { + newPrefs.set(j, prefs.get(i)); + } + } + } + } + } + + } else { + + float preferenceValue = Float.parseFloat(preferenceValueString); + + boolean exists = false; + if (prefs != null) { + for (int i = 0; i < prefs.length(); i++) { + if (prefs.getItemID(i) == itemID) { + exists = true; + prefs.setValue(i, preferenceValue); + break; + } + } + } + + if (!exists) { + if (prefs == null) { + prefs = new GenericUserPreferenceArray(1); + ((FastByIDMap<PreferenceArray>) data).put(userID, prefs); + } else { + PreferenceArray newPrefs = new GenericUserPreferenceArray(prefs.length() + 1); + for (int i = 0, j = 1; i < prefs.length(); i++, j++) { + newPrefs.set(j, prefs.get(i)); + } + } + prefs.setUserID(0, userID); + prefs.setItemID(0, itemID); + prefs.setValue(0, preferenceValue); } } + } else { - float preferenceValue = Float.parseFloat(preferenceValueString); - prefs.add(new GenericPreference(userID, itemID, preferenceValue)); + + Collection<Preference> prefs = (Collection<Preference>) maybePrefs; + + if (preferenceValueString.length() == 0) { + if (prefs != null) { + // remove pref + Iterator<Preference> 
prefsIterator = prefs.iterator(); + while (prefsIterator.hasNext()) { + Preference pref = prefsIterator.next(); + if (pref.getItemID() == itemID) { + prefsIterator.remove(); + break; + } + } + } + } else { + + float preferenceValue = Float.parseFloat(preferenceValueString); + + boolean exists = false; + if (prefs != null) { + for (Preference pref : prefs) { + if (pref.getItemID() == itemID) { + exists = true; + pref.setValue(preferenceValue); + break; + } + } + } + + if (!exists) { + if (prefs == null) { + prefs = new ArrayList<Preference>(2); + ((FastByIDMap<Collection<Preference>>) data).put(userID, prefs); + } + prefs.add(new GenericPreference(userID, itemID, preferenceValue)); + } + } + } } protected void processFileWithoutID(FileLineIterator dataOrUpdateFileIterator, - FastByIDMap<FastIDSet> data, - char delimiter) { + FastByIDMap<FastIDSet> data) { log.info("Reading file info..."); AtomicInteger count = new AtomicInteger(); while (dataOrUpdateFileIterator.hasNext()) { String line = dataOrUpdateFileIterator.next(); if (line.length() > 0) { - processLineWithoutID(line, data, delimiter); + processLineWithoutID(line, data); int currentCount = count.incrementAndGet(); if (currentCount % 100000 == 0) { log.info("Processed {} lines", currentCount); @@ -314,7 +467,7 @@ log.info("Read lines: {}", count.get()); } - protected void processLineWithoutID(String line, FastByIDMap<FastIDSet> data, char delimiter) { + protected void processLineWithoutID(String line, FastByIDMap<FastIDSet> data) { if (line.length() == 0 || line.charAt(0) == COMMENT_CHAR) { return; @@ -438,13 +591,9 @@ @Override public void refresh(Collection<Refreshable> alreadyRefreshed) { - long mostRecentModification = dataFile.lastModified(); - for (File updateFile : findUpdateFiles()) { - mostRecentModification = Math.max(mostRecentModification, updateFile.lastModified()); - } - if (mostRecentModification > lastModified + MIN_RELOAD_INTERVAL_MS) { + if (dataFile.lastModified() > lastModified + 
MIN_RELOAD_INTERVAL_MS || + readLastUpdateFileModified() > lastUpdateFileModified + MIN_RELOAD_INTERVAL_MS) { log.debug("File has changed; reloading..."); - lastModified = mostRecentModification; reload(); } } Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java?rev=907037&r1=907036&r2=907037&view=diff ============================================================================== --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java (original) +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/cf/taste/example/jester/JesterDataModel.java Fri Feb 5 18:24:37 2010 @@ -57,13 +57,14 @@ protected DataModel buildModel() throws IOException { FastByIDMap<Collection<Preference>> data = new FastByIDMap<Collection<Preference>>(); FileLineIterator iterator = new FileLineIterator(getDataFile(), false); - processFile(iterator, data, ','); + processFile(iterator, data); return new GenericDataModel(GenericDataModel.toDataMap(data, true)); } @Override - protected void processLine(String line, FastByIDMap<Collection<Preference>> data, char delimiter) { - String[] jokePrefs = line.split(String.valueOf(delimiter)); + protected void processLine(String line, FastByIDMap<?> rawData) { + FastByIDMap<Collection<Preference>> data = (FastByIDMap<Collection<Preference>>) rawData; + String[] jokePrefs = line.split(","); int count = Integer.parseInt(jokePrefs[0]); Collection<Preference> prefs = new ArrayList<Preference>(count); for (int itemID = 1; itemID < jokePrefs.length; itemID++) { // yes skip first one, just a count