Author: smarthi Date: Sun Dec 8 23:35:02 2013 New Revision: 1549353 URL: http://svn.apache.org/r1549353 Log: MAHOUT-1349: Clusterdumper/loadTermDictionary crashes when highest index in (sparse) dictionary vector is larger than dictionary vector size
Modified: mahout/trunk/CHANGELOG mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java Modified: mahout/trunk/CHANGELOG URL: http://svn.apache.org/viewvc/mahout/trunk/CHANGELOG?rev=1549353&r1=1549352&r2=1549353&view=diff ============================================================================== --- mahout/trunk/CHANGELOG (original) +++ mahout/trunk/CHANGELOG Sun Dec 8 23:35:02 2013 @@ -64,6 +64,8 @@ Release 0.9 - unreleased MAHOUT-1261: TasteHadoopUtils.idToIndex can return an int that has size Integer.MAX_VALUE (Carl Clark, smarthi) + MAHOUT-1349: Clusterdumper/loadTermDictionary crashes when highest index in (sparse) dictionary vector is larger than dictionary vector size (Andrew Musselman via smarthi) + MAHOUT-1242: No key redistribution function for associative maps (Tharindu Rusira via smarthi) MAHOUT-1030: Regression: Clustered Points Should be WeightedPropertyVectorWritable not WeightedVectorWritable (Andrew Musselman, Pat Ferrel, Jeff Eastman, Lars Norskog, smarthi) Modified: mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java?rev=1549353&r1=1549352&r2=1549353&view=diff ============================================================================== --- mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java (original) +++ mahout/trunk/integration/src/main/java/org/apache/mahout/utils/vectors/VectorHelper.java Sun Dec 8 23:35:02 2013 @@ -193,12 +193,18 @@ public final class VectorHelper { */ public static String[] loadTermDictionary(Configuration conf, String filePattern) { OpenObjectIntHashMap<String> dict = new OpenObjectIntHashMap<String>(); + int maxIndexValue = 0; for (Pair<Text, IntWritable> record : new SequenceFileDirIterable<Text, 
IntWritable>(new Path(filePattern), PathType.GLOB, null, null, true, conf)) { dict.put(record.getFirst().toString(), record.getSecond().get()); + if (record.getSecond().get() > maxIndexValue) { + maxIndexValue = record.getSecond().get(); + } } - String[] dictionary = new String[dict.size()]; + // Set dictionary size to greater of (maxIndexValue + 1, dict.size()) + int maxDictionarySize = maxIndexValue + 1 > dict.size() ? maxIndexValue + 1 : dict.size(); + String[] dictionary = new String[maxDictionarySize]; for (String feature : dict.keys()) { dictionary[dict.get(feature)] = feature; } Modified: mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java URL: http://svn.apache.org/viewvc/mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java?rev=1549353&r1=1549352&r2=1549353&view=diff ============================================================================== --- mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java (original) +++ mahout/trunk/integration/src/test/java/org/apache/mahout/utils/vectors/VectorHelperTest.java Sun Dec 8 23:35:02 2013 @@ -19,13 +19,64 @@ package org.apache.mahout.utils.vectors; import com.google.common.collect.Iterables; +import com.google.common.io.Closeables; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.IntWritable; +import org.apache.hadoop.io.SequenceFile; +import org.apache.hadoop.io.Text; import org.apache.mahout.common.MahoutTestCase; +import org.apache.mahout.common.RandomUtils; import org.apache.mahout.math.SequentialAccessSparseVector; import org.apache.mahout.math.Vector; +import org.junit.Before; import org.junit.Test; +import java.util.Random; + public final class VectorHelperTest extends MahoutTestCase { + private static final int NUM_DOCS = 100; + + private Path inputPathOne; + private Path inputPathTwo; + + 
private Configuration conf; + + @Override + @Before + public void setUp() throws Exception { + super.setUp(); + conf = getConfiguration(); + + inputPathOne = getTestTempFilePath("documents/docs-one.file"); + FileSystem fs = FileSystem.get(inputPathOne.toUri(), conf); + SequenceFile.Writer writer = new SequenceFile.Writer(fs, conf, inputPathOne, Text.class, IntWritable.class); + try { + Random rd = RandomUtils.getRandom(); + for (int i = 0; i < NUM_DOCS; i++) { + // Make all indices higher than dictionary size + writer.append(new Text("Document::ID::" + i), new IntWritable(NUM_DOCS + rd.nextInt(NUM_DOCS))); + } + } finally { + Closeables.close(writer, false); + } + + inputPathTwo = getTestTempFilePath("documents/docs-two.file"); + fs = FileSystem.get(inputPathTwo.toUri(), conf); + writer = new SequenceFile.Writer(fs, conf, inputPathTwo, Text.class, IntWritable.class); + try { + Random rd = RandomUtils.getRandom(); + for (int i = 0; i < NUM_DOCS; i++) { + // Keep indices within number of documents + writer.append(new Text("Document::ID::" + i), new IntWritable(rd.nextInt(NUM_DOCS))); + } + } finally { + Closeables.close(writer, false); + } + } + @Test public void testJsonFormatting() throws Exception { Vector v = new SequentialAccessSparseVector(10); @@ -85,4 +136,12 @@ public final class VectorHelperTest exte v.set(8, 0.0); assertEquals(0, VectorHelper.topEntries(v, 6).size()); } + + @Test + public void testLoadTermDictionary() throws Exception { + // With indices higher than dictionary size + VectorHelper.loadTermDictionary(conf, inputPathOne.toString()); + // With dictionary size higher than indices + VectorHelper.loadTermDictionary(conf, inputPathTwo.toString()); + } }