Author: robinanil Date: Mon Feb 8 12:51:51 2010 New Revision: 907642 URL: http://svn.apache.org/viewvc?rev=907642&view=rev Log: Transforming code to use Mahout-math collections instead of HashMap. Only the easier cases are converted. No changes made to public functions.
Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java (original) +++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthMapper.java Mon Feb 8 12:51:51 2010 @@ -18,18 +18,18 @@ package org.apache.mahout.fpm.pfpgrowth; import java.io.IOException; -import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; -import java.util.Map; -import java.util.Map.Entry; import java.util.Set; +import java.util.Map.Entry; import org.apache.hadoop.io.LongWritable; import 
org.apache.hadoop.mapreduce.Mapper; import org.apache.mahout.common.Pair; import org.apache.mahout.common.Parameters; +import org.apache.mahout.math.map.OpenIntLongHashMap; +import org.apache.mahout.math.map.OpenObjectIntHashMap; /** * {@link ParallelFPGrowthMapper} maps each transaction to all unique items @@ -40,7 +40,7 @@ public class ParallelFPGrowthMapper extends Mapper<LongWritable,TransactionTree,LongWritable,TransactionTree> { - private final Map<Integer,Long> gListInt = new HashMap<Integer,Long>(); + private final OpenIntLongHashMap gListInt = new OpenIntLongHashMap(); @Override protected void map(LongWritable offset, @@ -83,7 +83,7 @@ Parameters params = Parameters.fromString(context.getConfiguration().get( "pfp.parameters", "")); - Map<String,Integer> fMap = new HashMap<String,Integer>(); + OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>(); int i = 0; for (Pair<String,Long> e : PFPGrowth.deserializeList(params, "fList", context.getConfiguration())) { Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java (original) +++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/ParallelFPGrowthReducer.java Mon Feb 8 12:51:51 2010 @@ -21,7 +21,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.Comparator; -import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; @@ -40,6 +39,9 @@ import org.apache.mahout.fpm.pfpgrowth.convertors.string.TopKStringPatterns; import org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPGrowth; import 
org.apache.mahout.fpm.pfpgrowth.fpgrowth.FPTreeDepthCache; +import org.apache.mahout.math.list.IntArrayList; +import org.apache.mahout.math.map.OpenLongObjectHashMap; +import org.apache.mahout.math.map.OpenObjectIntHashMap; /** * {@link ParallelFPGrowthReducer} takes each group of transactions and runs @@ -55,11 +57,11 @@ private final List<String> featureReverseMap = new ArrayList<String>(); - private final Map<String,Integer> fMap = new HashMap<String,Integer>(); + private final OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>(); private final List<String> fRMap = new ArrayList<String>(); - private final Map<Long,List<Integer>> groupFeatures = new HashMap<Long,List<Integer>>(); + private final OpenLongObjectHashMap<IntArrayList> groupFeatures = new OpenLongObjectHashMap<IntArrayList>(); private int maxHeapSize = 50; @@ -100,17 +102,18 @@ }); FPGrowth<Integer> fpGrowth = new FPGrowth<Integer>(); - fpGrowth.generateTopKFrequentPatterns( - cTree.getIterator(), - localFList, - minSupport, - maxHeapSize, - new HashSet<Integer>(groupFeatures.get(key.get())), - new IntegerStringOutputConverter( - new ContextWriteOutputCollector<LongWritable,TransactionTree,Text,TopKStringPatterns>( - context), featureReverseMap), - new ContextStatusUpdater<LongWritable,TransactionTree,Text,TopKStringPatterns>( - context)); + fpGrowth + .generateTopKFrequentPatterns( + cTree.getIterator(), + localFList, + minSupport, + maxHeapSize, + new HashSet<Integer>(groupFeatures.get(key.get()).toList()), + new IntegerStringOutputConverter( + new ContextWriteOutputCollector<LongWritable,TransactionTree,Text,TopKStringPatterns>( + context), featureReverseMap), + new ContextStatusUpdater<LongWritable,TransactionTree,Text,TopKStringPatterns>( + context)); } @Override @@ -135,12 +138,12 @@ .getConfiguration()); for (Entry<String,Long> entry : gList.entrySet()) { - List<Integer> groupList = groupFeatures.get(entry.getValue()); + IntArrayList groupList = 
groupFeatures.get(entry.getValue()); Integer itemInteger = fMap.get(entry.getKey()); if (groupList != null) { groupList.add(itemInteger); } else { - groupList = new ArrayList<Integer>(); + groupList = new IntArrayList(); groupList.add(itemInteger); groupFeatures.put(entry.getValue(), groupList); } Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java (original) +++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/TransactionSortingMapper.java Mon Feb 8 12:51:51 2010 @@ -21,10 +21,8 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; -import java.util.HashMap; import java.util.HashSet; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.regex.Pattern; @@ -33,6 +31,7 @@ import org.apache.hadoop.mapreduce.Mapper; import org.apache.mahout.common.Pair; import org.apache.mahout.common.Parameters; +import org.apache.mahout.math.map.OpenObjectIntHashMap; /** * {@link TransactionSortingMapper} maps each transaction to all unique items @@ -43,7 +42,7 @@ public class TransactionSortingMapper extends Mapper<LongWritable,Text,LongWritable,TransactionTree> { - private final Map<String,Integer> fMap = new HashMap<String,Integer>(); + private final OpenObjectIntHashMap<String> fMap = new OpenObjectIntHashMap<String>(); private Pattern splitter; Modified: lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java URL: 
http://svn.apache.org/viewvc/lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java (original) +++ lucene/mahout/trunk/core/src/main/java/org/apache/mahout/fpm/pfpgrowth/fpgrowth/FrequentPatternMaxHeap.java Mon Feb 8 12:51:51 2010 @@ -17,12 +17,12 @@ package org.apache.mahout.fpm.pfpgrowth.fpgrowth; -import java.util.HashMap; import java.util.HashSet; -import java.util.Map; import java.util.PriorityQueue; import java.util.Set; +import org.apache.mahout.math.map.OpenLongObjectHashMap; + /** {@link FrequentPatternMaxHeap} keeps top K Attributes in a TreeSet */ public final class FrequentPatternMaxHeap { @@ -34,7 +34,7 @@ private boolean subPatternCheck; - private Map<Long,Set<Pattern>> patternIndex; + private OpenLongObjectHashMap<Set<Pattern>> patternIndex; private PriorityQueue<Pattern> queue; @@ -42,7 +42,7 @@ maxSize = numResults; queue = new PriorityQueue<Pattern>(maxSize); this.subPatternCheck = subPatternCheck; - patternIndex = new HashMap<Long,Set<Pattern>>(); + patternIndex = new OpenLongObjectHashMap<Set<Pattern>>(); for (Pattern p : queue) { Long index = p.support(); Set<Pattern> patternList; Modified: lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java (original) +++ lucene/mahout/trunk/examples/src/main/java/org/apache/mahout/text/SequenceFilesFromDirectory.java Mon Feb 8 
12:51:51 2010 @@ -85,6 +85,7 @@ maxChunkSizeInBytes = chunkSizeInMB * 1024 * 1024; this.outputDir = outputDir; fs = FileSystem.get(conf); + currentChunkID = 0; writer = new SequenceFile.Writer(fs, conf, getPath(currentChunkID), Text.class, Text.class); Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java (original) +++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/DictionaryVectorizer.java Mon Feb 8 12:51:51 2010 @@ -43,7 +43,6 @@ import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.StringTuple; import org.apache.mahout.math.VectorWritable; -import org.apache.mahout.utils.vectors.common.PartialVectorMergeReducer; import org.apache.mahout.utils.vectors.common.PartialVectorMerger; import org.apache.mahout.utils.vectors.text.term.TFPartialVectorReducer; import org.apache.mahout.utils.vectors.text.term.TermCountMapper; Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java (original) +++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/document/SequenceFileTokenizerMapper.java Mon Feb 8 12:51:51 2010 @@ -31,7 
+31,6 @@ import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.analysis.tokenattributes.TermAttribute; import org.apache.mahout.common.StringTuple; -import org.apache.mahout.utils.vectors.text.DictionaryVectorizer; import org.apache.mahout.utils.vectors.text.DocumentProcessor; /** @@ -57,7 +56,7 @@ document.add(new String(termAtt.termBuffer(), 0, termAtt.termLength())); } } - output.collect(key,document); + output.collect(key, document); } @Override Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java (original) +++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TFPartialVectorReducer.java Mon Feb 8 12:51:51 2010 @@ -32,7 +32,6 @@ import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reducer; import org.apache.hadoop.mapred.Reporter; -import org.apache.lucene.analysis.Analyzer; import org.apache.mahout.common.StringTuple; import org.apache.mahout.math.RandomAccessSparseVector; import org.apache.mahout.math.Vector; @@ -44,7 +43,6 @@ */ public class TFPartialVectorReducer extends MapReduceBase implements Reducer<Text,StringTuple,Text,VectorWritable> { - private Analyzer analyzer; private final OpenObjectIntHashMap<String> dictionary = new OpenObjectIntHashMap<String>(); private final VectorWritable vectorWritable = new VectorWritable(); Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java URL: 
http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java (original) +++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/text/term/TermCountMapper.java Mon Feb 8 12:51:51 2010 @@ -18,8 +18,6 @@ package org.apache.mahout.utils.vectors.text.term; import java.io.IOException; -import java.util.HashMap; -import java.util.Map; import java.util.Map.Entry; import org.apache.commons.lang.mutable.MutableLong; @@ -30,6 +28,8 @@ import org.apache.hadoop.mapred.OutputCollector; import org.apache.hadoop.mapred.Reporter; import org.apache.mahout.common.StringTuple; +import org.apache.mahout.math.function.ObjectLongProcedure; +import org.apache.mahout.math.map.OpenObjectLongHashMap; /** * TextVectorizer Term Count Mapper. 
Tokenizes a text document and outputs the @@ -41,20 +41,24 @@ @Override public void map(Text key, StringTuple value, - OutputCollector<Text,LongWritable> output, - Reporter reporter) throws IOException { - - Map<String,MutableLong> wordCount = new HashMap<String,MutableLong>(); + final OutputCollector<Text,LongWritable> output, + final Reporter reporter) throws IOException { + OpenObjectLongHashMap<String> wordCount = new OpenObjectLongHashMap<String>(); for (String word : value.getEntries()) { if (wordCount.containsKey(word) == false) { - wordCount.put(word, new MutableLong(0)); - } - wordCount.get(word).increment(); - } - - for (Entry<String,MutableLong> entry : wordCount.entrySet()) { - output.collect(new Text(entry.getKey()), new LongWritable(entry - .getValue().longValue())); + wordCount.put(word, 1); + } else wordCount.put(word, wordCount.get(word) + 1); } + wordCount.forEachPair(new ObjectLongProcedure<String>() { + @Override + public boolean apply(String first, long second) { + try { + output.collect(new Text(first), new LongWritable(second)); + } catch (IOException e) { + reporter.incrCounter("Exception", "Output IO Exception", 1); + } + return true; + } + }); } } Modified: lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java URL: http://svn.apache.org/viewvc/lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java?rev=907642&r1=907641&r2=907642&view=diff ============================================================================== --- lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java (original) +++ lucene/mahout/trunk/utils/src/main/java/org/apache/mahout/utils/vectors/tfidf/TFIDFConverter.java Mon Feb 8 12:51:51 2010 @@ -43,7 +43,6 @@ import org.apache.mahout.common.HadoopUtil; import org.apache.mahout.common.Pair; import org.apache.mahout.math.VectorWritable; -import 
org.apache.mahout.utils.vectors.common.PartialVectorMergeReducer; import org.apache.mahout.utils.vectors.common.PartialVectorMerger; import org.apache.mahout.utils.vectors.text.term.TermDocumentCountMapper; import org.apache.mahout.utils.vectors.text.term.TermDocumentCountReducer;