Author: srowen
Date: Wed Sep 14 09:21:30 2011
New Revision: 1170492

URL: http://svn.apache.org/viewvc?rev=1170492&view=rev
Log:
MAHOUT-808 Avoid filtering out too many terms: apply minSupport only in the reducer, not in the combiner

Added:
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java
      - copied, changed from r1170071, 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
Modified:
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
    
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java?rev=1170492&r1=1170491&r2=1170492&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
 Wed Sep 14 09:21:30 2011
@@ -49,6 +49,7 @@ import org.apache.mahout.math.VectorWrit
 import org.apache.mahout.vectorizer.collocations.llr.CollocDriver;
 import org.apache.mahout.vectorizer.common.PartialVectorMerger;
 import org.apache.mahout.vectorizer.term.TFPartialVectorReducer;
+import org.apache.mahout.vectorizer.term.TermCountCombiner;
 import org.apache.mahout.vectorizer.term.TermCountMapper;
 import org.apache.mahout.vectorizer.term.TermCountReducer;
 
@@ -332,7 +333,7 @@ public final class DictionaryVectorizer 
     job.setMapperClass(TermCountMapper.class);
     
     job.setInputFormatClass(SequenceFileInputFormat.class);
-    job.setCombinerClass(TermCountReducer.class);
+    job.setCombinerClass(TermCountCombiner.class);
     job.setReducerClass(TermCountReducer.class);
     job.setOutputFormatClass(SequenceFileOutputFormat.class);
     

Copied: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java
 (from r1170071, 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java)
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java?p2=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java&p1=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java&r1=1170071&r2=1170492&rev=1170492&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java
 Wed Sep 14 09:21:30 2011
@@ -17,19 +17,16 @@
 
 package org.apache.mahout.vectorizer.term;
 
-import java.io.IOException;
-
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.vectorizer.DictionaryVectorizer;
+
+import java.io.IOException;
 
 /**
- * Can also be used as a local Combiner. This accumulates all the words and the weights and sums them up.
+ * @see TermCountReducer
  */
-public class TermCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
-
-  private int minSupport;
+public class TermCountCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
 
   @Override
   protected void reduce(Text key, Iterable<LongWritable> values, Context context)
@@ -38,16 +35,7 @@ public class TermCountReducer extends Re
     for (LongWritable value : values) {
       sum += value.get();
     }
-    if (sum >= minSupport) {
-      context.write(key, new LongWritable(sum));
-    }
-  }
-
-  @Override
-  protected void setup(Context context) throws IOException, InterruptedException {
-    super.setup(context);
-    minSupport = context.getConfiguration().getInt(DictionaryVectorizer.MIN_SUPPORT,
-                                                   DictionaryVectorizer.DEFAULT_MIN_SUPPORT);
+    context.write(key, new LongWritable(sum));
   }
 
 }

Modified: 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
URL: 
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java?rev=1170492&r1=1170491&r2=1170492&view=diff
==============================================================================
--- 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
 (original)
+++ 
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
 Wed Sep 14 09:21:30 2011
@@ -25,7 +25,9 @@ import org.apache.hadoop.mapreduce.Reduc
 import org.apache.mahout.vectorizer.DictionaryVectorizer;
 
 /**
- * Can also be used as a local Combiner. This accumulates all the words and the weights and sums them up.
+ * This accumulates all the words and the weights and sums them up.
+ *
+ * @see TermCountCombiner
  */
 public class TermCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
 


Reply via email to