Author: srowen
Date: Wed Sep 14 09:21:30 2011
New Revision: 1170492
URL: http://svn.apache.org/viewvc?rev=1170492&view=rev
Log:
MAHOUT-808 Avoid filtering out too many terms: apply minSupport only in the reducer, not in the combiner
Added:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java
- copied, changed from r1170071,
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java?rev=1170492&r1=1170491&r2=1170492&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/DictionaryVectorizer.java
Wed Sep 14 09:21:30 2011
@@ -49,6 +49,7 @@ import org.apache.mahout.math.VectorWrit
import org.apache.mahout.vectorizer.collocations.llr.CollocDriver;
import org.apache.mahout.vectorizer.common.PartialVectorMerger;
import org.apache.mahout.vectorizer.term.TFPartialVectorReducer;
+import org.apache.mahout.vectorizer.term.TermCountCombiner;
import org.apache.mahout.vectorizer.term.TermCountMapper;
import org.apache.mahout.vectorizer.term.TermCountReducer;
@@ -332,7 +333,7 @@ public final class DictionaryVectorizer
job.setMapperClass(TermCountMapper.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
- job.setCombinerClass(TermCountReducer.class);
+ job.setCombinerClass(TermCountCombiner.class);
job.setReducerClass(TermCountReducer.class);
job.setOutputFormatClass(SequenceFileOutputFormat.class);
Copied:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java
(from r1170071,
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java)
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java?p2=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java&p1=mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java&r1=1170071&r2=1170492&rev=1170492&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountCombiner.java
Wed Sep 14 09:21:30 2011
@@ -17,19 +17,16 @@
package org.apache.mahout.vectorizer.term;
-import java.io.IOException;
-
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
-import org.apache.mahout.vectorizer.DictionaryVectorizer;
+
+import java.io.IOException;
/**
- * Can also be used as a local Combiner. This accumulates all the words and the weights and sums them up.
+ * @see TermCountReducer
*/
-public class TermCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
-
- private int minSupport;
+public class TermCountCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
@Override
protected void reduce(Text key, Iterable<LongWritable> values, Context context)
@@ -38,16 +35,7 @@ public class TermCountReducer extends Re
for (LongWritable value : values) {
sum += value.get();
}
- if (sum >= minSupport) {
- context.write(key, new LongWritable(sum));
- }
- }
-
- @Override
- protected void setup(Context context) throws IOException, InterruptedException {
- super.setup(context);
- minSupport = context.getConfiguration().getInt(DictionaryVectorizer.MIN_SUPPORT,
- DictionaryVectorizer.DEFAULT_MIN_SUPPORT);
+ context.write(key, new LongWritable(sum));
}
}
Modified:
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
URL:
http://svn.apache.org/viewvc/mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java?rev=1170492&r1=1170491&r2=1170492&view=diff
==============================================================================
---
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
(original)
+++
mahout/trunk/core/src/main/java/org/apache/mahout/vectorizer/term/TermCountReducer.java
Wed Sep 14 09:21:30 2011
@@ -25,7 +25,9 @@ import org.apache.hadoop.mapreduce.Reduc
import org.apache.mahout.vectorizer.DictionaryVectorizer;
/**
- * Can also be used as a local Combiner. This accumulates all the words and the weights and sums them up.
+ * This accumulates all the words and the weights and sums them up.
+ *
+ * @see TermCountCombiner
*/
public class TermCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {